npm - twl-generator - Versions diffs - 1.3.7 → 1.4.1 - Mend

twl-generator 1.3.7 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +1 -1
package/src/cli.js +6 -1
package/src/index.js +113 -141
package/src/utils/twl-matcher.js +94 -12
package/src/utils/usfm-alignment-remover.js +5 -3

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "twl-generator",
-  "version": "1.3.7",
+  "version": "1.4.1",
   "description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
   "main": "src/index.js",
   "bin": {

package/src/cli.js CHANGED

@@ -63,7 +63,12 @@ async function main() {
     console.log(`Wrote ${out}`);
     const dir = path.dirname(outPath);
     const base = path.basename(outPath);
-    const nmPath = path.join(dir, base.replace(/\.twl\.tsv$/i, '.no-match.twl.tsv'));
+    // Derive a sensible no-match filename when --out doesn't follow *.twl.tsv
+    let nmFile;
+    if (/\.twl\.tsv$/i.test(base)) nmFile = base.replace(/\.twl\.tsv$/i, '.no-match.twl.tsv');
+    else if (/\.tsv$/i.test(base)) nmFile = base.replace(/\.tsv$/i, '.no-match.twl.tsv');
+    else nmFile = base + '.no-match.twl.tsv';
+    const nmPath = path.join(dir, nmFile);
     await fs.writeFile(nmPath, noMatchTsv, 'utf8');
     console.log(`Wrote ${nmPath}`);
   } else if (outDir) {

package/src/index.js CHANGED

@@ -863,79 +863,37 @@ function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twMap, opts
 }
 export async function generateTwlByBook(bookCode, options = {}) {
-  // Import Node-specific modules conditionally
-  const { addGLQuoteCols, convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');
-  const useCompromise = !!options.useCompromise;
-  let nlp = null;
-  if (useCompromise) {
-    const mod = await import('compromise');
-    nlp = mod.default || mod;
+  // New: English-first matching (no Strong's), using ULT USFM verses
+  // Build term -> [articles] from local tw_strongs_list.json (terms only; ignore Strong's)
+  const twJson = await loadTwJsonLocal();
+  const termToArticles = {};
+  for (const [article, val] of Object.entries(twJson)) {
+    const terms = (val && val.article && Array.isArray(val.article.terms)) ? val.article.terms : [];
+    for (const raw of terms) {
+      const term = String(raw || '').replace(/\s*\([^)]*\)\s*$/, '').trim();
+      if (!term) continue;
+      if (!termToArticles[term]) termToArticles[term] = [];
+      // Use slug as-is (e.g., kt/grace)
+      termToArticles[term].push(article);
+    }
   }
+  // Build trie for fast scanning
+  const { buildTermTrie, scanVerseMatches } = await import('./utils/twl-matcher.js');
+  const trie = buildTermTrie(termToArticles);
+  // Fetch and parse ULT USFM into verses
+  const { processUsfmForBook } = await import('./utils/usfm-alignment-remover.js');
   const bibleData = await readBooks();
   const meta = findBookMeta(bibleData, bookCode);
   if (!meta) throw new Error(`Unknown book code: ${bookCode}`);
-  const usfm = await fetchUsfm(meta.usfm, meta.testament);
-  const twJson = await loadTwJsonLocal();
-  const strongPivot = pivotByStrong(twJson);
-  // 1) initial TSV
-  const baseTsv = buildInitialTsv(usfm, strongPivot, meta.key);
-  // 2) add GLQuote and GLOccurrence
-  const glRes = await addGLQuoteCols({
-    bibleLinks: ["unfoldingWord/en_ult/master"],
-    bookCode: meta.key,
-    tsvContent: baseTsv,
-    trySeparatorsAndOccurrences: true,
-  });
-  const withGl = glRes.output;
-  // 3) Convert GLQuote/GLOccurrence into OrigWords/Occurrence and convert to OL quotes BEFORE matching
-  const lines0 = withGl.split(/\r?\n/);
-  const header0 = lines0.shift();
-  const h0 = header0.split('\t');
-  const I0 = {
-    Reference: h0.indexOf('Reference'),
-    ID: h0.indexOf('ID'),
-    Tags: h0.indexOf('Tags'),
-    OrigWords: h0.indexOf('OrigWords'),
-    Occurrence: h0.indexOf('Occurrence'),
-    TWLink: h0.indexOf('TWLink'),
-    GLQuote: h0.indexOf('GLQuote'),
-    GLOccurrence: h0.indexOf('GLOccurrence'),
-  };
-  const rebuilt0 = [header0].concat(lines0.filter(Boolean).map(row => {
-    const c = row.split('\t');
-    const newCols = c.slice();
-    if (I0.GLQuote >= 0) newCols[I0.OrigWords] = c[I0.GLQuote];
-    if (I0.GLOccurrence >= 0) newCols[I0.Occurrence] = c[I0.GLOccurrence];
-    return newCols.join('\t');
-  })).join('\n');
-  const convEarly = await convertGLQuotes2OLQuotes({
-    bibleLinks: ["unfoldingWord/en_ult/master"],
-    bookCode: meta.key,
-    tsvContent: rebuilt0,
-    trySeparatorsAndOccurrences: true,
-  });
+  const versesByChapter = await processUsfmForBook(meta.key);
-  // 4) Reorder columns and add Strongs + randomized 4-char IDs before matching
-  const linesA = convEarly.output.split(/\r?\n/);
-  const headerA = linesA.shift();
-  const aCols = headerA.split('\t');
-  const A = {
-    Reference: aCols.indexOf('Reference'),
-    ID: aCols.indexOf('ID'),
-    Tags: aCols.indexOf('Tags'),
-    OrigWords: aCols.indexOf('OrigWords'),
-    Occurrence: aCols.indexOf('Occurrence'),
-    TWLink: aCols.indexOf('TWLink'),
-    GLQuote: aCols.indexOf('GLQuote'),
-    GLOccurrence: aCols.indexOf('GLOccurrence'),
-  };
+  // Header without Strongs; keep GLQuote/GLOccurrence and add Variant of, Disambiguation
+  const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Variant of', 'Disambiguation'];
+  const outRows = [header.join('\t')];
-  // New header order: Reference, ID, Tags, OrigWords, Occurrence, TWLink, Strongs, GLQuote, GLOccurrence
-  const finalHeaderBase = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Strongs'];
+  // ID generator
   const usedIds = new Set();
   const genId = () => {
     const letters = 'abcdefghijklmnopqrstuvwxyz';
@@ -949,86 +907,100 @@ export async function generateTwlByBook(bookCode, options = {}) {
     }
   };
-  const preparedRows = [];
-  for (const ln of linesA) {
-    if (!ln.trim()) continue;
-    const c = ln.split('\t');
-    if (c.length < 7) continue;
-    const strongsVal = c[A.ID];
-    const newId = genId();
-    const newRow = [
-      c[A.Reference],
-      newId,
-      c[A.Tags],
-      c[A.OrigWords],
-      c[A.Occurrence],
-      c[A.TWLink],
-      c[A.GLQuote],
-      c[A.GLOccurrence],
-      strongsVal,
-    ];
-    preparedRows.push(newRow);
-  }
-  // Indexes for prepared rows
-  const H = {
-    Reference: 0,
-    ID: 1,
-    Tags: 2,
-    OrigWords: 3,
-    Occurrence: 4,
-    TWLink: 5,
-    GLQuote: 6,
-    GLOccurrence: 7,
-    Strongs: 8,
+  // Helpers for Variant of decision (allow only plural/-ed/-ing without marking variant)
+  const pluralizeWord = (w) => {
+    if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ies');
+    if (/(s|x|z|ch|sh)$/i.test(w)) return w + 'es';
+    if (/f$/i.test(w) && !/(roof|belief|chief|proof)$/i.test(w)) return w.replace(/f$/i, 'ves');
+    if (/fe$/i.test(w)) return w.replace(/fe$/i, 'ves');
+    if (/o$/i.test(w)) return w + 'es';
+    return w + 's';
+  };
+  const isVowel = (ch) => /[aeiou]/i.test(ch);
+  const isConsonant = (ch) => /[a-z]/i.test(ch) && !isVowel(ch);
+  const endsWithCVC = (w) => w.length >= 3 && isConsonant(w[w.length - 3]) && isVowel(w[w.length - 2]) && isConsonant(w[w.length - 1]) && !/[wxy]/i.test(w[w.length - 1]);
+  const edForm = (w) => (/e$/i.test(w) ? w + 'd' : (/[^aeiou]y$/i.test(w) ? w.replace(/y$/i, 'ied') : (endsWithCVC(w) ? w + w[w.length - 1] + 'ed' : w + 'ed')));
+  const ingForm = (w) => (/ie$/i.test(w) ? w.replace(/ie$/i, 'ying') : (/ee$/i.test(w) ? w + 'ing' : (/e$/i.test(w) ? w.replace(/e$/i, 'ing') : (endsWithCVC(w) ? w + w[w.length - 1] + 'ing' : w + 'ing'))));
+  const allowNoVariant = (base, match) => {
+    const b = String(base || '');
+    const m = String(match || '');
+    if (!b || !m) return true;
+    if (b.toLowerCase() === m.toLowerCase()) return true;
+    const parts = b.trim().split(/\s+/);
+    const head = parts.length > 1 ? parts.slice(0, -1).join(' ') + ' ' : '';
+    const last = parts[parts.length - 1];
+    const allowed = new Set([
+      head + pluralizeWord(last),
+      head + edForm(last),
+      head + ingForm(last),
+    ].map(x => x.toLowerCase()));
+    return allowed.has(m.toLowerCase());
   };
-  // 5) pick best TWLink based on GLQuote terms using Strongs column; include Variant of column
-  const termMap = buildArticleTermMap(twJson);
-  const outRows = [finalHeaderBase.concat(['Variant of', 'Disambiguation']).join('\t')];
-  const noMatchRows = [finalHeaderBase.concat(['Disambiguation']).join('\t')];
-  let totalRows = 0;
-  let droppedRows = 0;
-  let multiDisambRows = 0;
-  const noMatchSamples = [];
-  for (const cols of preparedRows) {
-    totalRows++;
-    const strongId = cols[H.Strongs];
-    const glq = cols[H.GLQuote] || '';
-    const result = chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twJson, { useCompromise, nlp });
-    if (!result) {
-      droppedRows++;
-      if (noMatchSamples.length < 8) {
-        const ref = cols[H.Reference] || '';
-        noMatchSamples.push(`${ref}\t${strongId}\t${glq}`);
+  // Walk through verses in order
+  const chapterNums = Object.keys(versesByChapter).map(n => parseInt(n, 10)).sort((a, b) => a - b);
+  for (const c of chapterNums) {
+    const verses = versesByChapter[c] || {};
+    const verseNums = Object.keys(verses).map(n => parseInt(n, 10)).sort((a, b) => a - b);
+    for (const v of verseNums) {
+      const text = verses[v] || '';
+      const matches = scanVerseMatches(text, trie);
+      // Count occurrences per exact matchedText (case-sensitive)
+      const occMap = new Map();
+      for (const m of matches) {
+        const glq = m.matchedText;
+        const occ = (occMap.get(glq) || 0) + 1;
+        occMap.set(glq, occ);
+        const ref = `${c}:${v}`;
+        const id = genId();
+        const primaryArticle = (m.articles && m.articles[0]) || '';
+        let tag = '';
+        if (primaryArticle.startsWith('kt/')) tag = 'keyterm';
+        else if (primaryArticle.startsWith('names/')) tag = 'name';
+        const twLink = primaryArticle ? `rc://*/tw/dict/bible/${primaryArticle}` : '';
+        // Variant of: only if beyond plural/-ed/-ing differences
+        const variantOf = allowNoVariant(m.term, glq) ? '' : m.term;
+        // Disambiguation: list all candidate articles for this match
+        const disamb = (m.articles && m.articles.length > 1) ? `(${m.articles.join(', ')})` : '';
+        // Set OrigWords/Occurrence equal to GLQuote/GLOccurrence for English-first output
+        outRows.push([
+          ref,
+          id,
+          tag,
+          glq,
+          String(occ),
+          twLink,
+          glq,
+          String(occ),
+          variantOf,
+          disamb,
+        ].join('\t'));
       }
-      const tried = prioritizeArticles(glq, strongId, strongPivot) || [];
-      const disambTried = tried.length ? `(${tried.join(', ')})` : '';
-      noMatchRows.push(cols.join('\t') + '\t' + disambTried);
-      continue;
     }
-    const art = result.article;
-    cols[H.TWLink] = `rc://*/tw/dict/bible/${art}`;
-    // Update Tags based on selected article prefix
-    let tag = '';
-    if (art.startsWith('kt/')) tag = 'keyterm';
-    else if (art.startsWith('names/')) tag = 'name';
-    cols[H.Tags] = tag;
-    if (result.disamb) multiDisambRows++;
-    const variantOf = result.variantTerm || '';
-    outRows.push(cols.join('\t') + '\t' + variantOf + '\t' + (result.disamb || ''));
   }
-  const keptRows = totalRows - droppedRows;
-  const pct = totalRows ? ((keptRows / totalRows) * 100).toFixed(1) : '0.0';
-  console.log(`[TWL] ${bookCode.toUpperCase()}: kept ${keptRows}/${totalRows} (${pct}%), dropped ${droppedRows}, disambiguated ${multiDisambRows}`);
-  if (noMatchSamples.length) {
-    console.log(`[TWL] ${bookCode.toUpperCase()}: no-match samples (up to 8):`);
-    for (const s of noMatchSamples) console.log(`  ${s}`);
+  // Build TSV and convert GL OrigWords back to OL using tsv-quote-converters
+  let matchedTsv = outRows.join('\n');
+  try {
+    const { convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');
+    const conv = await convertGLQuotes2OLQuotes({
+      bibleLinks: ['unfoldingWord/en_ult/master'],
+      bookCode: String(meta.key || bookCode).toLowerCase(),
+      tsvContent: matchedTsv,
+      trySeparatorsAndOccurrences: true,
+      quiet: true,
+    });
+    if (conv && typeof conv.output === 'string' && conv.output.length) {
+      matchedTsv = conv.output;
+    }
+  } catch (e) {
+    // If conversion fails (e.g., no network), fall back to unconverted TSV
   }
-  const matchedTsv = outRows.join('\n');
-  const noMatchTsv = noMatchRows.join('\n');
+  const noMatchHeader = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Disambiguation'];
+  const noMatchTsv = [noMatchHeader.join('\t')].join('\n');
   return { matchedTsv, noMatchTsv };
 }

package/src/utils/twl-matcher.js CHANGED

@@ -160,16 +160,86 @@ class PrefixTrie {
       if (node._terms) {
         const matchLength = currentPos - startPos;
         // Always extract from the original text to preserve case
-        const originalMatchedText = originalText.substring(startPos, currentPos);
+        let originalMatchedText = originalText.substring(startPos, currentPos);
+        // Extend match backwards to include dash-connected words and possessive forms
+        let extendedStartPos = startPos;
+        // Check backwards for dash preceded by word characters (no space between)
+        if (extendedStartPos > 0 && originalText[extendedStartPos - 1] === '-') {
+          let dashPos = extendedStartPos - 1;
+          dashPos--; // Move before the dash
+          // Check if there are word characters immediately before the dash
+          if (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
+            // Find the start of the word before the dash
+            while (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
+              dashPos--;
+            }
+            extendedStartPos = dashPos + 1;
+          }
+        }
+        // Check backwards for apostrophe (straight or curly) preceded by text
+        if (extendedStartPos > 0 && /['']/.test(originalText[extendedStartPos - 1])) {
+          let apostrophePos = extendedStartPos - 1;
+          apostrophePos--; // Move before the apostrophe
+          // Check if there are word characters immediately before the apostrophe
+          if (apostrophePos >= 0 && /[\w]/.test(originalText[apostrophePos])) {
+            // Find the start of the text before the apostrophe
+            while (apostrophePos >= 0 && /[\w]/.test(originalText[apostrophePos])) {
+              apostrophePos--;
+            }
+            extendedStartPos = apostrophePos + 1;
+          }
+        }
+        // Extend match forwards to include dash-connected words and possessive forms
+        let extendedEndPos = currentPos;
+        // Check for dash followed by word characters (no space between)
+        if (extendedEndPos < originalText.length && originalText[extendedEndPos] === '-') {
+          let dashPos = extendedEndPos;
+          dashPos++; // Move past the dash
+          // Check if there are word characters immediately after the dash
+          if (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
+            // Find the end of the word after the dash
+            while (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
+              dashPos++;
+            }
+            extendedEndPos = dashPos;
+          }
+        }
+        // Check for apostrophe (straight or curly) followed by text
+        if (extendedEndPos < originalText.length && /['']/.test(originalText[extendedEndPos])) {
+          let apostrophePos = extendedEndPos;
+          apostrophePos++; // Move past the apostrophe
+          // Check if there are word characters immediately after the apostrophe
+          if (apostrophePos < originalText.length && /[\w]/.test(originalText[apostrophePos])) {
+            // Find the end of the text after the apostrophe
+            while (apostrophePos < originalText.length && /[\w]/.test(originalText[apostrophePos])) {
+              apostrophePos++;
+            }
+            extendedEndPos = apostrophePos;
+          } else {
+            // Include the apostrophe even if no text follows (for possessives ending in s)
+            extendedEndPos = apostrophePos;
+          }
+        }
+        // Update the matched text if we extended it
+        if (extendedStartPos < startPos || extendedEndPos > currentPos) {
+          originalMatchedText = originalText.substring(extendedStartPos, extendedEndPos);
+        }
         // Check if this is a valid word boundary match (both start and end)
-        const isStartBoundary = startPos === 0 ||
-          /[\s\p{P}]/.test(originalText[startPos - 1]) ||
-          !/[\w]/.test(originalText[startPos - 1]);
+        const isStartBoundary = extendedStartPos === 0 ||
+          /[\s\p{P}]/.test(originalText[extendedStartPos - 1]) ||
+          !/[\w]/.test(originalText[extendedStartPos - 1]);
-        const isEndBoundary = currentPos >= originalText.length ||
-          /[\s\p{P}]/.test(originalText[currentPos]) ||
-          !/[\w]/.test(originalText[currentPos]);
+        const isEndBoundary = extendedEndPos >= originalText.length ||
+          /[\s\p{P}]/.test(originalText[extendedEndPos]) ||
+          !/[\w]/.test(originalText[extendedEndPos]);
         const isWordBoundary = isStartBoundary && isEndBoundary;
@@ -178,8 +248,9 @@ class PrefixTrie {
             matches.push({
               term: termData.term,
               articles: termData.articles,
-              matchedText: originalMatchedText, // Use the original text, not the normalized version
-              length: matchLength,
+              matchedText: originalMatchedText, // Use the extended matched text
+              length: originalMatchedText.length, // Use extended length
+              originalLength: matchLength, // Keep track of original match length for advancement
               priority: termData.priority,
               isExactCase: isExactCase
             });
@@ -283,9 +354,11 @@ function findMatches(verseText, termTrie) {
         priority: bestMatch.priority
       });
-      // Move past the matched text
-      processedText += matchedText;
-      currentPos += bestMatch.length;
+      // Move past only the original matched text (not the extended part)
+      // This allows finding additional matches within the extended portion
+      const advanceBy = bestMatch.originalLength || bestMatch.length;
+      processedText += normalizedText.substring(currentPos, currentPos + advanceBy);
+      currentPos += advanceBy;
     } else {
       // No match found, move to next character/word boundary
       const nextWordBoundary = normalizedText.substring(currentPos).search(/[\s\p{P}]/u);
@@ -427,3 +500,12 @@ export function generateTWLMatches(twTerms, verses) {
   return tsvRows.join('\n');
 }
+// Expose lightweight building and scanning APIs for reuse
+export function buildTermTrie(twTerms) {
+  return createOptimizedTermMap(twTerms);
+}
+export function scanVerseMatches(verseText, termTrie) {
+  return findMatches(verseText, termTrie);
+}

package/src/utils/usfm-alignment-remover.js CHANGED

@@ -69,10 +69,12 @@ export const removeAllTagsExceptChapterVerse = (usfmContent) => {
  * @return {Promise<Object>} - Object with chapters and verses
  */
 export async function processUsfmForBook(book) {
-  if (!BibleBookData[book]) throw new Error(`Unknown book: ${book}`);
+  // Normalize book key to lowercase to match BibleBookData keys
+  const key = String(book || '').toLowerCase();
+  if (!BibleBookData[key]) throw new Error(`Unknown book: ${book}`);
   const fetch = await getFetch();
-  const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[book].usfm}.usfm?ref=master`;
+  const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[key].usfm}.usfm?ref=master`;
   const usfmRes = await fetch(usfmUrl);
   if (!usfmRes.ok) throw new Error(`Failed to download USFM file for ${book}`);
   const usfmData = await usfmRes.json();
@@ -120,4 +122,4 @@ export function parseUsfmToVerses(usfm) {
   }
   return versesObj;
-}
+}