npm - twl-generator - Versions diffs - 1.4.9 → 1.4.11 - Mend

twl-generator 1.4.9 → 1.4.11

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (4)

package/package.json +4 -3
package/src/index.js +12 -26
package/src/utils/twl-matcher.js +41 -28
package/src/utils/usfm-alignment-remover.js +2 -10

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "twl-generator",
-  "version": "1.4.9",
+  "version": "1.4.11",
   "description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
   "main": "src/index.js",
   "bin": {
@@ -51,7 +51,8 @@
     "csv-stringify": "^6.5.0",
     "en-inflectors": "^1.0.12",
     "jszip": "^3.10.1",
-    "tsv-quote-converters": "^1.1.14"
+    "tsv-quote-converters": "^1.1.14",
+    "usfm-alignment-remover": "^0.1.6"
   },
   "peerDependencies": {
     "react": ">=16.8.0"
@@ -61,4 +62,4 @@
       "optional": true
     }
   }
-}
+}

package/src/index.js CHANGED Viewed

@@ -442,7 +442,7 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
     let termHit = '';
     let truncated = false;
-    // Stage 1: case-sensitive, word-boundary
+    // Stage 1: case-insensitive, word-boundary (prioritized)
     if (stage === 0) {
       for (const tobj of terms) {
         const termOrig = tobj.orig;
@@ -451,28 +451,14 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
         for (const a of irregularFormsForTerm(termOrig)) alts.add(a);
         for (const a of conjugationsForTerm(termOrig)) alts.add(a);
         for (const alt of alts) {
-          const re1 = new RegExp(`\\b${escapeRegExp(alt)}\\b`);
+          const re1 = new RegExp(`\\b${escapeRegExp(alt)}\\b`, 'i');
           if (re1.test(textOrig)) { stage = 1; termHit = termOrig; break; }
         }
         if (stage === 1) break;
       }
     }
-    // Stage 2: case-insensitive, word-boundary
-    if (stage === 0) {
-      for (const tobj of terms) {
-        const termOrig = tobj.orig;
-        const alts = new Set([termOrig]);
-        for (const a of pluralizeTerm(termOrig)) alts.add(a);
-        for (const a of irregularFormsForTerm(termOrig)) alts.add(a);
-        for (const a of conjugationsForTerm(termOrig)) alts.add(a);
-        for (const alt of alts) {
-          const re2 = new RegExp(`\\b${escapeRegExp(alt)}\\b`, 'i');
-          if (re2.test(textOrig)) { stage = 2; termHit = termOrig; break; }
-        }
-        if (stage === 2) break;
-      }
-    }
-    // Stage 3: case-sensitive, substring matching at word boundaries or after dashes
+    // Stage 2: case-insensitive, substring matching at word boundaries or after dashes
     if (stage === 0) {
       for (const tobj of terms) {
         const termOrig = tobj.orig;
@@ -480,12 +466,12 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
           // Match if the term appears:
           // - At word boundary (beginning of word or after dash)
           // - Allow substring matching (e.g., "reap" matches "reapers")
-          const re3 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(termOrig)}`, '');
-          if (re3.test(textOrig)) { stage = 3; termHit = termOrig; break; }
+          const re2 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(termOrig)}`, 'i');
+          if (re2.test(textOrig)) { stage = 2; termHit = termOrig; break; }
         }
       }
     }
-    // Stage 4: case-insensitive, substring on derived stripped forms
+    // Stage 3: case-insensitive, substring on derived stripped forms
     if (stage === 0) {
       const strippedForms = (base) => {
         const { head, last } = splitHeadLast(base);
@@ -550,7 +536,7 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
             // Only match if the stripped form is followed by a grammatical ending
             const regex = new RegExp(escapeRegExp(form) + '(ed|ing|er|est|es|ies|s|d|n|t)\\b', 'i');
             if (regex.test(textLower)) {
-              stage = 4;
+              stage = 3;
               termHit = termOrig;
               truncated = false;
               break outerStrip;
@@ -558,9 +544,9 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
           } else {
             // For non-stripped forms, match at word boundaries or after dashes (case-insensitive)
             // Allow substring matching (e.g., "reap" matches "reapers")
-            const regex4 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(form)}`, 'i');
-            if (regex4.test(textOrig)) {
-              stage = 4;
+            const regex3 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(form)}`, 'i');
+            if (regex3.test(textOrig)) {
+              stage = 3;
               termHit = termOrig;
               truncated = false;
               break outerStrip;
@@ -930,7 +916,7 @@ export async function generateTwlByBook(bookCode, options = {}) {
         const ref = `${c}:${v}`;
         const id = genId();
-        const primaryArticle = (m.articles && m.articles[0]) || '';
+        const primaryArticle = m.preferredArticle || (m.articles && m.articles[0]) || '';
         let tag = '';
         if (primaryArticle.startsWith('kt/')) tag = 'keyterm';
         else if (primaryArticle.startsWith('names/')) tag = 'name';

package/src/utils/twl-matcher.js CHANGED Viewed

@@ -88,23 +88,19 @@ function generateVariants(term, isName = false) {
 }
 /**
- * Optimized PrefixTrie for fast term matching with case sensitivity
+ * Optimized PrefixTrie for fast term matching with case insensitivity
  */
 class PrefixTrie {
   constructor() {
-    this.exactCaseRoot = {}; // For exact case matches
-    this.lowerCaseRoot = {}; // For case-insensitive fallback
+    this.root = {}; // For case-insensitive matches
   }
   insert(term, originalTerm, articles, isOriginal = true) {
-    // Insert into exact case trie
-    this._insertIntoTree(this.exactCaseRoot, term, originalTerm, articles, isOriginal, true);
-    // // Also insert into lowercase trie for fallback - removed, too many falses
-    // this._insertIntoTree(this.lowerCaseRoot, term.toLowerCase(), originalTerm, articles, isOriginal, false);
+    // Insert into case-insensitive trie (always lowercase)
+    this._insertIntoTree(this.root, term.toLowerCase(), originalTerm, articles, isOriginal);
   }
-  _insertIntoTree(root, term, originalTerm, articles, isOriginal, isExactCase) {
+  _insertIntoTree(root, term, originalTerm, articles, isOriginal) {
     let node = root;
     for (const char of term) {
@@ -123,24 +119,16 @@ class PrefixTrie {
       term: originalTerm,
       articles,
       matchedText: term,
-      priority: isOriginal ? 0 : 1,
-      isExactCase
+      priority: isOriginal ? 0 : 1
     });
   }
   findMatches(text, startPos) {
-    // First try exact case matches
-    let matches = this._findMatchesInTree(this.exactCaseRoot, text, startPos, true, text);
-    // If no exact case matches, try case-insensitive
-    if (matches.length === 0) {
-      matches = this._findMatchesInTree(this.lowerCaseRoot, text.toLowerCase(), startPos, false, text);
-    }
-    return matches;
+    // Always use case-insensitive matching
+    return this._findMatchesInTree(this.root, text.toLowerCase(), startPos, text);
   }
-  _findMatchesInTree(root, searchText, startPos, isExactCase, originalText) {
+  _findMatchesInTree(root, searchText, startPos, originalText) {
     const matches = [];
     let node = root;
     let currentPos = startPos;
@@ -223,15 +211,14 @@ class PrefixTrie {
               matchedText: originalMatchedText, // Use the extended matched text
               length: originalMatchedText.length, // Use extended length
               originalLength: matchLength, // Keep track of original match length for advancement
-              priority: termData.priority,
-              isExactCase: isExactCase
+              priority: termData.priority
             });
           }
         }
       }
     }
-    // Sort by length (longer first), then by priority, then by case match preference
+    // Sort by length (longer first), then by priority
     return matches.sort((a, b) => {
       if (b.length !== a.length) {
         return b.length - a.length;
@@ -239,10 +226,6 @@ class PrefixTrie {
       if (a.priority !== b.priority) {
         return a.priority - b.priority;
       }
-      // Prefer exact case matches
-      if (a.isExactCase !== b.isExactCase) {
-        return a.isExactCase ? -1 : 1;
-      }
       return 0;
     });
   }
@@ -308,8 +291,37 @@ function findMatches(verseText, termTrie) {
     let bestMatch = null;
     // Pick the best match (longest, then by priority)
+    // But collect all articles from matches of the same length and priority
     if (candidateMatches.length > 0) {
       bestMatch = candidateMatches[0];
+      // Collect all articles from matches with the same length and priority as the best match
+      const allArticles = new Set();
+      for (const match of candidateMatches) {
+        if (match.length === bestMatch.length && match.priority === bestMatch.priority) {
+          match.articles.forEach(article => allArticles.add(article));
+        }
+      }
+      bestMatch.articles = Array.from(allArticles);
+      // Special case for "god" - prefer the appropriate article based on capitalization
+      // but keep all articles for disambiguation
+      if (bestMatch.matchedText.toLowerCase() === 'god' && bestMatch.articles.length > 1) {
+        const originalMatchedText = normalizedText.substring(currentPos, currentPos + bestMatch.length);
+        const hasGodArticle = bestMatch.articles.includes('kt/god');
+        const hasFalseGodArticle = bestMatch.articles.includes('kt/falsegod');
+        if (hasGodArticle && hasFalseGodArticle) {
+          // Check capitalization in original text
+          if (originalMatchedText === 'God' || originalMatchedText.charAt(0) === 'G') {
+            // Prefer kt/god for capitalized "God"
+            bestMatch.preferredArticle = 'kt/god';
+          } else {
+            // Prefer kt/falsegod for lowercase "god"
+            bestMatch.preferredArticle = 'kt/falsegod';
+          }
+        }
+      }
     }
     if (bestMatch) {
@@ -320,6 +332,7 @@ function findMatches(verseText, termTrie) {
       matches.push({
         term: bestMatch.term,
         articles: bestMatch.articles,
+        preferredArticle: bestMatch.preferredArticle,
         matchedText: matchedText,
         context: context,
         priority: bestMatch.priority

package/src/utils/usfm-alignment-remover.js CHANGED Viewed

@@ -1,6 +1,7 @@
 /* eslint-disable no-async-promise-executor, no-throw-literal */
 import { BibleBookData } from '../common/books.js';
+import { removeAlignments } from 'usfm-alignment-remover';
 // Environment detection
 const isNode = typeof window === 'undefined' && typeof process !== 'undefined' && process.versions?.node;
@@ -26,16 +27,7 @@ function decodeBase64(base64String) {
 export const removeAllTagsExceptChapterVerse = (usfmContent) => {
   if (!usfmContent) return '';
-  let cleanContent = usfmContent;
-  // Remove word-level alignment markers like \w word|lemma="lemma" strong="H1234"\w*
-  cleanContent = cleanContent.replace(/\\w\s+([^|\\]+)\|[^\\]*\\w\*/g, '$1');
-  // Remove milestone markers like \zaln-s | \zaln-e\*
-  cleanContent = cleanContent.replace(/\\zaln-[se][^\\]*\\?\*?/g, '');
-  // Remove other alignment-related markers
-  cleanContent = cleanContent.replace(/\\k-[se][^\\]*\\?\*?/g, '');
+  let cleanContent = removeAlignments(usfmContent);
   // Remove empty lines that might result from marker removal
   cleanContent = cleanContent.replace(/\n\s*\n\s*\n/g, '\n\n');