npm - twl-generator - Versions diffs - 1.4.0 → 1.4.2 - Mend

twl-generator 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/package.json +4 -3
package/src/cli.js +8 -6
package/src/index.js +11 -56
package/src/utils/twl-matcher.js +85 -13
package/src/utils/usfm-alignment-remover.js +2 -2
package/src/utils/zipProcessor.js +14 -210

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "twl-generator",
-  "version": "1.4.0",
+  "version": "1.4.2",
   "description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
   "main": "src/index.js",
   "bin": {
@@ -46,9 +46,10 @@
     "node": ">=18.0.0"
   },
   "dependencies": {
+    "compromise": "^14.14.2",
     "csv-parse": "^5.5.6",
     "csv-stringify": "^6.5.0",
-    "compromise": "^14.14.2",
+    "jszip": "^3.10.1",
     "tsv-quote-converters": "^1.1.13"
   },
   "peerDependencies": {
@@ -59,4 +60,4 @@
       "optional": true
     }
   }
-}
+}

package/src/cli.js CHANGED

@@ -15,7 +15,7 @@ async function readBooksJs() {
 }
 function parseArgs(argv) {
-  const args = { book: '', out: '', outDir: '', all: false, useCompromise: false };
+  const args = { book: '', out: '', outDir: '', all: false, useCompromise: false, dcsHost: 'https://git.door43.org' };
   for (let i = 2; i < argv.length; i++) {
     const a = argv[i];
     if (a === '--book' || a === '-b') { args.book = argv[++i] || ''; }
@@ -23,21 +23,22 @@ function parseArgs(argv) {
     else if (a === '--out-dir' || a === '-O') { args.outDir = argv[++i] || ''; }
     else if (a === '--all' || a === '-A') { args.all = true; }
     else if (a === '--use-compromise') { args.useCompromise = true; }
+    else if (a === '--dcs') { args.dcsHost = argv[++i] || 'https://git.door43.org'; }
   }
   return args;
 }
 async function main() {
-  const { book, out, outDir, all, useCompromise } = parseArgs(process.argv);
+  const { book, out, outDir, all, useCompromise, dcsHost } = parseArgs(process.argv);
   if (all || (book && book.toLowerCase() === 'all')) {
     const books = await readBooksJs();
     const codes = Object.keys(books);
     const destDir = outDir ? path.resolve(outDir) : path.resolve(THIS_DIR, '..'); // default to twl-generator dir
     await fs.mkdir(destDir, { recursive: true });
-    console.error(`Generating TWL for ${codes.length} books to ${destDir} (useCompromise=${useCompromise})`);
+    console.error(`Generating TWL for ${codes.length} books to ${destDir} (useCompromise=${useCompromise}, dcsHost=${dcsHost})`);
     for (const code of codes) {
       try {
-        const { matchedTsv, noMatchTsv } = await generateTwlByBook(code, { useCompromise });
+        const { matchedTsv, noMatchTsv } = await generateTwlByBook(code, { useCompromise, dcsHost });
         const fname = `${code.toLowerCase()}.twl.tsv`;
         const outPath = path.join(destDir, fname);
         await fs.writeFile(outPath, matchedTsv, 'utf8');
@@ -52,11 +53,12 @@ async function main() {
   }
   if (!book) {
-    console.error('Usage: generate-twl --book <code>|all [--out <file.tsv> | --out-dir <dir>] [--use-compromise]');
+    console.error('Usage: generate-twl --book <code>|all [--out <file.tsv> | --out-dir <dir>] [--use-compromise] [--dcs <host>]');
+    console.error('  --dcs defaults to https://git.door43.org');
     process.exit(1);
   }
-  const { matchedTsv, noMatchTsv } = await generateTwlByBook(book, { useCompromise });
+  const { matchedTsv, noMatchTsv } = await generateTwlByBook(book, { useCompromise, dcsHost });
   if (out) {
     const outPath = path.resolve(out);
     await fs.writeFile(outPath, matchedTsv, 'utf8');

package/src/index.js CHANGED

@@ -1,7 +1,6 @@
 import { BibleBookData } from './common/books.js';
 const isBrowser = typeof window !== 'undefined';
-const TW_JSON_URL = new URL('../tw_strongs_list.json', import.meta.url);
 async function readBooks() {
   // Build a simple CODE -> { usfm, testament } map from the local BibleBookData
@@ -20,29 +19,11 @@ function findBookMeta(bookMap, code) {
   return { key, ...meta };
 }
-async function fetchUsfm(usfmCode, testament) {
-  const repo = testament === 'old' ? 'hbo_uhb' : 'el-x-koine_ugnt';
-  const url = `https://git.door43.org/api/v1/repos/unfoldingWord/${repo}/contents/${usfmCode}.usfm`;
-  const res = await fetch(url);
-  if (!res.ok) throw new Error(`Failed to fetch USFM: ${res.status} ${res.statusText}`);
-  const json = await res.json();
-  const b64 = json.content || '';
-  if (isBrowser) {
-    // Browser: use atob and TextDecoder
-    const binary = atob(b64);
-    const bytes = Uint8Array.from(binary, c => c.charCodeAt(0));
-    const decoder = new TextDecoder('utf-8');
-    return decoder.decode(bytes);
-  } else {
-    // Node.js: use Buffer
-    const { Buffer } = await import('node:buffer');
-    const buf = Buffer.from(b64, 'base64');
-    return buf.toString('utf8');
-  }
-}
-function pivotByStrong(twMap) {
+async function loadTermsFromEnTw(dcsHost = 'https://git.door43.org') {
+  // Use the updated zipProcessor that accepts dcsHost
+  const { generateTWTerms } = await import('./utils/zipProcessor.js');
+  return await generateTWTerms(dcsHost);
+} function pivotByStrong(twMap) {
   // Build two structures:
   // 1) singles: strong -> Set(articles) including base (strip letter suffix)
   // 2) seqFirst: base-first-strong -> [{ article, seqBase, len }] preserving order in twMap
@@ -210,23 +191,6 @@ function buildInitialTsv(usfm, strongPivot, bookCode) {
   return tsv;
 }
-async function loadTwJsonLocal() {
-  if (isBrowser) {
-    // In browser, try to fetch from public path
-    const url = '/tw_strongs_list.json';
-    const res = await fetch(url);
-    if (!res.ok) throw new Error(`Failed to fetch tw_strongs_list.json: ${res.status}`);
-    return await res.json();
-  } else {
-    // In Node.js, read from file system
-    const fs = await import('node:fs/promises');
-    const { fileURLToPath } = await import('node:url');
-    const filePath = fileURLToPath(TW_JSON_URL);
-    const raw = await fs.readFile(filePath, 'utf8');
-    return JSON.parse(raw);
-  }
-}
 function buildArticleTermMap(twMap) {
   // Normalize helper: remove only trailing parenthetical notes and collapse whitespace
   const stripParensTrim = (s) => String(s || '').replace(/\s*\([^)]*\)\s*$/, '').replace(/\s+/g, ' ').trim();
@@ -863,20 +827,11 @@ function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twMap, opts
 }
 export async function generateTwlByBook(bookCode, options = {}) {
-  // New: English-first matching (no Strong's), using ULT USFM verses
-  // Build term -> [articles] from local tw_strongs_list.json (terms only; ignore Strong's)
-  const twJson = await loadTwJsonLocal();
-  const termToArticles = {};
-  for (const [article, val] of Object.entries(twJson)) {
-    const terms = (val && val.article && Array.isArray(val.article.terms)) ? val.article.terms : [];
-    for (const raw of terms) {
-      const term = String(raw || '').replace(/\s*\([^)]*\)\s*$/, '').trim();
-      if (!term) continue;
-      if (!termToArticles[term]) termToArticles[term] = [];
-      // Use slug as-is (e.g., kt/grace)
-      termToArticles[term].push(article);
-    }
-  }
+  // Extract dcsHost option with default
+  const dcsHost = options.dcsHost || 'https://git.door43.org';
+  // Load terms from en_tw zip file instead of local tw_strongs_list.json
+  const termToArticles = await loadTermsFromEnTw(dcsHost);
   // Build trie for fast scanning
   const { buildTermTrie, scanVerseMatches } = await import('./utils/twl-matcher.js');
@@ -887,7 +842,7 @@ export async function generateTwlByBook(bookCode, options = {}) {
   const bibleData = await readBooks();
   const meta = findBookMeta(bibleData, bookCode);
   if (!meta) throw new Error(`Unknown book code: ${bookCode}`);
-  const versesByChapter = await processUsfmForBook(meta.key);
+  const versesByChapter = await processUsfmForBook(meta.key, dcsHost);
   // Header without Strongs; keep GLQuote/GLOccurrence and add Variant of, Disambiguation
   const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Variant of', 'Disambiguation'];

package/src/utils/twl-matcher.js CHANGED

@@ -160,16 +160,86 @@ class PrefixTrie {
       if (node._terms) {
         const matchLength = currentPos - startPos;
         // Always extract from the original text to preserve case
-        const originalMatchedText = originalText.substring(startPos, currentPos);
+        let originalMatchedText = originalText.substring(startPos, currentPos);
+        // Extend match backwards to include dash-connected words and possessive forms
+        let extendedStartPos = startPos;
+        // Check backwards for dash preceded by word characters (no space between)
+        if (extendedStartPos > 0 && originalText[extendedStartPos - 1] === '-') {
+          let dashPos = extendedStartPos - 1;
+          dashPos--; // Move before the dash
+          // Check if there are word characters immediately before the dash
+          if (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
+            // Find the start of the word before the dash
+            while (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
+              dashPos--;
+            }
+            extendedStartPos = dashPos + 1;
+          }
+        }
+        // Check backwards for apostrophe (straight or curly) preceded by text
+        if (extendedStartPos > 0 && /['']/.test(originalText[extendedStartPos - 1])) {
+          let apostrophePos = extendedStartPos - 1;
+          apostrophePos--; // Move before the apostrophe
+          // Check if there are word characters immediately before the apostrophe
+          if (apostrophePos >= 0 && /[\w]/.test(originalText[apostrophePos])) {
+            // Find the start of the text before the apostrophe
+            while (apostrophePos >= 0 && /[\w]/.test(originalText[apostrophePos])) {
+              apostrophePos--;
+            }
+            extendedStartPos = apostrophePos + 1;
+          }
+        }
+        // Extend match forwards to include dash-connected words and possessive forms
+        let extendedEndPos = currentPos;
+        // Check for dash followed by word characters (no space between)
+        if (extendedEndPos < originalText.length && originalText[extendedEndPos] === '-') {
+          let dashPos = extendedEndPos;
+          dashPos++; // Move past the dash
+          // Check if there are word characters immediately after the dash
+          if (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
+            // Find the end of the word after the dash
+            while (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
+              dashPos++;
+            }
+            extendedEndPos = dashPos;
+          }
+        }
+        // Check for apostrophe (straight or curly) followed by text
+        if (extendedEndPos < originalText.length && /['']/.test(originalText[extendedEndPos])) {
+          let apostrophePos = extendedEndPos;
+          apostrophePos++; // Move past the apostrophe
+          // Check if there are word characters immediately after the apostrophe
+          if (apostrophePos < originalText.length && /[\w]/.test(originalText[apostrophePos])) {
+            // Find the end of the text after the apostrophe
+            while (apostrophePos < originalText.length && /[\w]/.test(originalText[apostrophePos])) {
+              apostrophePos++;
+            }
+            extendedEndPos = apostrophePos;
+          } else {
+            // Include the apostrophe even if no text follows (for possessives ending in s)
+            extendedEndPos = apostrophePos;
+          }
+        }
+        // Update the matched text if we extended it
+        if (extendedStartPos < startPos || extendedEndPos > currentPos) {
+          originalMatchedText = originalText.substring(extendedStartPos, extendedEndPos);
+        }
         // Check if this is a valid word boundary match (both start and end)
-        const isStartBoundary = startPos === 0 ||
-          /[\s\p{P}]/.test(originalText[startPos - 1]) ||
-          !/[\w]/.test(originalText[startPos - 1]);
+        const isStartBoundary = extendedStartPos === 0 ||
+          /[\s\p{P}]/.test(originalText[extendedStartPos - 1]) ||
+          !/[\w]/.test(originalText[extendedStartPos - 1]);
-        const isEndBoundary = currentPos >= originalText.length ||
-          /[\s\p{P}]/.test(originalText[currentPos]) ||
-          !/[\w]/.test(originalText[currentPos]);
+        const isEndBoundary = extendedEndPos >= originalText.length ||
+          /[\s\p{P}]/.test(originalText[extendedEndPos]) ||
+          !/[\w]/.test(originalText[extendedEndPos]);
         const isWordBoundary = isStartBoundary && isEndBoundary;
@@ -178,8 +248,9 @@ class PrefixTrie {
             matches.push({
               term: termData.term,
               articles: termData.articles,
-              matchedText: originalMatchedText, // Use the original text, not the normalized version
-              length: matchLength,
+              matchedText: originalMatchedText, // Use the extended matched text
+              length: originalMatchedText.length, // Use extended length
+              originalLength: matchLength, // Keep track of original match length for advancement
               priority: termData.priority,
               isExactCase: isExactCase
             });
@@ -224,7 +295,6 @@ function createOptimizedTermMap(twTerms) {
       let variants = new Set([originalTerm]);
       const isName = articles[0].startsWith('names/') || articles[1]?.startsWith('names/')
       variants = generateVariants(originalTerm, isName);
-      console.log(variants)
       for (const variant of variants) {
         if (variant !== originalTerm) {
           trie.insert(variant, originalTerm, articles, false);
@@ -283,9 +353,11 @@ function findMatches(verseText, termTrie) {
         priority: bestMatch.priority
       });
-      // Move past the matched text
-      processedText += matchedText;
-      currentPos += bestMatch.length;
+      // Move past only the original matched text (not the extended part)
+      // This allows finding additional matches within the extended portion
+      const advanceBy = bestMatch.originalLength || bestMatch.length;
+      processedText += normalizedText.substring(currentPos, currentPos + advanceBy);
+      currentPos += advanceBy;
     } else {
       // No match found, move to next character/word boundary
       const nextWordBoundary = normalizedText.substring(currentPos).search(/[\s\p{P}]/u);

package/src/utils/usfm-alignment-remover.js CHANGED

@@ -68,13 +68,13 @@ export const removeAllTagsExceptChapterVerse = (usfmContent) => {
  * @param {string} book - The book identifier
  * @return {Promise<Object>} - Object with chapters and verses
  */
-export async function processUsfmForBook(book) {
+export async function processUsfmForBook(book, dcsHost = 'https://git.door43.org') {
   // Normalize book key to lowercase to match BibleBookData keys
   const key = String(book || '').toLowerCase();
   if (!BibleBookData[key]) throw new Error(`Unknown book: ${book}`);
   const fetch = await getFetch();
-  const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[key].usfm}.usfm?ref=master`;
+  const usfmUrl = `${dcsHost}/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[key].usfm}.usfm?ref=master`;
   const usfmRes = await fetch(usfmUrl);
   if (!usfmRes.ok) throw new Error(`Failed to download USFM file for ${book}`);
   const usfmData = await usfmRes.json();

package/src/utils/zipProcessor.js CHANGED

@@ -1,11 +1,11 @@
 /**
  * Universal TWL zipProcessor - Works in both Node.js and Browser environments
  *
- * Caches the raw ZIP file and processes term headers on-demand
+ * Downloads and processes en_tw ZIP files on-demand (no caching per user request)
  *
  * Usage in React.js:
  *   import { generateTWTerms } from './utils/zipProcessor.js';
- *   const terms = await generateTWTerms();
+ *   const terms = await generateTWTerms('https://git.door43.org');
  */
 import JSZip from "jszip";
@@ -13,141 +13,6 @@ import JSZip from "jszip";
 const isNode = typeof process !== 'undefined' && process.versions?.node;
 const isBrowser = typeof window !== 'undefined';
-const ZIP_URL = 'https://git.door43.org/unfoldingWord/en_tw/archive/master.zip';
-const CACHE_KEY = 'twl_zip_cache';
-const CACHE_VERSION = '1.0';
-// In-memory cache for processed terms (per session)
-let processedTermsCache = null;
-async function getCachedZip() {
-  if (isBrowser) {
-    // Browser: Use localStorage for ZIP cache
-    try {
-      const cached = localStorage.getItem(CACHE_KEY);
-      if (cached) {
-        const data = JSON.parse(cached);
-        // Only use cache if version matches and cache is less than 5 minutes old
-        const FIVE_MINUTES = 5 * 60 * 1000;
-        if (
-          data.version === CACHE_VERSION &&
-          data.timestamp &&
-          (Date.now() - data.timestamp) < FIVE_MINUTES
-        ) {
-          console.log('Using cached ZIP from browser storage');
-          return new Uint8Array(data.zipData);
-        } else {
-          localStorage.removeItem(CACHE_KEY);
-        }
-      }
-    } catch (error) {
-      console.log('Browser ZIP cache corrupted, re-downloading...');
-      try { localStorage.removeItem(CACHE_KEY); } catch (e) { }
-    }
-  }
-  // Note: In Node.js we could cache to filesystem, but fresh download is fine for CLI usage
-  return null;
-}
-/**
- * Cache ZIP data in appropriate storage
- */
-async function cacheZip(zipBuffer) {
-  if (isBrowser) {
-    try {
-      const cacheData = {
-        version: CACHE_VERSION,
-        timestamp: Date.now(),
-        zipData: Array.from(new Uint8Array(zipBuffer))
-      };
-      localStorage.setItem(CACHE_KEY, JSON.stringify(cacheData));
-      console.log('ZIP cached in browser storage');
-    } catch (error) {
-      console.warn('Failed to cache ZIP in browser:', error.message);
-    }
-  }
-}
-/**
- * Get browser storage (localStorage or sessionStorage)
- */
-function getBrowserStorage() {
-  if (!isBrowser) return null;
-  try {
-    return localStorage || sessionStorage || null;
-  } catch (e) {
-    console.warn('Browser storage not available:', e.message);
-    return null;
-  }
-}
-/**
- * Get cached terms from appropriate storage
- */
-async function getCachedTerms() {
-  // Check in-memory cache first (fastest)
-  if (memoryCache) {
-    console.log('Using in-memory cached article terms');
-    return memoryCache;
-  }
-  if (isBrowser) {
-    // Browser caching with localStorage/sessionStorage
-    const storage = getBrowserStorage();
-    if (storage) {
-      try {
-        const cached = storage.getItem(CACHE_KEY);
-        if (cached) {
-          const data = JSON.parse(cached);
-          if (data.version === CACHE_VERSION) {
-            console.log('Using browser cached article terms');
-            memoryCache = data.terms;
-            return data.terms;
-          } else {
-            console.log('Browser cache version mismatch, regenerating...');
-            storage.removeItem(CACHE_KEY);
-          }
-        }
-      } catch (error) {
-        console.log('Browser cache corrupted, regenerating...');
-        try {
-          storage.removeItem(CACHE_KEY);
-        } catch (e) { /* ignore cleanup errors */ }
-      }
-    }
-  }
-  return null;
-}
-/**
- * Cache terms in appropriate storage
- */
-async function cacheTerms(termMap) {
-  // Always cache in memory for this session
-  memoryCache = termMap;
-  if (isBrowser) {
-    // Browser caching
-    const storage = getBrowserStorage();
-    if (storage) {
-      try {
-        const cacheData = {
-          version: CACHE_VERSION,
-          timestamp: Date.now(),
-          terms: termMap
-        };
-        storage.setItem(CACHE_KEY, JSON.stringify(cacheData));
-        console.log('Article terms cached in browser storage');
-      } catch (error) {
-        console.warn('Failed to cache in browser storage:', error.message);
-      }
-    }
-  }
-}
 /**
  * Process ZIP buffer and extract term mappings
  */
@@ -195,28 +60,15 @@ async function processZipBuffer(zipBuffer) {
   return termMap;
 }
-export async function generateTWTerms() {
-  // Check if we already processed terms this session
-  if (processedTermsCache) {
-    console.log('Using in-memory processed terms');
-    return processedTermsCache;
-  }
-  // Try to get cached ZIP first
-  let zipBuffer = await getCachedZip();
+export async function generateTWTerms(dcsHost = 'https://git.door43.org') {
+  // Always download fresh ZIP (no caching per user request)
+  const zipUrl = `${dcsHost}/unfoldingWord/en_tw/archive/master.zip`;
+  console.log(`Downloading TW archive from ${zipUrl}...`);
-  if (!zipBuffer) {
-    // Download fresh ZIP
-    console.log('Downloading TW archive...');
+  const res = await fetch(zipUrl);
+  if (!res.ok) throw new Error(`Failed to download ZIP: ${res.status} ${res.statusText}`);
-    const res = await fetch(ZIP_URL);
-    if (!res.ok) throw new Error(`Failed to download ZIP: ${res.status} ${res.statusText}`);
-    zipBuffer = await res.arrayBuffer();
-    // Cache the ZIP for next time
-    await cacheZip(zipBuffer);
-  }
+  const zipBuffer = await res.arrayBuffer();
   // Process ZIP to extract terms
   console.log('Processing TW articles...');
@@ -224,64 +76,16 @@ export async function generateTWTerms() {
   console.log(`Generated ${Object.keys(termMap).length} terms from TW archive`);
-  // Cache processed terms for this session
-  processedTermsCache = termMap;
   return termMap;
 }
 /**
- * Clear cache - useful for forcing refresh
+ * Get information about the current environment for debugging
  */
-export async function clearCache() {
-  // Clear in-memory cache
-  processedTermsCache = null;
-  if (isBrowser) {
-    try {
-      localStorage.removeItem(CACHE_KEY);
-      console.log('Browser ZIP cache cleared');
-      return true;
-    } catch (error) {
-      console.warn('Failed to clear browser cache:', error.message);
-      return false;
-    }
-  }
-  console.log('Memory cache cleared');
-  return true;
-}
-/**
- * Get cache information for debugging
- */
-export function getCacheInfo() {
-  const info = {
+export function getEnvironmentInfo() {
+  return {
     environment: isNode ? 'Node.js' : (isBrowser ? 'Browser' : 'Unknown'),
-    hasProcessedTerms: !!processedTermsCache,
-    hasZipCache: false,
-    termCount: 0,
-    cacheVersion: CACHE_VERSION
+    hasFetch: typeof fetch !== 'undefined',
+    hasJSZip: typeof JSZip !== 'undefined'
   };
-  // Check processed terms
-  if (processedTermsCache) {
-    info.termCount = Object.keys(processedTermsCache).length;
-  }
-  // Check ZIP cache in browser
-  if (isBrowser) {
-    try {
-      const cached = localStorage.getItem(CACHE_KEY);
-      if (cached) {
-        const data = JSON.parse(cached);
-        info.hasZipCache = true;
-        info.timestamp = data.timestamp ? new Date(data.timestamp) : null;
-      }
-    } catch (error) {
-      // Ignore parse errors
-    }
-  }
-  return info;
 }