twl-generator 1.4.9 → 1.4.11

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "twl-generator",
3
- "version": "1.4.9",
3
+ "version": "1.4.11",
4
4
  "description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -51,7 +51,8 @@
51
51
  "csv-stringify": "^6.5.0",
52
52
  "en-inflectors": "^1.0.12",
53
53
  "jszip": "^3.10.1",
54
- "tsv-quote-converters": "^1.1.14"
54
+ "tsv-quote-converters": "^1.1.14",
55
+ "usfm-alignment-remover": "^0.1.6"
55
56
  },
56
57
  "peerDependencies": {
57
58
  "react": ">=16.8.0"
@@ -61,4 +62,4 @@
61
62
  "optional": true
62
63
  }
63
64
  }
64
- }
65
+ }
package/src/index.js CHANGED
@@ -442,7 +442,7 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
442
442
  let termHit = '';
443
443
  let truncated = false;
444
444
 
445
- // Stage 1: case-sensitive, word-boundary
445
+ // Stage 1: case-insensitive, word-boundary (prioritized)
446
446
  if (stage === 0) {
447
447
  for (const tobj of terms) {
448
448
  const termOrig = tobj.orig;
@@ -451,28 +451,14 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
451
451
  for (const a of irregularFormsForTerm(termOrig)) alts.add(a);
452
452
  for (const a of conjugationsForTerm(termOrig)) alts.add(a);
453
453
  for (const alt of alts) {
454
- const re1 = new RegExp(`\\b${escapeRegExp(alt)}\\b`);
454
+ const re1 = new RegExp(`\\b${escapeRegExp(alt)}\\b`, 'i');
455
455
  if (re1.test(textOrig)) { stage = 1; termHit = termOrig; break; }
456
456
  }
457
457
  if (stage === 1) break;
458
458
  }
459
459
  }
460
- // Stage 2: case-insensitive, word-boundary
461
- if (stage === 0) {
462
- for (const tobj of terms) {
463
- const termOrig = tobj.orig;
464
- const alts = new Set([termOrig]);
465
- for (const a of pluralizeTerm(termOrig)) alts.add(a);
466
- for (const a of irregularFormsForTerm(termOrig)) alts.add(a);
467
- for (const a of conjugationsForTerm(termOrig)) alts.add(a);
468
- for (const alt of alts) {
469
- const re2 = new RegExp(`\\b${escapeRegExp(alt)}\\b`, 'i');
470
- if (re2.test(textOrig)) { stage = 2; termHit = termOrig; break; }
471
- }
472
- if (stage === 2) break;
473
- }
474
- }
475
- // Stage 3: case-sensitive, substring matching at word boundaries or after dashes
460
+
461
+ // Stage 2: case-insensitive, substring matching at word boundaries or after dashes
476
462
  if (stage === 0) {
477
463
  for (const tobj of terms) {
478
464
  const termOrig = tobj.orig;
@@ -480,12 +466,12 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
480
466
  // Match if the term appears:
481
467
  // - At word boundary (beginning of word or after dash)
482
468
  // - Allow substring matching (e.g., "reap" matches "reapers")
483
- const re3 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(termOrig)}`, '');
484
- if (re3.test(textOrig)) { stage = 3; termHit = termOrig; break; }
469
+ const re2 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(termOrig)}`, 'i');
470
+ if (re2.test(textOrig)) { stage = 2; termHit = termOrig; break; }
485
471
  }
486
472
  }
487
473
  }
488
- // Stage 4: case-insensitive, substring on derived stripped forms
474
+ // Stage 3: case-insensitive, substring on derived stripped forms
489
475
  if (stage === 0) {
490
476
  const strippedForms = (base) => {
491
477
  const { head, last } = splitHeadLast(base);
@@ -550,7 +536,7 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
550
536
  // Only match if the stripped form is followed by a grammatical ending
551
537
  const regex = new RegExp(escapeRegExp(form) + '(ed|ing|er|est|es|ies|s|d|n|t)\\b', 'i');
552
538
  if (regex.test(textLower)) {
553
- stage = 4;
539
+ stage = 3;
554
540
  termHit = termOrig;
555
541
  truncated = false;
556
542
  break outerStrip;
@@ -558,9 +544,9 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
558
544
  } else {
559
545
  // For non-stripped forms, match at word boundaries or after dashes (case-insensitive)
560
546
  // Allow substring matching (e.g., "reap" matches "reapers")
561
- const regex4 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(form)}`, 'i');
562
- if (regex4.test(textOrig)) {
563
- stage = 4;
547
+ const regex3 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(form)}`, 'i');
548
+ if (regex3.test(textOrig)) {
549
+ stage = 3;
564
550
  termHit = termOrig;
565
551
  truncated = false;
566
552
  break outerStrip;
@@ -930,7 +916,7 @@ export async function generateTwlByBook(bookCode, options = {}) {
930
916
 
931
917
  const ref = `${c}:${v}`;
932
918
  const id = genId();
933
- const primaryArticle = (m.articles && m.articles[0]) || '';
919
+ const primaryArticle = m.preferredArticle || (m.articles && m.articles[0]) || '';
934
920
  let tag = '';
935
921
  if (primaryArticle.startsWith('kt/')) tag = 'keyterm';
936
922
  else if (primaryArticle.startsWith('names/')) tag = 'name';
@@ -88,23 +88,19 @@ function generateVariants(term, isName = false) {
88
88
  }
89
89
 
90
90
  /**
91
- * Optimized PrefixTrie for fast term matching with case sensitivity
91
+ * Optimized PrefixTrie for fast term matching with case insensitivity
92
92
  */
93
93
  class PrefixTrie {
94
94
  constructor() {
95
- this.exactCaseRoot = {}; // For exact case matches
96
- this.lowerCaseRoot = {}; // For case-insensitive fallback
95
+ this.root = {}; // For case-insensitive matches
97
96
  }
98
97
 
99
98
  insert(term, originalTerm, articles, isOriginal = true) {
100
- // Insert into exact case trie
101
- this._insertIntoTree(this.exactCaseRoot, term, originalTerm, articles, isOriginal, true);
102
-
103
- // // Also insert into lowercase trie for fallback - removed, too many falses
104
- // this._insertIntoTree(this.lowerCaseRoot, term.toLowerCase(), originalTerm, articles, isOriginal, false);
99
+ // Insert into case-insensitive trie (always lowercase)
100
+ this._insertIntoTree(this.root, term.toLowerCase(), originalTerm, articles, isOriginal);
105
101
  }
106
102
 
107
- _insertIntoTree(root, term, originalTerm, articles, isOriginal, isExactCase) {
103
+ _insertIntoTree(root, term, originalTerm, articles, isOriginal) {
108
104
  let node = root;
109
105
 
110
106
  for (const char of term) {
@@ -123,24 +119,16 @@ class PrefixTrie {
123
119
  term: originalTerm,
124
120
  articles,
125
121
  matchedText: term,
126
- priority: isOriginal ? 0 : 1,
127
- isExactCase
122
+ priority: isOriginal ? 0 : 1
128
123
  });
129
124
  }
130
125
 
131
126
  findMatches(text, startPos) {
132
- // First try exact case matches
133
- let matches = this._findMatchesInTree(this.exactCaseRoot, text, startPos, true, text);
134
-
135
- // If no exact case matches, try case-insensitive
136
- if (matches.length === 0) {
137
- matches = this._findMatchesInTree(this.lowerCaseRoot, text.toLowerCase(), startPos, false, text);
138
- }
139
-
140
- return matches;
127
+ // Always use case-insensitive matching
128
+ return this._findMatchesInTree(this.root, text.toLowerCase(), startPos, text);
141
129
  }
142
130
 
143
- _findMatchesInTree(root, searchText, startPos, isExactCase, originalText) {
131
+ _findMatchesInTree(root, searchText, startPos, originalText) {
144
132
  const matches = [];
145
133
  let node = root;
146
134
  let currentPos = startPos;
@@ -223,15 +211,14 @@ class PrefixTrie {
223
211
  matchedText: originalMatchedText, // Use the extended matched text
224
212
  length: originalMatchedText.length, // Use extended length
225
213
  originalLength: matchLength, // Keep track of original match length for advancement
226
- priority: termData.priority,
227
- isExactCase: isExactCase
214
+ priority: termData.priority
228
215
  });
229
216
  }
230
217
  }
231
218
  }
232
219
  }
233
220
 
234
- // Sort by length (longer first), then by priority, then by case match preference
221
+ // Sort by length (longer first), then by priority
235
222
  return matches.sort((a, b) => {
236
223
  if (b.length !== a.length) {
237
224
  return b.length - a.length;
@@ -239,10 +226,6 @@ class PrefixTrie {
239
226
  if (a.priority !== b.priority) {
240
227
  return a.priority - b.priority;
241
228
  }
242
- // Prefer exact case matches
243
- if (a.isExactCase !== b.isExactCase) {
244
- return a.isExactCase ? -1 : 1;
245
- }
246
229
  return 0;
247
230
  });
248
231
  }
@@ -308,8 +291,37 @@ function findMatches(verseText, termTrie) {
308
291
  let bestMatch = null;
309
292
 
310
293
  // Pick the best match (longest, then by priority)
294
+ // But collect all articles from matches of the same length and priority
311
295
  if (candidateMatches.length > 0) {
312
296
  bestMatch = candidateMatches[0];
297
+
298
+ // Collect all articles from matches with the same length and priority as the best match
299
+ const allArticles = new Set();
300
+ for (const match of candidateMatches) {
301
+ if (match.length === bestMatch.length && match.priority === bestMatch.priority) {
302
+ match.articles.forEach(article => allArticles.add(article));
303
+ }
304
+ }
305
+ bestMatch.articles = Array.from(allArticles);
306
+
307
+ // Special case for "god" - prefer the appropriate article based on capitalization
308
+ // but keep all articles for disambiguation
309
+ if (bestMatch.matchedText.toLowerCase() === 'god' && bestMatch.articles.length > 1) {
310
+ const originalMatchedText = normalizedText.substring(currentPos, currentPos + bestMatch.length);
311
+ const hasGodArticle = bestMatch.articles.includes('kt/god');
312
+ const hasFalseGodArticle = bestMatch.articles.includes('kt/falsegod');
313
+
314
+ if (hasGodArticle && hasFalseGodArticle) {
315
+ // Check capitalization in original text
316
+ if (originalMatchedText === 'God' || originalMatchedText.charAt(0) === 'G') {
317
+ // Prefer kt/god for capitalized "God"
318
+ bestMatch.preferredArticle = 'kt/god';
319
+ } else {
320
+ // Prefer kt/falsegod for lowercase "god"
321
+ bestMatch.preferredArticle = 'kt/falsegod';
322
+ }
323
+ }
324
+ }
313
325
  }
314
326
 
315
327
  if (bestMatch) {
@@ -320,6 +332,7 @@ function findMatches(verseText, termTrie) {
320
332
  matches.push({
321
333
  term: bestMatch.term,
322
334
  articles: bestMatch.articles,
335
+ preferredArticle: bestMatch.preferredArticle,
323
336
  matchedText: matchedText,
324
337
  context: context,
325
338
  priority: bestMatch.priority
@@ -1,6 +1,7 @@
1
1
  /* eslint-disable no-async-promise-executor, no-throw-literal */
2
2
 
3
3
  import { BibleBookData } from '../common/books.js';
4
+ import { removeAlignments } from 'usfm-alignment-remover';
4
5
 
5
6
  // Environment detection
6
7
  const isNode = typeof window === 'undefined' && typeof process !== 'undefined' && process.versions?.node;
@@ -26,16 +27,7 @@ function decodeBase64(base64String) {
26
27
  export const removeAllTagsExceptChapterVerse = (usfmContent) => {
27
28
  if (!usfmContent) return '';
28
29
 
29
- let cleanContent = usfmContent;
30
-
31
- // Remove word-level alignment markers like \w word|lemma="lemma" strong="H1234"\w*
32
- cleanContent = cleanContent.replace(/\\w\s+([^|\\]+)\|[^\\]*\\w\*/g, '$1');
33
-
34
- // Remove milestone markers like \zaln-s | \zaln-e\*
35
- cleanContent = cleanContent.replace(/\\zaln-[se][^\\]*\\?\*?/g, '');
36
-
37
- // Remove other alignment-related markers
38
- cleanContent = cleanContent.replace(/\\k-[se][^\\]*\\?\*?/g, '');
30
+ let cleanContent = removeAlignments(usfmContent);
39
31
 
40
32
  // Remove empty lines that might result from marker removal
41
33
  cleanContent = cleanContent.replace(/\n\s*\n\s*\n/g, '\n\n');