twl-generator 1.4.9 → 1.4.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -3
- package/src/index.js +12 -26
- package/src/utils/twl-matcher.js +41 -28
- package/src/utils/usfm-alignment-remover.js +2 -10
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.4.9",
|
|
3
|
+
"version": "1.4.11",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -51,7 +51,8 @@
|
|
|
51
51
|
"csv-stringify": "^6.5.0",
|
|
52
52
|
"en-inflectors": "^1.0.12",
|
|
53
53
|
"jszip": "^3.10.1",
|
|
54
|
-
"tsv-quote-converters": "^1.1.14"
|
|
54
|
+
"tsv-quote-converters": "^1.1.14",
|
|
55
|
+
"usfm-alignment-remover": "^0.1.6"
|
|
55
56
|
},
|
|
56
57
|
"peerDependencies": {
|
|
57
58
|
"react": ">=16.8.0"
|
|
@@ -61,4 +62,4 @@
|
|
|
61
62
|
"optional": true
|
|
62
63
|
}
|
|
63
64
|
}
|
|
64
|
-
}
|
|
65
|
+
}
|
package/src/index.js
CHANGED
|
@@ -442,7 +442,7 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
|
|
|
442
442
|
let termHit = '';
|
|
443
443
|
let truncated = false;
|
|
444
444
|
|
|
445
|
-
// Stage 1: case-
|
|
445
|
+
// Stage 1: case-insensitive, word-boundary (prioritized)
|
|
446
446
|
if (stage === 0) {
|
|
447
447
|
for (const tobj of terms) {
|
|
448
448
|
const termOrig = tobj.orig;
|
|
@@ -451,28 +451,14 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
|
|
|
451
451
|
for (const a of irregularFormsForTerm(termOrig)) alts.add(a);
|
|
452
452
|
for (const a of conjugationsForTerm(termOrig)) alts.add(a);
|
|
453
453
|
for (const alt of alts) {
|
|
454
|
-
const re1 = new RegExp(`\\b${escapeRegExp(alt)}\\b`);
|
|
454
|
+
const re1 = new RegExp(`\\b${escapeRegExp(alt)}\\b`, 'i');
|
|
455
455
|
if (re1.test(textOrig)) { stage = 1; termHit = termOrig; break; }
|
|
456
456
|
}
|
|
457
457
|
if (stage === 1) break;
|
|
458
458
|
}
|
|
459
459
|
}
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
for (const tobj of terms) {
|
|
463
|
-
const termOrig = tobj.orig;
|
|
464
|
-
const alts = new Set([termOrig]);
|
|
465
|
-
for (const a of pluralizeTerm(termOrig)) alts.add(a);
|
|
466
|
-
for (const a of irregularFormsForTerm(termOrig)) alts.add(a);
|
|
467
|
-
for (const a of conjugationsForTerm(termOrig)) alts.add(a);
|
|
468
|
-
for (const alt of alts) {
|
|
469
|
-
const re2 = new RegExp(`\\b${escapeRegExp(alt)}\\b`, 'i');
|
|
470
|
-
if (re2.test(textOrig)) { stage = 2; termHit = termOrig; break; }
|
|
471
|
-
}
|
|
472
|
-
if (stage === 2) break;
|
|
473
|
-
}
|
|
474
|
-
}
|
|
475
|
-
// Stage 3: case-sensitive, substring matching at word boundaries or after dashes
|
|
460
|
+
|
|
461
|
+
// Stage 2: case-insensitive, substring matching at word boundaries or after dashes
|
|
476
462
|
if (stage === 0) {
|
|
477
463
|
for (const tobj of terms) {
|
|
478
464
|
const termOrig = tobj.orig;
|
|
@@ -480,12 +466,12 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
|
|
|
480
466
|
// Match if the term appears:
|
|
481
467
|
// - At word boundary (beginning of word or after dash)
|
|
482
468
|
// - Allow substring matching (e.g., "reap" matches "reapers")
|
|
483
|
-
const
|
|
484
|
-
if (
|
|
469
|
+
const re2 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(termOrig)}`, 'i');
|
|
470
|
+
if (re2.test(textOrig)) { stage = 2; termHit = termOrig; break; }
|
|
485
471
|
}
|
|
486
472
|
}
|
|
487
473
|
}
|
|
488
|
-
// Stage
|
|
474
|
+
// Stage 3: case-insensitive, substring on derived stripped forms
|
|
489
475
|
if (stage === 0) {
|
|
490
476
|
const strippedForms = (base) => {
|
|
491
477
|
const { head, last } = splitHeadLast(base);
|
|
@@ -550,7 +536,7 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
|
|
|
550
536
|
// Only match if the stripped form is followed by a grammatical ending
|
|
551
537
|
const regex = new RegExp(escapeRegExp(form) + '(ed|ing|er|est|es|ies|s|d|n|t)\\b', 'i');
|
|
552
538
|
if (regex.test(textLower)) {
|
|
553
|
-
stage =
|
|
539
|
+
stage = 3;
|
|
554
540
|
termHit = termOrig;
|
|
555
541
|
truncated = false;
|
|
556
542
|
break outerStrip;
|
|
@@ -558,9 +544,9 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
|
|
|
558
544
|
} else {
|
|
559
545
|
// For non-stripped forms, match at word boundaries or after dashes (case-insensitive)
|
|
560
546
|
// Allow substring matching (e.g., "reap" matches "reapers")
|
|
561
|
-
const
|
|
562
|
-
if (
|
|
563
|
-
stage =
|
|
547
|
+
const regex3 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(form)}`, 'i');
|
|
548
|
+
if (regex3.test(textOrig)) {
|
|
549
|
+
stage = 3;
|
|
564
550
|
termHit = termOrig;
|
|
565
551
|
truncated = false;
|
|
566
552
|
break outerStrip;
|
|
@@ -930,7 +916,7 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
930
916
|
|
|
931
917
|
const ref = `${c}:${v}`;
|
|
932
918
|
const id = genId();
|
|
933
|
-
const primaryArticle = (m.articles && m.articles[0]) || '';
|
|
919
|
+
const primaryArticle = m.preferredArticle || (m.articles && m.articles[0]) || '';
|
|
934
920
|
let tag = '';
|
|
935
921
|
if (primaryArticle.startsWith('kt/')) tag = 'keyterm';
|
|
936
922
|
else if (primaryArticle.startsWith('names/')) tag = 'name';
|
package/src/utils/twl-matcher.js
CHANGED
|
@@ -88,23 +88,19 @@ function generateVariants(term, isName = false) {
|
|
|
88
88
|
}
|
|
89
89
|
|
|
90
90
|
/**
|
|
91
|
-
* Optimized PrefixTrie for fast term matching with case
|
|
91
|
+
* Optimized PrefixTrie for fast term matching with case insensitivity
|
|
92
92
|
*/
|
|
93
93
|
class PrefixTrie {
|
|
94
94
|
constructor() {
|
|
95
|
-
this.
|
|
96
|
-
this.lowerCaseRoot = {}; // For case-insensitive fallback
|
|
95
|
+
this.root = {}; // For case-insensitive matches
|
|
97
96
|
}
|
|
98
97
|
|
|
99
98
|
insert(term, originalTerm, articles, isOriginal = true) {
|
|
100
|
-
// Insert into
|
|
101
|
-
this._insertIntoTree(this.
|
|
102
|
-
|
|
103
|
-
// // Also insert into lowercase trie for fallback - removed, too many falses
|
|
104
|
-
// this._insertIntoTree(this.lowerCaseRoot, term.toLowerCase(), originalTerm, articles, isOriginal, false);
|
|
99
|
+
// Insert into case-insensitive trie (always lowercase)
|
|
100
|
+
this._insertIntoTree(this.root, term.toLowerCase(), originalTerm, articles, isOriginal);
|
|
105
101
|
}
|
|
106
102
|
|
|
107
|
-
_insertIntoTree(root, term, originalTerm, articles, isOriginal
|
|
103
|
+
_insertIntoTree(root, term, originalTerm, articles, isOriginal) {
|
|
108
104
|
let node = root;
|
|
109
105
|
|
|
110
106
|
for (const char of term) {
|
|
@@ -123,24 +119,16 @@ class PrefixTrie {
|
|
|
123
119
|
term: originalTerm,
|
|
124
120
|
articles,
|
|
125
121
|
matchedText: term,
|
|
126
|
-
priority: isOriginal ? 0 : 1
|
|
127
|
-
isExactCase
|
|
122
|
+
priority: isOriginal ? 0 : 1
|
|
128
123
|
});
|
|
129
124
|
}
|
|
130
125
|
|
|
131
126
|
findMatches(text, startPos) {
|
|
132
|
-
//
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
// If no exact case matches, try case-insensitive
|
|
136
|
-
if (matches.length === 0) {
|
|
137
|
-
matches = this._findMatchesInTree(this.lowerCaseRoot, text.toLowerCase(), startPos, false, text);
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
return matches;
|
|
127
|
+
// Always use case-insensitive matching
|
|
128
|
+
return this._findMatchesInTree(this.root, text.toLowerCase(), startPos, text);
|
|
141
129
|
}
|
|
142
130
|
|
|
143
|
-
_findMatchesInTree(root, searchText, startPos,
|
|
131
|
+
_findMatchesInTree(root, searchText, startPos, originalText) {
|
|
144
132
|
const matches = [];
|
|
145
133
|
let node = root;
|
|
146
134
|
let currentPos = startPos;
|
|
@@ -223,15 +211,14 @@ class PrefixTrie {
|
|
|
223
211
|
matchedText: originalMatchedText, // Use the extended matched text
|
|
224
212
|
length: originalMatchedText.length, // Use extended length
|
|
225
213
|
originalLength: matchLength, // Keep track of original match length for advancement
|
|
226
|
-
priority: termData.priority
|
|
227
|
-
isExactCase: isExactCase
|
|
214
|
+
priority: termData.priority
|
|
228
215
|
});
|
|
229
216
|
}
|
|
230
217
|
}
|
|
231
218
|
}
|
|
232
219
|
}
|
|
233
220
|
|
|
234
|
-
// Sort by length (longer first), then by priority
|
|
221
|
+
// Sort by length (longer first), then by priority
|
|
235
222
|
return matches.sort((a, b) => {
|
|
236
223
|
if (b.length !== a.length) {
|
|
237
224
|
return b.length - a.length;
|
|
@@ -239,10 +226,6 @@ class PrefixTrie {
|
|
|
239
226
|
if (a.priority !== b.priority) {
|
|
240
227
|
return a.priority - b.priority;
|
|
241
228
|
}
|
|
242
|
-
// Prefer exact case matches
|
|
243
|
-
if (a.isExactCase !== b.isExactCase) {
|
|
244
|
-
return a.isExactCase ? -1 : 1;
|
|
245
|
-
}
|
|
246
229
|
return 0;
|
|
247
230
|
});
|
|
248
231
|
}
|
|
@@ -308,8 +291,37 @@ function findMatches(verseText, termTrie) {
|
|
|
308
291
|
let bestMatch = null;
|
|
309
292
|
|
|
310
293
|
// Pick the best match (longest, then by priority)
|
|
294
|
+
// But collect all articles from matches of the same length and priority
|
|
311
295
|
if (candidateMatches.length > 0) {
|
|
312
296
|
bestMatch = candidateMatches[0];
|
|
297
|
+
|
|
298
|
+
// Collect all articles from matches with the same length and priority as the best match
|
|
299
|
+
const allArticles = new Set();
|
|
300
|
+
for (const match of candidateMatches) {
|
|
301
|
+
if (match.length === bestMatch.length && match.priority === bestMatch.priority) {
|
|
302
|
+
match.articles.forEach(article => allArticles.add(article));
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
bestMatch.articles = Array.from(allArticles);
|
|
306
|
+
|
|
307
|
+
// Special case for "god" - prefer the appropriate article based on capitalization
|
|
308
|
+
// but keep all articles for disambiguation
|
|
309
|
+
if (bestMatch.matchedText.toLowerCase() === 'god' && bestMatch.articles.length > 1) {
|
|
310
|
+
const originalMatchedText = normalizedText.substring(currentPos, currentPos + bestMatch.length);
|
|
311
|
+
const hasGodArticle = bestMatch.articles.includes('kt/god');
|
|
312
|
+
const hasFalseGodArticle = bestMatch.articles.includes('kt/falsegod');
|
|
313
|
+
|
|
314
|
+
if (hasGodArticle && hasFalseGodArticle) {
|
|
315
|
+
// Check capitalization in original text
|
|
316
|
+
if (originalMatchedText === 'God' || originalMatchedText.charAt(0) === 'G') {
|
|
317
|
+
// Prefer kt/god for capitalized "God"
|
|
318
|
+
bestMatch.preferredArticle = 'kt/god';
|
|
319
|
+
} else {
|
|
320
|
+
// Prefer kt/falsegod for lowercase "god"
|
|
321
|
+
bestMatch.preferredArticle = 'kt/falsegod';
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
}
|
|
313
325
|
}
|
|
314
326
|
|
|
315
327
|
if (bestMatch) {
|
|
@@ -320,6 +332,7 @@ function findMatches(verseText, termTrie) {
|
|
|
320
332
|
matches.push({
|
|
321
333
|
term: bestMatch.term,
|
|
322
334
|
articles: bestMatch.articles,
|
|
335
|
+
preferredArticle: bestMatch.preferredArticle,
|
|
323
336
|
matchedText: matchedText,
|
|
324
337
|
context: context,
|
|
325
338
|
priority: bestMatch.priority
|
package/src/utils/usfm-alignment-remover.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
/* eslint-disable no-async-promise-executor, no-throw-literal */
|
|
2
2
|
|
|
3
3
|
import { BibleBookData } from '../common/books.js';
|
|
4
|
+
import { removeAlignments } from 'usfm-alignment-remover';
|
|
4
5
|
|
|
5
6
|
// Environment detection
|
|
6
7
|
const isNode = typeof window === 'undefined' && typeof process !== 'undefined' && process.versions?.node;
|
|
@@ -26,16 +27,7 @@ function decodeBase64(base64String) {
|
|
|
26
27
|
export const removeAllTagsExceptChapterVerse = (usfmContent) => {
|
|
27
28
|
if (!usfmContent) return '';
|
|
28
29
|
|
|
29
|
-
let cleanContent = usfmContent;
|
|
30
|
-
|
|
31
|
-
// Remove word-level alignment markers like \w word|lemma="lemma" strong="H1234"\w*
|
|
32
|
-
cleanContent = cleanContent.replace(/\\w\s+([^|\\]+)\|[^\\]*\\w\*/g, '$1');
|
|
33
|
-
|
|
34
|
-
// Remove milestone markers like \zaln-s | \zaln-e\*
|
|
35
|
-
cleanContent = cleanContent.replace(/\\zaln-[se][^\\]*\\?\*?/g, '');
|
|
36
|
-
|
|
37
|
-
// Remove other alignment-related markers
|
|
38
|
-
cleanContent = cleanContent.replace(/\\k-[se][^\\]*\\?\*?/g, '');
|
|
30
|
+
let cleanContent = removeAlignments(usfmContent);
|
|
39
31
|
|
|
40
32
|
// Remove empty lines that might result from marker removal
|
|
41
33
|
cleanContent = cleanContent.replace(/\n\s*\n\s*\n/g, '\n\n');
|