twl-generator 1.3.3 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/index.js +41 -11
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.3.3",
|
|
3
|
+
"version": "1.3.5",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -59,4 +59,4 @@
|
|
|
59
59
|
"optional": true
|
|
60
60
|
}
|
|
61
61
|
}
|
|
62
|
-
}
|
|
62
|
+
}
|
package/src/index.js
CHANGED
|
@@ -512,10 +512,9 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
|
|
|
512
512
|
const termOrig = tobj.orig;
|
|
513
513
|
if (termOrig) {
|
|
514
514
|
// Match if the term appears:
|
|
515
|
-
// - At word boundary (beginning …
|
|
516
|
-
// - …
|
|
517
|
-
|
|
518
|
-
const re3 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(termOrig)}(?=\\b|$|[—–-])`, '');
|
|
515
|
+
// - At word boundary (beginning of word or after dash)
|
|
516
|
+
// - Allow substring matching (e.g., "reap" matches "reapers")
|
|
517
|
+
const re3 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(termOrig)}`, '');
|
|
519
518
|
if (re3.test(textOrig)) { stage = 3; termHit = termOrig; break; }
|
|
520
519
|
}
|
|
521
520
|
}
|
|
@@ -592,7 +591,8 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
|
|
|
592
591
|
}
|
|
593
592
|
} else {
|
|
594
593
|
// For non-stripped forms, match at word boundaries or after dashes (case-insensitive)
|
|
595
|
-
|
|
594
|
+
// Allow substring matching (e.g., "reap" matches "reapers")
|
|
595
|
+
const regex4 = new RegExp(`(?:^|\\b|[—–-])${escapeRegExp(form)}`, 'i');
|
|
596
596
|
if (regex4.test(textOrig)) {
|
|
597
597
|
stage = 4;
|
|
598
598
|
termHit = termOrig;
|
|
@@ -612,7 +612,37 @@ function findMatchingArticles(glq, articlesList, termMap, opts = {}) {
|
|
|
612
612
|
return perArticleMatches;
|
|
613
613
|
}
|
|
614
614
|
|
|
615
|
-
|
|
615
|
+
// Get articles for disambiguation: those with matching Strong's OR those with empty Strong's lists
|
|
616
|
+
function getDisambiguationArticles(strongId, strongPivot, termMap, twMap) {
|
|
617
|
+
// Get articles with matching Strong's (same as prioritizeArticles but without prioritization)
|
|
618
|
+
let articlesWithMatchingStrongs = (strongPivot[strongId] || []).slice();
|
|
619
|
+
if ((!articlesWithMatchingStrongs || !articlesWithMatchingStrongs.length) && /^(H|G)\d+[a-f]$/.test(strongId)) {
|
|
620
|
+
const base = strongId.slice(0, -1);
|
|
621
|
+
articlesWithMatchingStrongs = (strongPivot[base] || []).slice();
|
|
622
|
+
}
|
|
623
|
+
|
|
624
|
+
const result = new Set(articlesWithMatchingStrongs);
|
|
625
|
+
|
|
626
|
+
// Add articles that have empty Strong's lists (orphaned articles)
|
|
627
|
+
for (const [article, val] of Object.entries(twMap)) {
|
|
628
|
+
const articleData = val || {};
|
|
629
|
+
const articleStrongs = articleData.strongs || [];
|
|
630
|
+
|
|
631
|
+
// Check if this article has empty Strong's lists
|
|
632
|
+
// An article qualifies if it has no strongs array or if all its strongs arrays are empty
|
|
633
|
+
const hasEmptyStrongs = !Array.isArray(articleStrongs) ||
|
|
634
|
+
articleStrongs.length === 0 ||
|
|
635
|
+
articleStrongs.every(strongsArray => !Array.isArray(strongsArray) || strongsArray.length === 0);
|
|
636
|
+
|
|
637
|
+
if (hasEmptyStrongs) {
|
|
638
|
+
result.add(article);
|
|
639
|
+
}
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
return Array.from(result);
|
|
643
|
+
}
|
|
644
|
+
|
|
645
|
+
function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twMap, opts = {}) {
|
|
616
646
|
const useCompromise = !!opts.useCompromise;
|
|
617
647
|
const nlp = opts.nlp;
|
|
618
648
|
const prioritized = prioritizeArticles(glq, strongId, strongPivot);
|
|
@@ -634,11 +664,11 @@ function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, opts = {})
|
|
|
634
664
|
bestMatches.sort((a, b) => artIndex.get(a.art) - artIndex.get(b.art));
|
|
635
665
|
const chosenMatch = bestMatches[0];
|
|
636
666
|
|
|
637
|
-
// For disambiguation, search …
|
|
638
|
-
const …
|
|
639
|
-
const allMatches = findMatchingArticles(glq, …
|
|
667
|
+
// For disambiguation, search articles with matching Strong's OR articles with empty Strong's lists
|
|
668
|
+
const disambiguationArticles = getDisambiguationArticles(strongId, strongPivot, termMap, twMap);
|
|
669
|
+
const allMatches = findMatchingArticles(glq, disambiguationArticles, termMap, { useCompromise, nlp });
|
|
640
670
|
|
|
641
|
-
// Disambiguation: list all matched articles (from …
|
|
671
|
+
// Disambiguation: list all matched articles (from Strong's + empty Strong's filtered articles)
|
|
642
672
|
const matchesList = allMatches.map(m => m.art);
|
|
643
673
|
const disamb = matchesList.length > 1 ? `(${matchesList.join(', ')})` : '';
|
|
644
674
|
|
|
@@ -966,7 +996,7 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
966
996
|
totalRows++;
|
|
967
997
|
const strongId = cols[H.Strongs];
|
|
968
998
|
const glq = cols[H.GLQuote] || '';
|
|
969
|
-
const result = chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, { useCompromise, nlp });
|
|
999
|
+
const result = chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twJson, { useCompromise, nlp });
|
|
970
1000
|
if (!result) {
|
|
971
1001
|
droppedRows++;
|
|
972
1002
|
if (noMatchSamples.length < 8) {
|