twl-generator 1.3.7 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +6 -1
- package/src/index.js +113 -141
- package/src/utils/twl-matcher.js +94 -12
- package/src/utils/usfm-alignment-remover.js +5 -3
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.3.7",
|
|
3
|
+
"version": "1.4.1",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
package/src/cli.js
CHANGED
|
@@ -63,7 +63,12 @@ async function main() {
|
|
|
63
63
|
console.log(`Wrote ${out}`);
|
|
64
64
|
const dir = path.dirname(outPath);
|
|
65
65
|
const base = path.basename(outPath);
|
|
66
|
-
|
|
66
|
+
// Derive a sensible no-match filename when --out doesn't follow *.twl.tsv
|
|
67
|
+
let nmFile;
|
|
68
|
+
if (/\.twl\.tsv$/i.test(base)) nmFile = base.replace(/\.twl\.tsv$/i, '.no-match.twl.tsv');
|
|
69
|
+
else if (/\.tsv$/i.test(base)) nmFile = base.replace(/\.tsv$/i, '.no-match.twl.tsv');
|
|
70
|
+
else nmFile = base + '.no-match.twl.tsv';
|
|
71
|
+
const nmPath = path.join(dir, nmFile);
|
|
67
72
|
await fs.writeFile(nmPath, noMatchTsv, 'utf8');
|
|
68
73
|
console.log(`Wrote ${nmPath}`);
|
|
69
74
|
} else if (outDir) {
|
package/src/index.js
CHANGED
|
@@ -863,79 +863,37 @@ function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twMap, opts
|
|
|
863
863
|
}
|
|
864
864
|
|
|
865
865
|
export async function generateTwlByBook(bookCode, options = {}) {
|
|
866
|
-
//
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
const
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
const
|
|
873
|
-
|
|
866
|
+
// New: English-first matching (no Strong's), using ULT USFM verses
|
|
867
|
+
// Build term -> [articles] from local tw_strongs_list.json (terms only; ignore Strong's)
|
|
868
|
+
const twJson = await loadTwJsonLocal();
|
|
869
|
+
const termToArticles = {};
|
|
870
|
+
for (const [article, val] of Object.entries(twJson)) {
|
|
871
|
+
const terms = (val && val.article && Array.isArray(val.article.terms)) ? val.article.terms : [];
|
|
872
|
+
for (const raw of terms) {
|
|
873
|
+
const term = String(raw || '').replace(/\s*\([^)]*\)\s*$/, '').trim();
|
|
874
|
+
if (!term) continue;
|
|
875
|
+
if (!termToArticles[term]) termToArticles[term] = [];
|
|
876
|
+
// Use slug as-is (e.g., kt/grace)
|
|
877
|
+
termToArticles[term].push(article);
|
|
878
|
+
}
|
|
874
879
|
}
|
|
880
|
+
|
|
881
|
+
// Build trie for fast scanning
|
|
882
|
+
const { buildTermTrie, scanVerseMatches } = await import('./utils/twl-matcher.js');
|
|
883
|
+
const trie = buildTermTrie(termToArticles);
|
|
884
|
+
|
|
885
|
+
// Fetch and parse ULT USFM into verses
|
|
886
|
+
const { processUsfmForBook } = await import('./utils/usfm-alignment-remover.js');
|
|
875
887
|
const bibleData = await readBooks();
|
|
876
888
|
const meta = findBookMeta(bibleData, bookCode);
|
|
877
889
|
if (!meta) throw new Error(`Unknown book code: ${bookCode}`);
|
|
878
|
-
const
|
|
879
|
-
const twJson = await loadTwJsonLocal();
|
|
880
|
-
const strongPivot = pivotByStrong(twJson);
|
|
881
|
-
|
|
882
|
-
// 1) initial TSV
|
|
883
|
-
const baseTsv = buildInitialTsv(usfm, strongPivot, meta.key);
|
|
884
|
-
|
|
885
|
-
// 2) add GLQuote and GLOccurrence
|
|
886
|
-
const glRes = await addGLQuoteCols({
|
|
887
|
-
bibleLinks: ["unfoldingWord/en_ult/master"],
|
|
888
|
-
bookCode: meta.key,
|
|
889
|
-
tsvContent: baseTsv,
|
|
890
|
-
trySeparatorsAndOccurrences: true,
|
|
891
|
-
});
|
|
892
|
-
const withGl = glRes.output;
|
|
893
|
-
|
|
894
|
-
// 3) Convert GLQuote/GLOccurrence into OrigWords/Occurrence and convert to OL quotes BEFORE matching
|
|
895
|
-
const lines0 = withGl.split(/\r?\n/);
|
|
896
|
-
const header0 = lines0.shift();
|
|
897
|
-
const h0 = header0.split('\t');
|
|
898
|
-
const I0 = {
|
|
899
|
-
Reference: h0.indexOf('Reference'),
|
|
900
|
-
ID: h0.indexOf('ID'),
|
|
901
|
-
Tags: h0.indexOf('Tags'),
|
|
902
|
-
OrigWords: h0.indexOf('OrigWords'),
|
|
903
|
-
Occurrence: h0.indexOf('Occurrence'),
|
|
904
|
-
TWLink: h0.indexOf('TWLink'),
|
|
905
|
-
GLQuote: h0.indexOf('GLQuote'),
|
|
906
|
-
GLOccurrence: h0.indexOf('GLOccurrence'),
|
|
907
|
-
};
|
|
908
|
-
const rebuilt0 = [header0].concat(lines0.filter(Boolean).map(row => {
|
|
909
|
-
const c = row.split('\t');
|
|
910
|
-
const newCols = c.slice();
|
|
911
|
-
if (I0.GLQuote >= 0) newCols[I0.OrigWords] = c[I0.GLQuote];
|
|
912
|
-
if (I0.GLOccurrence >= 0) newCols[I0.Occurrence] = c[I0.GLOccurrence];
|
|
913
|
-
return newCols.join('\t');
|
|
914
|
-
})).join('\n');
|
|
915
|
-
const convEarly = await convertGLQuotes2OLQuotes({
|
|
916
|
-
bibleLinks: ["unfoldingWord/en_ult/master"],
|
|
917
|
-
bookCode: meta.key,
|
|
918
|
-
tsvContent: rebuilt0,
|
|
919
|
-
trySeparatorsAndOccurrences: true,
|
|
920
|
-
});
|
|
890
|
+
const versesByChapter = await processUsfmForBook(meta.key);
|
|
921
891
|
|
|
922
|
-
//
|
|
923
|
-
const
|
|
924
|
-
const
|
|
925
|
-
const aCols = headerA.split('\t');
|
|
926
|
-
const A = {
|
|
927
|
-
Reference: aCols.indexOf('Reference'),
|
|
928
|
-
ID: aCols.indexOf('ID'),
|
|
929
|
-
Tags: aCols.indexOf('Tags'),
|
|
930
|
-
OrigWords: aCols.indexOf('OrigWords'),
|
|
931
|
-
Occurrence: aCols.indexOf('Occurrence'),
|
|
932
|
-
TWLink: aCols.indexOf('TWLink'),
|
|
933
|
-
GLQuote: aCols.indexOf('GLQuote'),
|
|
934
|
-
GLOccurrence: aCols.indexOf('GLOccurrence'),
|
|
935
|
-
};
|
|
892
|
+
// Header without Strongs; keep GLQuote/GLOccurrence and add Variant of, Disambiguation
|
|
893
|
+
const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Variant of', 'Disambiguation'];
|
|
894
|
+
const outRows = [header.join('\t')];
|
|
936
895
|
|
|
937
|
-
//
|
|
938
|
-
const finalHeaderBase = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Strongs'];
|
|
896
|
+
// ID generator
|
|
939
897
|
const usedIds = new Set();
|
|
940
898
|
const genId = () => {
|
|
941
899
|
const letters = 'abcdefghijklmnopqrstuvwxyz';
|
|
@@ -949,86 +907,100 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
949
907
|
}
|
|
950
908
|
};
|
|
951
909
|
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
if (
|
|
955
|
-
|
|
956
|
-
if (
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
GLQuote: 6,
|
|
982
|
-
GLOccurrence: 7,
|
|
983
|
-
Strongs: 8,
|
|
910
|
+
// Helpers for Variant of decision (allow only plural/-ed/-ing without marking variant)
|
|
911
|
+
const pluralizeWord = (w) => {
|
|
912
|
+
if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ies');
|
|
913
|
+
if (/(s|x|z|ch|sh)$/i.test(w)) return w + 'es';
|
|
914
|
+
if (/f$/i.test(w) && !/(roof|belief|chief|proof)$/i.test(w)) return w.replace(/f$/i, 'ves');
|
|
915
|
+
if (/fe$/i.test(w)) return w.replace(/fe$/i, 'ves');
|
|
916
|
+
if (/o$/i.test(w)) return w + 'es';
|
|
917
|
+
return w + 's';
|
|
918
|
+
};
|
|
919
|
+
const isVowel = (ch) => /[aeiou]/i.test(ch);
|
|
920
|
+
const isConsonant = (ch) => /[a-z]/i.test(ch) && !isVowel(ch);
|
|
921
|
+
const endsWithCVC = (w) => w.length >= 3 && isConsonant(w[w.length - 3]) && isVowel(w[w.length - 2]) && isConsonant(w[w.length - 1]) && !/[wxy]/i.test(w[w.length - 1]);
|
|
922
|
+
const edForm = (w) => (/e$/i.test(w) ? w + 'd' : (/[^aeiou]y$/i.test(w) ? w.replace(/y$/i, 'ied') : (endsWithCVC(w) ? w + w[w.length - 1] + 'ed' : w + 'ed')));
|
|
923
|
+
const ingForm = (w) => (/ie$/i.test(w) ? w.replace(/ie$/i, 'ying') : (/ee$/i.test(w) ? w + 'ing' : (/e$/i.test(w) ? w.replace(/e$/i, 'ing') : (endsWithCVC(w) ? w + w[w.length - 1] + 'ing' : w + 'ing'))));
|
|
924
|
+
|
|
925
|
+
const allowNoVariant = (base, match) => {
|
|
926
|
+
const b = String(base || '');
|
|
927
|
+
const m = String(match || '');
|
|
928
|
+
if (!b || !m) return true;
|
|
929
|
+
if (b.toLowerCase() === m.toLowerCase()) return true;
|
|
930
|
+
const parts = b.trim().split(/\s+/);
|
|
931
|
+
const head = parts.length > 1 ? parts.slice(0, -1).join(' ') + ' ' : '';
|
|
932
|
+
const last = parts[parts.length - 1];
|
|
933
|
+
const allowed = new Set([
|
|
934
|
+
head + pluralizeWord(last),
|
|
935
|
+
head + edForm(last),
|
|
936
|
+
head + ingForm(last),
|
|
937
|
+
].map(x => x.toLowerCase()));
|
|
938
|
+
return allowed.has(m.toLowerCase());
|
|
984
939
|
};
|
|
985
940
|
|
|
986
|
-
//
|
|
987
|
-
const
|
|
988
|
-
const
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
const
|
|
1004
|
-
|
|
941
|
+
// Walk through verses in order
|
|
942
|
+
const chapterNums = Object.keys(versesByChapter).map(n => parseInt(n, 10)).sort((a, b) => a - b);
|
|
943
|
+
for (const c of chapterNums) {
|
|
944
|
+
const verses = versesByChapter[c] || {};
|
|
945
|
+
const verseNums = Object.keys(verses).map(n => parseInt(n, 10)).sort((a, b) => a - b);
|
|
946
|
+
for (const v of verseNums) {
|
|
947
|
+
const text = verses[v] || '';
|
|
948
|
+
const matches = scanVerseMatches(text, trie);
|
|
949
|
+
// Count occurrences per exact matchedText (case-sensitive)
|
|
950
|
+
const occMap = new Map();
|
|
951
|
+
for (const m of matches) {
|
|
952
|
+
const glq = m.matchedText;
|
|
953
|
+
const occ = (occMap.get(glq) || 0) + 1;
|
|
954
|
+
occMap.set(glq, occ);
|
|
955
|
+
|
|
956
|
+
const ref = `${c}:${v}`;
|
|
957
|
+
const id = genId();
|
|
958
|
+
const primaryArticle = (m.articles && m.articles[0]) || '';
|
|
959
|
+
let tag = '';
|
|
960
|
+
if (primaryArticle.startsWith('kt/')) tag = 'keyterm';
|
|
961
|
+
else if (primaryArticle.startsWith('names/')) tag = 'name';
|
|
962
|
+
const twLink = primaryArticle ? `rc://*/tw/dict/bible/${primaryArticle}` : '';
|
|
963
|
+
|
|
964
|
+
// Variant of: only if beyond plural/-ed/-ing differences
|
|
965
|
+
const variantOf = allowNoVariant(m.term, glq) ? '' : m.term;
|
|
966
|
+
// Disambiguation: list all candidate articles for this match
|
|
967
|
+
const disamb = (m.articles && m.articles.length > 1) ? `(${m.articles.join(', ')})` : '';
|
|
968
|
+
|
|
969
|
+
// Set OrigWords/Occurrence equal to GLQuote/GLOccurrence for English-first output
|
|
970
|
+
outRows.push([
|
|
971
|
+
ref,
|
|
972
|
+
id,
|
|
973
|
+
tag,
|
|
974
|
+
glq,
|
|
975
|
+
String(occ),
|
|
976
|
+
twLink,
|
|
977
|
+
glq,
|
|
978
|
+
String(occ),
|
|
979
|
+
variantOf,
|
|
980
|
+
disamb,
|
|
981
|
+
].join('\t'));
|
|
1005
982
|
}
|
|
1006
|
-
const tried = prioritizeArticles(glq, strongId, strongPivot) || [];
|
|
1007
|
-
const disambTried = tried.length ? `(${tried.join(', ')})` : '';
|
|
1008
|
-
noMatchRows.push(cols.join('\t') + '\t' + disambTried);
|
|
1009
|
-
continue;
|
|
1010
983
|
}
|
|
1011
|
-
const art = result.article;
|
|
1012
|
-
cols[H.TWLink] = `rc://*/tw/dict/bible/${art}`;
|
|
1013
|
-
// Update Tags based on selected article prefix
|
|
1014
|
-
let tag = '';
|
|
1015
|
-
if (art.startsWith('kt/')) tag = 'keyterm';
|
|
1016
|
-
else if (art.startsWith('names/')) tag = 'name';
|
|
1017
|
-
cols[H.Tags] = tag;
|
|
1018
|
-
if (result.disamb) multiDisambRows++;
|
|
1019
|
-
const variantOf = result.variantTerm || '';
|
|
1020
|
-
outRows.push(cols.join('\t') + '\t' + variantOf + '\t' + (result.disamb || ''));
|
|
1021
984
|
}
|
|
1022
985
|
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
986
|
+
// Build TSV and convert GL OrigWords back to OL using tsv-quote-converters
|
|
987
|
+
let matchedTsv = outRows.join('\n');
|
|
988
|
+
try {
|
|
989
|
+
const { convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');
|
|
990
|
+
const conv = await convertGLQuotes2OLQuotes({
|
|
991
|
+
bibleLinks: ['unfoldingWord/en_ult/master'],
|
|
992
|
+
bookCode: String(meta.key || bookCode).toLowerCase(),
|
|
993
|
+
tsvContent: matchedTsv,
|
|
994
|
+
trySeparatorsAndOccurrences: true,
|
|
995
|
+
quiet: true,
|
|
996
|
+
});
|
|
997
|
+
if (conv && typeof conv.output === 'string' && conv.output.length) {
|
|
998
|
+
matchedTsv = conv.output;
|
|
999
|
+
}
|
|
1000
|
+
} catch (e) {
|
|
1001
|
+
// If conversion fails (e.g., no network), fall back to unconverted TSV
|
|
1029
1002
|
}
|
|
1030
|
-
|
|
1031
|
-
const
|
|
1032
|
-
const noMatchTsv = noMatchRows.join('\n');
|
|
1003
|
+
const noMatchHeader = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Disambiguation'];
|
|
1004
|
+
const noMatchTsv = [noMatchHeader.join('\t')].join('\n');
|
|
1033
1005
|
return { matchedTsv, noMatchTsv };
|
|
1034
1006
|
}
|
package/src/utils/twl-matcher.js
CHANGED
|
@@ -160,16 +160,86 @@ class PrefixTrie {
|
|
|
160
160
|
if (node._terms) {
|
|
161
161
|
const matchLength = currentPos - startPos;
|
|
162
162
|
// Always extract from the original text to preserve case
|
|
163
|
-
|
|
163
|
+
let originalMatchedText = originalText.substring(startPos, currentPos);
|
|
164
|
+
|
|
165
|
+
// Extend match backwards to include dash-connected words and possessive forms
|
|
166
|
+
let extendedStartPos = startPos;
|
|
167
|
+
|
|
168
|
+
// Check backwards for dash preceded by word characters (no space between)
|
|
169
|
+
if (extendedStartPos > 0 && originalText[extendedStartPos - 1] === '-') {
|
|
170
|
+
let dashPos = extendedStartPos - 1;
|
|
171
|
+
dashPos--; // Move before the dash
|
|
172
|
+
// Check if there are word characters immediately before the dash
|
|
173
|
+
if (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
|
|
174
|
+
// Find the start of the word before the dash
|
|
175
|
+
while (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
|
|
176
|
+
dashPos--;
|
|
177
|
+
}
|
|
178
|
+
extendedStartPos = dashPos + 1;
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
// Check backwards for apostrophe (straight or curly) preceded by text
|
|
183
|
+
if (extendedStartPos > 0 && /['’]/.test(originalText[extendedStartPos - 1])) {
|
|
184
|
+
let apostrophePos = extendedStartPos - 1;
|
|
185
|
+
apostrophePos--; // Move before the apostrophe
|
|
186
|
+
// Check if there are word characters immediately before the apostrophe
|
|
187
|
+
if (apostrophePos >= 0 && /[\w]/.test(originalText[apostrophePos])) {
|
|
188
|
+
// Find the start of the text before the apostrophe
|
|
189
|
+
while (apostrophePos >= 0 && /[\w]/.test(originalText[apostrophePos])) {
|
|
190
|
+
apostrophePos--;
|
|
191
|
+
}
|
|
192
|
+
extendedStartPos = apostrophePos + 1;
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// Extend match forwards to include dash-connected words and possessive forms
|
|
197
|
+
let extendedEndPos = currentPos;
|
|
198
|
+
|
|
199
|
+
// Check for dash followed by word characters (no space between)
|
|
200
|
+
if (extendedEndPos < originalText.length && originalText[extendedEndPos] === '-') {
|
|
201
|
+
let dashPos = extendedEndPos;
|
|
202
|
+
dashPos++; // Move past the dash
|
|
203
|
+
// Check if there are word characters immediately after the dash
|
|
204
|
+
if (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
|
|
205
|
+
// Find the end of the word after the dash
|
|
206
|
+
while (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
|
|
207
|
+
dashPos++;
|
|
208
|
+
}
|
|
209
|
+
extendedEndPos = dashPos;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
// Check for apostrophe (straight or curly) followed by text
|
|
214
|
+
if (extendedEndPos < originalText.length && /['’]/.test(originalText[extendedEndPos])) {
|
|
215
|
+
let apostrophePos = extendedEndPos;
|
|
216
|
+
apostrophePos++; // Move past the apostrophe
|
|
217
|
+
// Check if there are word characters immediately after the apostrophe
|
|
218
|
+
if (apostrophePos < originalText.length && /[\w]/.test(originalText[apostrophePos])) {
|
|
219
|
+
// Find the end of the text after the apostrophe
|
|
220
|
+
while (apostrophePos < originalText.length && /[\w]/.test(originalText[apostrophePos])) {
|
|
221
|
+
apostrophePos++;
|
|
222
|
+
}
|
|
223
|
+
extendedEndPos = apostrophePos;
|
|
224
|
+
} else {
|
|
225
|
+
// Include the apostrophe even if no text follows (for possessives ending in s)
|
|
226
|
+
extendedEndPos = apostrophePos;
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
// Update the matched text if we extended it
|
|
231
|
+
if (extendedStartPos < startPos || extendedEndPos > currentPos) {
|
|
232
|
+
originalMatchedText = originalText.substring(extendedStartPos, extendedEndPos);
|
|
233
|
+
}
|
|
164
234
|
|
|
165
235
|
// Check if this is a valid word boundary match (both start and end)
|
|
166
|
-
const isStartBoundary =
|
|
167
|
-
/[\s\p{P}]/.test(originalText[
|
|
168
|
-
!/[\w]/.test(originalText[
|
|
236
|
+
const isStartBoundary = extendedStartPos === 0 ||
|
|
237
|
+
/[\s\p{P}]/.test(originalText[extendedStartPos - 1]) ||
|
|
238
|
+
!/[\w]/.test(originalText[extendedStartPos - 1]);
|
|
169
239
|
|
|
170
|
-
const isEndBoundary =
|
|
171
|
-
/[\s\p{P}]/.test(originalText[
|
|
172
|
-
!/[\w]/.test(originalText[
|
|
240
|
+
const isEndBoundary = extendedEndPos >= originalText.length ||
|
|
241
|
+
/[\s\p{P}]/.test(originalText[extendedEndPos]) ||
|
|
242
|
+
!/[\w]/.test(originalText[extendedEndPos]);
|
|
173
243
|
|
|
174
244
|
const isWordBoundary = isStartBoundary && isEndBoundary;
|
|
175
245
|
|
|
@@ -178,8 +248,9 @@ class PrefixTrie {
|
|
|
178
248
|
matches.push({
|
|
179
249
|
term: termData.term,
|
|
180
250
|
articles: termData.articles,
|
|
181
|
-
matchedText: originalMatchedText, // Use the
|
|
182
|
-
length:
|
|
251
|
+
matchedText: originalMatchedText, // Use the extended matched text
|
|
252
|
+
length: originalMatchedText.length, // Use extended length
|
|
253
|
+
originalLength: matchLength, // Keep track of original match length for advancement
|
|
183
254
|
priority: termData.priority,
|
|
184
255
|
isExactCase: isExactCase
|
|
185
256
|
});
|
|
@@ -283,9 +354,11 @@ function findMatches(verseText, termTrie) {
|
|
|
283
354
|
priority: bestMatch.priority
|
|
284
355
|
});
|
|
285
356
|
|
|
286
|
-
// Move past the matched text
|
|
287
|
-
|
|
288
|
-
|
|
357
|
+
// Move past only the original matched text (not the extended part)
|
|
358
|
+
// This allows finding additional matches within the extended portion
|
|
359
|
+
const advanceBy = bestMatch.originalLength || bestMatch.length;
|
|
360
|
+
processedText += normalizedText.substring(currentPos, currentPos + advanceBy);
|
|
361
|
+
currentPos += advanceBy;
|
|
289
362
|
} else {
|
|
290
363
|
// No match found, move to next character/word boundary
|
|
291
364
|
const nextWordBoundary = normalizedText.substring(currentPos).search(/[\s\p{P}]/u);
|
|
@@ -427,3 +500,12 @@ export function generateTWLMatches(twTerms, verses) {
|
|
|
427
500
|
|
|
428
501
|
return tsvRows.join('\n');
|
|
429
502
|
}
|
|
503
|
+
|
|
504
|
+
// Expose lightweight building and scanning APIs for reuse
|
|
505
|
+
export function buildTermTrie(twTerms) {
|
|
506
|
+
return createOptimizedTermMap(twTerms);
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
export function scanVerseMatches(verseText, termTrie) {
|
|
510
|
+
return findMatches(verseText, termTrie);
|
|
511
|
+
}
|
package/src/utils/usfm-alignment-remover.js
CHANGED
|
@@ -69,10 +69,12 @@ export const removeAllTagsExceptChapterVerse = (usfmContent) => {
|
|
|
69
69
|
* @return {Promise<Object>} - Object with chapters and verses
|
|
70
70
|
*/
|
|
71
71
|
export async function processUsfmForBook(book) {
|
|
72
|
-
|
|
72
|
+
// Normalize book key to lowercase to match BibleBookData keys
|
|
73
|
+
const key = String(book || '').toLowerCase();
|
|
74
|
+
if (!BibleBookData[key]) throw new Error(`Unknown book: ${book}`);
|
|
73
75
|
|
|
74
76
|
const fetch = await getFetch();
|
|
75
|
-
const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[book].usfm}.usfm?ref=master`;
|
|
77
|
+
const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[key].usfm}.usfm?ref=master`;
|
|
76
78
|
const usfmRes = await fetch(usfmUrl);
|
|
77
79
|
if (!usfmRes.ok) throw new Error(`Failed to download USFM file for ${book}`);
|
|
78
80
|
const usfmData = await usfmRes.json();
|
|
@@ -120,4 +122,4 @@ export function parseUsfmToVerses(usfm) {
|
|
|
120
122
|
}
|
|
121
123
|
|
|
122
124
|
return versesObj;
|
|
123
|
-
}
|
|
125
|
+
}
|