twl-generator 1.2.15 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/README.md +282 -57
  2. package/package.json +5 -2
  3. package/src/cli.js +72 -74
  4. package/src/index.js +807 -27
package/src/index.js CHANGED
@@ -1,32 +1,812 @@
1
- // Main module for twl-generator
2
- import { generateTWTerms } from './utils/zipProcessor.js';
3
- import { processUsfmForBook, parseUsfmToVerses, removeAllTagsExceptChapterVerse } from './utils/usfm-alignment-remover.js';
4
- import { generateTWLMatches } from './utils/twl-matcher.js';
5
-
6
- export { generateTWTerms, processUsfmForBook };
7
-
8
- /**
9
- * Main function that processes both TW articles and USFM file
10
- * @param {string} book - The book identifier (optional if usfmContent is provided)
11
- * @param {string} usfmContent - Optional USFM content to process instead of fetching
12
- * @return {Promise<string>} - TSV string
13
- */
14
- export async function generateTWLWithUsfm(book, usfmContent = null) {
15
- // Generate TW terms (with caching)
16
- const terms = await generateTWTerms();
17
-
18
- let verses;
19
- if (usfmContent) {
20
- // Parse provided USFM content (clean it first)
21
- const cleanUsfm = removeAllTagsExceptChapterVerse(usfmContent);
22
- verses = parseUsfmToVerses(cleanUsfm);
1
+ import { BibleBookData } from './common/books.js';
2
+
3
+ const isBrowser = typeof window !== 'undefined';
4
+ const TW_JSON_URL = new URL('../tw_strongs_list.json', import.meta.url);
5
+
6
/**
 * Build a lookup from upper-cased book code to `{ usfm, testament }` using the
 * locally imported `BibleBookData` table.
 *
 * @returns {Promise<Object>} Resolves to an object keyed by upper-case book code.
 */
async function readBooks() {
  return Object.entries(BibleBookData).reduce((lookup, [bookId, info]) => {
    lookup[bookId.toUpperCase()] = { usfm: info.usfm, testament: info.testament };
    return lookup;
  }, {});
}
14
+
15
/**
 * Look up a book's metadata by code, ignoring letter case.
 *
 * @param {Object} bookMap - Map of book code -> { usfm, testament }.
 * @param {string} code - Book code to look up (any letter case).
 * @returns {Object|null} `{ key, ...meta }` where `key` is the matching key of
 *   `bookMap`, or `null` when the code is missing, not a string, unknown, or
 *   its entry lacks a `usfm` or `testament` value.
 */
function findBookMeta(bookMap, code) {
  // A missing or non-string code is simply "not found" rather than a TypeError.
  if (!bookMap || typeof code !== 'string') return null;
  const wanted = code.toLowerCase();
  const key = Object.keys(bookMap).find((k) => k.toLowerCase() === wanted);
  if (!key) return null;
  const meta = bookMap[key];
  if (!meta || !meta.usfm || !meta.testament) return null;
  return { key, ...meta };
}
22
+
23
/**
 * Download one book's USFM source from git.door43.org and return it as text.
 *
 * The repository is `hbo_uhb` when `testament` is 'old' and `el-x-koine_ugnt`
 * otherwise. The response JSON's `content` field is decoded as base64 into a
 * UTF-8 string; a missing `content` yields an empty string.
 *
 * @param {string} usfmCode - File name (without `.usfm`) inside the repository.
 * @param {string} testament - 'old' selects the Hebrew repository.
 * @returns {Promise<string>} Decoded USFM text.
 * @throws {Error} When the HTTP response is not OK.
 */
async function fetchUsfm(usfmCode, testament) {
  const repoName = testament === 'old' ? 'hbo_uhb' : 'el-x-koine_ugnt';
  const response = await fetch(`https://git.door43.org/api/v1/repos/unfoldingWord/${repoName}/contents/${usfmCode}.usfm`);
  if (!response.ok) {
    throw new Error(`Failed to fetch USFM: ${response.status} ${response.statusText}`);
  }
  const payload = await response.json();
  const encoded = payload.content || '';

  if (!isBrowser) {
    // Node.js: decode through Buffer
    const { Buffer } = await import('node:buffer');
    return Buffer.from(encoded, 'base64').toString('utf8');
  }

  // Browser: atob gives a binary string; turn it into bytes, then decode as UTF-8
  const binaryString = atob(encoded);
  const octets = Uint8Array.from(binaryString, (ch) => ch.charCodeAt(0));
  return new TextDecoder('utf-8').decode(octets);
}
44
+
45
/**
 * Invert the article -> Strong's data into a Strong's -> articles lookup.
 *
 * The returned plain object maps every Strong's id found in an article's
 * `strongs` lists (both as written and in base form, i.e. without a trailing
 * a-f letter) to the array of articles that list it. It additionally carries
 * a `__seqFirst` property: base form of a sequence's first id -> array of
 * `{ article, seqBase, len }` for every multi-id sequence, longest first.
 *
 * @param {Object} twMap - article -> value with an optional `strongs` array of arrays.
 * @returns {Object} Strong's -> articles lookup plus the `__seqFirst` index.
 */
function pivotByStrong(twMap) {
  const baseOf = (sid) => {
    const parsed = /^([HG])(\d+)([a-f])?$/i.exec(String(sid || ''));
    return parsed ? `${parsed[1].toUpperCase()}${parsed[2]}` : '';
  };

  const articlesByStrong = new Map();
  const sequencesByFirst = new Map();

  const link = (strong, article) => {
    if (!strong) return;
    let bucket = articlesByStrong.get(strong);
    if (!bucket) {
      bucket = new Set();
      articlesByStrong.set(strong, bucket);
    }
    bucket.add(article);
  };

  Object.entries(twMap).forEach(([article, entry]) => {
    const strongLists = entry && Array.isArray(entry.strongs) ? entry.strongs : [];
    for (const group of strongLists) {
      if (!Array.isArray(group)) continue;
      const ids = group.filter(Boolean);
      if (ids.length === 0) continue;

      // every id points at the article, both verbatim and in base form
      ids.forEach((sid) => {
        link(sid, article);
        link(baseOf(sid), article);
      });

      // sequences of two or more ids are indexed by the base of their first id
      if (ids.length < 2) continue;
      const head = baseOf(ids[0]);
      if (!head) continue;
      if (!sequencesByFirst.has(head)) sequencesByFirst.set(head, []);
      sequencesByFirst.get(head).push({
        article,
        seqBase: ids.map((sid) => baseOf(sid)),
        len: ids.length,
      });
    }
  });

  // flatten the Map/Set structures into plain objects and arrays
  const pivot = {};
  articlesByStrong.forEach((articles, strong) => {
    pivot[strong] = [...articles];
  });
  const seqIndex = {};
  sequencesByFirst.forEach((entries, head) => {
    seqIndex[head] = [...entries].sort((x, y) => y.len - x.len);
  });
  pivot.__seqFirst = seqIndex;
  return pivot;
}
90
+
91
/**
 * Walk USFM text once and collect its `\w ...\w*` word tokens, each tagged with
 * the chapter and verse most recently seen before it.
 *
 * @param {string} usfm - USFM source text.
 * @returns {Array<{c: number, v: number, surface: string, attrs: string}>}
 *   Tokens in document order. `surface` is the word text before the `|`,
 *   `attrs` is the raw attribute text after it ('' when the word has none).
 *   `c` / `v` stay 0 until the first `\c` / `\v` marker has been seen.
 */
function parseWTokens(usfm) {
  const out = [];
  let curC = 0;
  let curV = 0;
  // One global pass over three alternatives: \c N, \v N, or a \w word.
  // The surface text may not contain a backslash and the |attributes part is
  // optional, so a `\w word\w*` without attributes becomes its own token
  // instead of running on through later markers up to the next `|`.
  const re = /(\\c\s+(\d+))|(\\v\s+(\d+))|\\w\s+([^|\\\s][^|\\]*?)(?:\|([^\\]*?))?\\w\*/g;
  let m;
  while ((m = re.exec(usfm))) {
    if (m[2]) { curC = Number.parseInt(m[2], 10); continue; }
    if (m[4]) { curV = Number.parseInt(m[4], 10); continue; }
    if (m[5]) {
      out.push({ c: curC, v: curV, surface: m[5], attrs: m[6] || '' });
    }
  }
  return out;
}
108
+
109
/**
 * Pull the Strong's ids out of a word's raw attribute text.
 *
 * Reads the first `strong="..."` (or `x-strong="..."`) attribute, splits its
 * value on whitespace and `|`, keeps only the part after the last `:` of each
 * piece, and returns the pieces that look like H/G + digits + optional a-f.
 *
 * @param {string} attrText - Attribute text of a `\w` token; null/undefined is
 *   treated as empty.
 * @returns {string[]} Normalised ids (upper-case H/G prefix, lower-case letter
 *   suffix), in the order they appear; [] when there is no strong attribute.
 */
function extractStrongIds(attrText) {
  const found = String(attrText || '').match(/(?:x-)?strong="([^"]+)"/);
  if (!found) return [];
  const ids = [];
  for (const piece of found[1].split(/[\s|]+/)) {
    if (!piece) continue;
    // prefixes are colon-separated (e.g. "b:H7225"); the id is the last segment
    const core = piece.split(':').pop().trim();
    const m = core.match(/^([HG])(\d+)([a-f]?)$/i);
    if (!m) continue;
    ids.push(`${m[1].toUpperCase()}${m[2]}${(m[3] || '').toLowerCase()}`);
  }
  return ids;
}
28
122
 
29
- // Generate TWL matches and return TSV
30
- const tsv = generateTWLMatches(terms, verses);
123
/**
 * Build the first-pass TSV (Reference, ID, Tags, OrigWords, Occurrence, TWLink)
 * from the `\w` tokens of a book.
 *
 * At each token the longest multi-Strong's sequence from
 * `strongPivot.__seqFirst` that matches consecutive tokens of the same verse is
 * tried first and emitted as one row; otherwise one row is emitted per
 * Strong's id of the token that has at least one article, linking to the first
 * article listed for that id.
 *
 * @param {string} usfm - USFM source text of the book.
 * @param {Object} strongPivot - Strong's -> articles lookup with a `__seqFirst` index.
 * @param {string} bookCode - Not used by this function; kept so the signature
 *   stays the same for callers.
 * @returns {string} Tab-separated text: header row, then one row per link.
 */
function buildInitialTsv(usfm, strongPivot, bookCode) {
  const tokens = parseWTokens(usfm).map((t) => ({ ...t, sids: extractStrongIds(t.attrs) }));
  const rows = [];
  // `${c}:${v}` -> Map(quote text -> number of times seen so far in that verse)
  const occMap = new Map();
  const seqFirst = strongPivot.__seqFirst || {};

  const toBase = (sid) => {
    const m = String(sid || '').match(/^([HG])(\d+)([a-f])?$/i);
    return m ? `${m[1].toUpperCase()}${m[2]}` : '';
  };
  // Articles for an id, falling back to the id without its letter suffix.
  const getArts = (sid) => {
    let arts = strongPivot[sid];
    if ((!arts || !arts.length) && /^(H|G)\d+[a-f]$/.test(sid)) {
      arts = strongPivot[sid.slice(0, -1)];
    }
    return arts;
  };
  const tokenHasSid = (tok, sidBase) => {
    if (!sidBase) return false;
    return (tok.sids || []).some((s) => toBase(s) === sidBase);
  };
  const tagOf = (article) => {
    if (article.startsWith('kt/')) return 'kt';
    if (article.startsWith('names/')) return 'names';
    return '';
  };
  const nextOccurrence = (counts, quote) => {
    const n = (counts.get(quote) || 0) + 1;
    counts.set(quote, n);
    return n;
  };

  let i = 0;
  while (i < tokens.length) {
    const t = tokens[i];
    // tokens seen before the first chapter/verse marker carry 0 and are skipped
    if (!t.c || !t.v) { i++; continue; }
    const ref = `${t.c}:${t.v}`;
    if (!occMap.has(ref)) occMap.set(ref, new Map());
    const cvMap = occMap.get(ref);

    // Longest multi-Strong's sequence starting here, confined to this verse.
    let bestSeq = null;
    for (const firstBase of (t.sids || []).map(toBase).filter(Boolean)) {
      for (const cand of seqFirst[firstBase] || []) {
        const fits = cand.seqBase.every((base, k) => {
          const tt = tokens[i + k];
          return Boolean(tt) && tt.c === t.c && tt.v === t.v && tokenHasSid(tt, base);
        });
        if (fits && (!bestSeq || cand.len > bestSeq.len)) bestSeq = cand;
      }
    }

    if (bestSeq) {
      const consumed = tokens.slice(i, i + bestSeq.len);
      const phrase = consumed.map((x) => x.surface.trim()).join(' ');
      const cur = nextOccurrence(cvMap, phrase);
      // The words inside the phrase are occurrences of those words in the
      // verse too; count them so that a later standalone repeat of one of
      // them does not get an occurrence number that is too low.
      for (const tok of consumed) nextOccurrence(cvMap, tok.surface.trim());
      // ID is the first Strong's of the starting token; the link is the sequence's article
      const firstSid = (t.sids && t.sids[0]) ? t.sids[0] : bestSeq.seqBase[0];
      const art = bestSeq.article;
      rows.push([ref, firstSid, tagOf(art), phrase, String(cur), `rc://*/tw/dict/bible/${art}`]);
      i += bestSeq.len; // skip consumed tokens
      continue;
    }

    // Single-token fallback. The occurrence is counted even when the token
    // has no Strong's id, so later repeats of the word are numbered correctly.
    const normSurface = t.surface.trim();
    const cur = nextOccurrence(cvMap, normSurface);
    for (const sid of t.sids || []) {
      const arts = getArts(sid);
      if (!arts || !arts.length) continue;
      const first = arts[0];
      rows.push([ref, sid, tagOf(first), normSurface, String(cur), `rc://*/tw/dict/bible/${first}`]);
    }
    i++;
  }

  const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink'];
  const tsv = [header.join('\t'), ...rows.map((r) => r.join('\t'))].join('\n');
  return tsv;
}
212
+
213
/**
 * Load and parse `tw_strongs_list.json`.
 *
 * In a browser the file is fetched from `/tw_strongs_list.json`; otherwise it
 * is read from disk at the location given by `TW_JSON_URL`.
 *
 * @returns {Promise<*>} The parsed JSON content.
 * @throws {Error} In the browser, when the HTTP response is not OK.
 */
async function loadTwJsonLocal() {
  if (!isBrowser) {
    // Node.js: read the file next to the package
    const { readFile } = await import('node:fs/promises');
    const { fileURLToPath } = await import('node:url');
    const text = await readFile(fileURLToPath(TW_JSON_URL), 'utf8');
    return JSON.parse(text);
  }

  // Browser: fetch from the public path
  const response = await fetch('/tw_strongs_list.json');
  if (!response.ok) {
    throw new Error(`Failed to fetch tw_strongs_list.json: ${response.status}`);
  }
  return response.json();
}
229
+
230
/**
 * Build a Map of article -> its search terms, longest term first.
 *
 * Each term from `value.article.terms` has a trailing parenthetical note
 * removed and its whitespace collapsed; empty results are dropped and terms
 * that are equal ignoring case are kept only once (first spelling wins).
 * Entries are `{ orig, lower, ord }`, where `ord` is the position among the
 * de-duplicated terms and breaks ties between terms of equal length.
 *
 * @param {Object} twMap - article -> value with an optional `article.terms` array.
 * @returns {Map<string, Array<{orig: string, lower: string, ord: number}>>}
 */
function buildArticleTermMap(twMap) {
  const normalize = (raw) => String(raw || '')
    .replace(/\s*\([^)]*\)\s*$/, '')
    .replace(/\s+/g, ' ')
    .trim();

  const result = new Map();
  Object.keys(twMap).forEach((article) => {
    const entry = twMap[article];
    const hasTerms = Boolean(entry)
      && Boolean(entry.article)
      && typeof entry.article === 'object'
      && Array.isArray(entry.article.terms);
    const rawTerms = hasTerms ? entry.article.terms : [];

    // first spelling of each lower-cased term wins; Map keeps insertion order
    const byLower = new Map();
    for (const raw of rawTerms) {
      const orig = normalize(raw);
      if (!orig) continue;
      const lower = orig.toLowerCase();
      if (byLower.has(lower)) continue;
      byLower.set(lower, { orig, lower, ord: byLower.size });
    }

    const ranked = Array.from(byLower.values());
    ranked.sort((x, y) => (y.orig.length - x.orig.length) || (x.ord - y.ord));
    result.set(article, ranked);
  });
  return result;
}
261
+
262
/**
 * Build the prioritized candidate article list for a Strong's id and GLQuote.
 *
 * Candidates are the articles listed for `strongId` in `strongPivot`, or for
 * the id without its trailing a-f letter when the full id has none. Order:
 *   1) articles whose slug (text after the last '/') occurs in the lower-cased
 *      GLQuote, longer slugs first;
 *   2) the remaining articles, `kt/` first, then `names/`, then the rest,
 *      each group sorted by slug.
 *
 * @param {string} glq - Gateway-language quote text.
 * @param {string} strongId - Strong's id such as "H7225" or "G2316a".
 * @param {Object} strongPivot - Strong's -> articles lookup.
 * @returns {string[]} Ordered candidate articles; [] when there are none.
 */
function prioritizeArticles(glq, strongId, strongPivot) {
  // Only real arrays count as candidate lists: the same object also holds the
  // non-array `__seqFirst` entry and inherits members such as `toString`.
  const candidatesFor = (id) => {
    const hit = strongPivot ? strongPivot[id] : undefined;
    return Array.isArray(hit) ? hit.slice() : [];
  };

  let candidates = candidatesFor(strongId);
  if (!candidates.length && /^(H|G)\d+[a-f]$/.test(strongId)) {
    candidates = candidatesFor(strongId.slice(0, -1));
  }
  if (!candidates.length) return [];

  const text = String(glq || '').toLowerCase();
  const slugOf = (art) => (art.includes('/') ? art.split('/').pop() : art).toLowerCase();
  const groupRank = (art) => {
    if (art.startsWith('kt/')) return 0;
    if (art.startsWith('names/')) return 1;
    return 2;
  };

  // 1) slug appears in the quote, longer slug first
  const slugMatched = candidates
    .filter((art) => text.includes(slugOf(art)))
    .sort((a, b) => slugOf(b).length - slugOf(a).length);
  const matched = new Set(slugMatched);

  // 2) everything else, grouped by kind and then by slug
  const rest = candidates
    .filter((art) => !matched.has(art))
    .sort((a, b) => (groupRank(a) - groupRank(b)) || slugOf(a).localeCompare(slugOf(b)));

  return slugMatched.concat(rest);
}
291
+
292
// Irregular English plurals keyed by lower-case singular. A Map is used so a
// word such as "constructor" can never resolve to an inherited object member.
const TWL_IRREGULAR_PLURALS = new Map([
  ['man', 'men'], ['woman', 'women'], ['person', 'people'], ['child', 'children'],
  ['foot', 'feet'], ['tooth', 'teeth'], ['goose', 'geese'], ['mouse', 'mice'], ['ox', 'oxen'],
]);

// Small curated table of irregular verbs: base form -> other forms.
const TWL_IRREGULAR_VERBS = new Map([
  ['be', ['am', 'is', 'are', 'was', 'were', 'been', 'being', 'be']],
  ['do', ['did', 'done', 'doing', 'does']],
  ['go', ['went', 'gone', 'going', 'goes']],
  ['have', ['had', 'having', 'has']],
  ['say', ['said', 'saying', 'says']],
  ['see', ['saw', 'seen', 'seeing', 'sees']],
  ['get', ['got', 'gotten', 'getting', 'gets']],
  ['make', ['made', 'making', 'makes']],
  ['take', ['took', 'taken', 'taking', 'takes']],
  ['come', ['came', 'coming', 'comes']],
  ['know', ['knew', 'known', 'knowing', 'knows']],
  ['give', ['gave', 'given', 'giving', 'gives']],
  ['find', ['found', 'finding', 'finds']],
  ['think', ['thought', 'thinking', 'thinks']],
  ['tell', ['told', 'telling', 'tells']],
  ['become', ['became', 'become', 'becoming', 'becomes']],
  ['show', ['showed', 'shown', 'showing', 'shows']],
  ['leave', ['left', 'leaving', 'leaves']],
  ['feel', ['felt', 'feeling', 'feels']],
  ['put', ['put', 'putting', 'puts']],
  ['bring', ['brought', 'bringing', 'brings']],
  ['begin', ['began', 'begun', 'beginning', 'begins']],
  ['keep', ['kept', 'keeping', 'keeps']],
  ['hold', ['held', 'holding', 'holds']],
  ['write', ['wrote', 'written', 'writing', 'writes']],
  ['stand', ['stood', 'standing', 'stands']],
  ['hear', ['heard', 'hearing', 'hears']],
  ['let', ['let', 'letting', 'lets']],
  ['mean', ['meant', 'meaning', 'means']],
  ['set', ['set', 'setting', 'sets']],
  ['meet', ['met', 'meeting', 'meets']],
  ['run', ['ran', 'running', 'runs']],
  ['pay', ['paid', 'paying', 'pays']],
  ['sit', ['sat', 'sitting', 'sits']],
  ['speak', ['spoke', 'spoken', 'speaking', 'speaks']],
  ['lie', ['lay', 'lain', 'lying', 'lies']],
  ['lead', ['led', 'leading', 'leads']],
  ['read', ['read', 'reading', 'reads']],
  ['grow', ['grew', 'grown', 'growing', 'grows']],
  ['fall', ['fell', 'fallen', 'falling', 'falls']],
  ['send', ['sent', 'sending', 'sends']],
  ['build', ['built', 'building', 'builds']],
  ['understand', ['understood', 'understanding', 'understands']],
  ['draw', ['drew', 'drawn', 'drawing', 'draws']],
  ['break', ['broke', 'broken', 'breaking', 'breaks']],
  ['spend', ['spent', 'spending', 'spends']],
  ['cut', ['cut', 'cutting', 'cuts']],
  ['rise', ['rose', 'risen', 'rising', 'rises']],
  ['drive', ['drove', 'driven', 'driving', 'drives']],
  ['buy', ['bought', 'buying', 'buys']],
  ['wear', ['wore', 'worn', 'wearing', 'wears']],
  ['swear', ['swore', 'sworn', 'swearing', 'swears']],
  ['drink', ['drank', 'drunk', 'drinking', 'drinks']],
  ['eat', ['ate', 'eaten', 'eating', 'eats']],
  ['choose', ['chose', 'chosen', 'choosing', 'chooses']],
]);

// Reverse lookup: any listed form (or the base itself), lower-cased -> base form.
const TWL_IRREGULAR_VERB_BASE = (() => {
  const lookup = new Map();
  for (const [base, forms] of TWL_IRREGULAR_VERBS) {
    lookup.set(base.toLowerCase(), base);
    for (const f of forms) lookup.set(String(f).toLowerCase(), base);
  }
  return lookup;
})();

/**
 * Choose the article whose terms best match a gateway-language quote.
 *
 * Every candidate article (from `prioritizeArticles`) is given the earliest
 * stage at which one of its terms matches the quote:
 *   1. case-sensitive, word-bounded match of the term or a plural / irregular
 *      verb form / (optional) compromise conjugation of it;
 *   2. the same, case-insensitive;
 *   3. case-sensitive substring match of the term itself;
 *   4. case-insensitive substring match of stripped forms of the term's last word.
 * The article with the lowest stage wins; ties go to the article that comes
 * first in the prioritized list.
 *
 * @param {string} glq - Gateway-language quote text.
 * @param {string} strongId - Strong's id of the row.
 * @param {Object} strongPivot - Strong's -> articles lookup.
 * @param {Map<string, Array<{orig: string}>>} termMap - article -> terms.
 * @param {Object} [opts]
 * @param {boolean} [opts.useCompromise] - Enable conjugations via `opts.nlp`.
 * @param {Function} [opts.nlp] - NLP entry point called as `nlp(word)`;
 *   presumably the `compromise` library — confirm against the caller.
 * @returns {{article: string, disamb: string, variantTerm: string}|null}
 *   `disamb` lists every matched article as "(a, b)" when more than one
 *   matched; `variantTerm` is the matched term when the match was only a
 *   substring match (stage 3 or 4) and no -ing/-ed form of it matches on word
 *   boundaries. Returns `null` when there are no candidates or nothing matched.
 */
function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, opts = {}) {
  const useCompromise = !!opts.useCompromise;
  const nlp = opts.nlp;
  const prioritized = prioritizeArticles(glq, strongId, strongPivot);
  if (!prioritized.length) return null;

  const textOrig = String(glq || '');
  const textLower = textOrig.toLowerCase();
  const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const matchesOnWordBoundary = (phrase, flags) =>
    new RegExp(`\\b${escapeRegExp(phrase)}\\b`, flags).test(textOrig);

  // The same term is expanded at several stages (and may be shared between
  // articles), so each expansion is computed once per call.
  const memoize = (fn) => {
    const cache = new Map();
    return (term) => {
      if (!cache.has(term)) cache.set(term, fn(term));
      return cache.get(term);
    };
  };

  // Split a term into head (all but the last word) and last word.
  const splitHeadLast = (term) => {
    const parts = String(term || '').trim().split(/\s+/);
    if (parts.length <= 1) return { head: '', last: parts[0] || '' };
    const last = parts.pop();
    return { head: parts.join(' '), last };
  };

  // Basic English pluralisation of a single word.
  const pluralizeWord = (w) => {
    const irregular = TWL_IRREGULAR_PLURALS.get(w.toLowerCase());
    if (irregular) return irregular;
    if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ies');
    if (/(s|x|z|ch|sh)$/i.test(w)) return w + 'es';
    if (/f$/i.test(w) && !/(roof|belief|chief|proof)$/i.test(w)) return w.replace(/f$/i, 'ves');
    if (/fe$/i.test(w)) return w.replace(/fe$/i, 'ves');
    if (/o$/i.test(w)) return w + 'es';
    return w + 's';
  };
  // Plural variants of a term: the last word pluralised, plus the plain "+s" fallback.
  const pluralizeTerm = memoize((term) => {
    const out = new Set();
    const add = (s) => { const v = s.trim(); if (v) out.add(v); };
    const parts = term.split(/\s+/);
    if (parts.length === 1) {
      add(pluralizeWord(term));
    } else {
      const last = parts.pop();
      add([...parts, pluralizeWord(last)].join(' '));
    }
    add(term + 's');
    return Array.from(out);
  });

  // Helpers to form -ing and -ed variants of a single word.
  const isVowel = (ch) => /[aeiou]/i.test(ch);
  const isConsonant = (ch) => /[a-z]/i.test(ch) && !isVowel(ch);
  const endsWithCVC = (w) => {
    if (w.length < 3) return false;
    const a = w[w.length - 3];
    const b = w[w.length - 2];
    const c = w[w.length - 1];
    if (!isConsonant(a) || !isVowel(b) || !isConsonant(c)) return false;
    // don't double for w, x, y
    return !/[wxy]/i.test(c);
  };
  const presentParticipleWord = (w) => {
    if (/ie$/i.test(w)) return w.replace(/ie$/i, 'ying'); // tie -> tying
    if (/ee$/i.test(w)) return w + 'ing'; // see -> seeing
    if (/e$/i.test(w)) return w.replace(/e$/i, 'ing'); // make -> making
    if (endsWithCVC(w)) return w + w[w.length - 1] + 'ing'; // run -> running
    return w + 'ing';
  };
  const pastTenseWord = (w) => {
    if (/e$/i.test(w)) return w + 'd'; // move -> moved
    if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ied'); // carry -> carried
    if (endsWithCVC(w)) return w + w[w.length - 1] + 'ed'; // stop -> stopped
    return w + 'ed';
  };
  const ingEdFormsForTerm = (term) => {
    const parts = term.split(/\s+/);
    if (parts.length === 1) {
      return Array.from(new Set([presentParticipleWord(term), pastTenseWord(term)]));
    }
    const last = parts.pop();
    const base = parts.join(' ');
    const prefix = base ? base + ' ' : '';
    return Array.from(new Set([prefix + presentParticipleWord(last), prefix + pastTenseWord(last)]));
  };

  // Full-term variants where only the last word is replaced by its irregular verb forms.
  const irregularFormsForTerm = memoize((term) => {
    const { head, last } = splitHeadLast(term);
    const baseKey = TWL_IRREGULAR_VERB_BASE.get(String(last).toLowerCase());
    const acc = new Set();
    if (baseKey) {
      const prefix = head ? head + ' ' : '';
      acc.add(prefix + baseKey);
      for (const f of TWL_IRREGULAR_VERBS.get(baseKey) || []) acc.add(prefix + f);
    }
    return Array.from(acc);
  });

  // Conjugations of the last word from the NLP library; empty unless enabled.
  const conjugationsForTerm = memoize((term) => {
    const forms = new Set();
    if (!useCompromise || !nlp) return Array.from(forms);
    const { head, last } = splitHeadLast(term);
    const verbs = nlp(last).verbs();
    if (!verbs.found) return Array.from(forms);
    const prefix = head ? head + ' ' : '';
    for (const c of verbs.conjugate() || []) {
      for (const k of ['PastTense', 'PresentTense', 'Infinitive', 'Gerund', 'Participle']) {
        const v = c[k];
        if (Array.isArray(v)) v.forEach((x) => x && forms.add(prefix + String(x)));
        else if (v) forms.add(prefix + String(v));
      }
    }
    return Array.from(forms);
  });

  // Everything tried on word boundaries for a term at stages 1 and 2.
  const boundedVariantsFor = memoize((term) => new Set([
    term,
    ...pluralizeTerm(term),
    ...irregularFormsForTerm(term),
    ...conjugationsForTerm(term),
  ]));

  // Stage 4 forms: lower-cased term with common endings stripped from its last
  // word; conjugated / irregular forms only lose a final y or e. Forms shorter
  // than three characters are ignored.
  const strippedForms = (base) => {
    const { head, last } = splitHeadLast(base);
    const prefix = head ? head + ' ' : '';
    const forms = new Set();
    const addIf = (s) => {
      const v = String(s || '').trim().toLowerCase();
      if (v && v.length >= 3) forms.add(v);
    };
    const addWithYE = (w) => {
      const lw = String(w || '').toLowerCase();
      if (!lw) return '';
      addIf(prefix + lw);
      if (/y$/i.test(lw)) addIf(prefix + lw.slice(0, -1));
      if (/e$/i.test(lw)) addIf(prefix + lw.slice(0, -1));
      return lw;
    };
    const lw = addWithYE(last);
    if (lw) {
      if (/ing$/i.test(lw)) addIf(prefix + lw.slice(0, -3));
      if (/ed$/i.test(lw)) addIf(prefix + lw.slice(0, -2));
      if (/es$/i.test(lw)) addIf(prefix + lw.slice(0, -2));
      if (/s$/i.test(lw) && !/ss$/i.test(lw)) addIf(prefix + lw.slice(0, -1));
    }
    for (const variant of [...conjugationsForTerm(base), ...irregularFormsForTerm(base)]) {
      const { head: h2, last: l2 } = splitHeadLast(variant);
      // only consider variants that kept the same head
      if ((h2 || '') === (head || '')) addWithYE(l2);
    }
    return Array.from(forms);
  };

  // Earliest stage (1-4) at which any of the given terms matches, or null.
  const earliestMatch = (terms) => {
    // Stages 1 and 2: word-bounded, case-sensitive first, then case-insensitive.
    for (const [stage, flags] of [[1, ''], [2, 'i']]) {
      for (const { orig } of terms) {
        for (const alt of boundedVariantsFor(orig)) {
          if (matchesOnWordBoundary(alt, flags)) return { stage, termHit: orig };
        }
      }
    }
    // Stage 3: case-sensitive substring.
    for (const { orig } of terms) {
      if (orig && textOrig.includes(orig)) return { stage: 3, termHit: orig };
    }
    // Stage 4: case-insensitive substring on stripped forms.
    for (const { orig } of terms) {
      for (const f of strippedForms(orig)) {
        if (f && textLower.includes(f)) return { stage: 4, termHit: orig };
      }
    }
    return null;
  };

  const perArticleMatches = [];
  for (const art of prioritized) {
    const hit = earliestMatch(termMap.get(art) || []);
    if (hit) perArticleMatches.push({ art, stage: hit.stage, termHit: hit.termHit });
  }
  if (!perArticleMatches.length) return null;

  // Matches are collected in prioritized order, so the first one with the
  // best stage is also the highest-priority one.
  const bestStage = perArticleMatches.reduce((best, m) => Math.min(best, m.stage), Infinity);
  const chosenMatch = perArticleMatches.find((m) => m.stage === bestStage);

  // Disambiguation: list all matched articles
  const matchedArticles = perArticleMatches.map((m) => m.art);
  const disamb = matchedArticles.length > 1 ? `(${matchedArticles.join(', ')})` : '';

  // A substring-only match (stage 3 or 4) is reported as a variant of the
  // matched term, unless an -ing / -ed form of that term matches on word
  // boundaries. Reaching stage 3 already means no term of the article, nor
  // any plural or irregular form of one, matched on word boundaries (that is
  // stage 2), so those cases need no second check here.
  let variantTerm = chosenMatch.stage >= 3 ? chosenMatch.termHit : '';
  if (variantTerm) {
    const inflected = ingEdFormsForTerm(chosenMatch.termHit || '');
    if (inflected.some((form) => matchesOnWordBoundary(form, 'i'))) variantTerm = '';
  }

  return { article: chosenMatch.art, disamb, variantTerm };
}
642
+
643
/**
 * Generate the translation-word-links TSV for one Bible book.
 *
 * Steps: build the initial TSV from the original-language USFM, add GLQuote /
 * GLOccurrence columns, convert those quotes back to original-language quotes,
 * give every row a fresh random 4-character ID (the Strong's id moves to a
 * `Strongs` column), then pick the TWLink per row from the GLQuote text.
 * A summary (and up to 8 unmatched samples) is written with `console.log`.
 *
 * @param {string} bookCode - Book code, matched case-insensitively.
 * @param {Object} [options]
 * @param {boolean} [options.useCompromise] - Load `compromise` and use it for
 *   verb conjugations while matching.
 * @returns {Promise<{matchedTsv: string, noMatchTsv: string}>} `matchedTsv`
 *   has the base columns plus `Variant of` and `Disambiguation`; `noMatchTsv`
 *   holds the rows for which no article matched, plus `Disambiguation`.
 * @throws {Error} On an unknown book code, or when a quote converter returns
 *   no TSV output.
 */
export async function generateTwlByBook(bookCode, options = {}) {
  // Loaded on demand rather than at module load time.
  const { addGLQuoteCols, convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');

  const useCompromise = !!options.useCompromise;
  let nlp = null;
  if (useCompromise) {
    const mod = await import('compromise');
    nlp = mod.default || mod;
  }

  const meta = findBookMeta(await readBooks(), bookCode);
  if (!meta) throw new Error(`Unknown book code: ${bookCode}`);

  // The USFM download and the local JSON load do not depend on each other.
  const [usfm, twJson] = await Promise.all([
    fetchUsfm(meta.usfm, meta.testament),
    loadTwJsonLocal(),
  ]);
  const strongPivot = pivotByStrong(twJson);

  const converterArgs = (tsvContent) => ({
    bibleLinks: ['unfoldingWord/en_ult/master'],
    bookCode: meta.key,
    tsvContent,
    trySeparatorsAndOccurrences: true,
  });
  // Fail with a clear message instead of a TypeError on `.split` further down.
  const outputOf = (result, step) => {
    if (!result || typeof result.output !== 'string') {
      throw new Error(`${step} returned no TSV output for ${meta.key}`);
    }
    return result.output;
  };

  // 1) initial TSV
  const baseTsv = buildInitialTsv(usfm, strongPivot, meta.key);

  // 2) add GLQuote and GLOccurrence
  const withGl = outputOf(await addGLQuoteCols(converterArgs(baseTsv)), 'addGLQuoteCols');

  // 3) Copy GLQuote/GLOccurrence into OrigWords/Occurrence and convert to OL quotes BEFORE matching
  const glLines = withGl.split(/\r?\n/);
  const glHeader = glLines.shift();
  const glCols = glHeader.split('\t');
  const glIdx = {
    OrigWords: glCols.indexOf('OrigWords'),
    Occurrence: glCols.indexOf('Occurrence'),
    GLQuote: glCols.indexOf('GLQuote'),
    GLOccurrence: glCols.indexOf('GLOccurrence'),
  };
  const glRows = glLines.filter(Boolean).map((row) => {
    const c = row.split('\t');
    const newCols = c.slice();
    if (glIdx.GLQuote >= 0) newCols[glIdx.OrigWords] = c[glIdx.GLQuote];
    if (glIdx.GLOccurrence >= 0) newCols[glIdx.Occurrence] = c[glIdx.GLOccurrence];
    return newCols.join('\t');
  });
  const olTsv = outputOf(
    await convertGLQuotes2OLQuotes(converterArgs([glHeader, ...glRows].join('\n'))),
    'convertGLQuotes2OLQuotes',
  );

  // 4) Reorder columns and add Strongs + randomized 4-char IDs before matching
  const olLines = olTsv.split(/\r?\n/);
  const olCols = olLines.shift().split('\t');
  const A = {
    Reference: olCols.indexOf('Reference'),
    ID: olCols.indexOf('ID'),
    Tags: olCols.indexOf('Tags'),
    OrigWords: olCols.indexOf('OrigWords'),
    Occurrence: olCols.indexOf('Occurrence'),
    TWLink: olCols.indexOf('TWLink'),
    GLQuote: olCols.indexOf('GLQuote'),
    GLOccurrence: olCols.indexOf('GLOccurrence'),
  };

  const finalHeaderBase = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'Strongs', 'GLQuote', 'GLOccurrence'];
  // Positions in the prepared rows, matching finalHeaderBase.
  const H = { Reference: 0, Tags: 2, TWLink: 5, Strongs: 6, GLQuote: 7 };

  // Random ID: one letter followed by three letters/digits, unique within this run.
  const usedIds = new Set();
  const letters = 'abcdefghijklmnopqrstuvwxyz';
  const alnum = 'abcdefghijklmnopqrstuvwxyz0123456789';
  const pick = (alphabet) => alphabet[Math.floor(Math.random() * alphabet.length)];
  const genId = () => {
    while (true) {
      const id = pick(letters) + pick(alnum) + pick(alnum) + pick(alnum);
      if (!usedIds.has(id)) {
        usedIds.add(id);
        return id;
      }
    }
  };

  const preparedRows = [];
  for (const ln of olLines) {
    if (!ln.trim()) continue;
    const c = ln.split('\t');
    if (c.length < 7) continue;
    preparedRows.push([
      c[A.Reference],
      genId(),
      c[A.Tags],
      c[A.OrigWords],
      c[A.Occurrence],
      c[A.TWLink],
      c[A.ID], // the incoming ID column holds the Strong's id
      c[A.GLQuote],
      c[A.GLOccurrence],
    ]);
  }

  // 5) pick best TWLink based on GLQuote terms using Strongs column; include Variant of column
  const termMap = buildArticleTermMap(twJson);
  const outRows = [finalHeaderBase.concat(['Variant of', 'Disambiguation']).join('\t')];
  const noMatchRows = [finalHeaderBase.concat(['Disambiguation']).join('\t')];
  let totalRows = 0;
  let droppedRows = 0;
  let multiDisambRows = 0;
  const noMatchSamples = [];

  for (const cols of preparedRows) {
    totalRows++;
    const strongId = cols[H.Strongs];
    const glq = cols[H.GLQuote] || '';
    const result = chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, { useCompromise, nlp });
    if (!result) {
      droppedRows++;
      if (noMatchSamples.length < 8) {
        const ref = cols[H.Reference] || '';
        noMatchSamples.push(`${ref}\t${strongId}\t${glq}`);
      }
      // record which articles were candidates for the unmatched row
      const tried = prioritizeArticles(glq, strongId, strongPivot) || [];
      const disambTried = tried.length ? `(${tried.join(', ')})` : '';
      noMatchRows.push(cols.join('\t') + '\t' + disambTried);
      continue;
    }
    const art = result.article;
    cols[H.TWLink] = `rc://*/tw/dict/bible/${art}`;
    // Tags follow the selected article's prefix
    let tag = '';
    if (art.startsWith('kt/')) tag = 'keyterm';
    else if (art.startsWith('names/')) tag = 'name';
    cols[H.Tags] = tag;
    if (result.disamb) multiDisambRows++;
    const variantOf = result.variantTerm || '';
    outRows.push(cols.join('\t') + '\t' + variantOf + '\t' + (result.disamb || ''));
  }

  const keptRows = totalRows - droppedRows;
  const pct = totalRows ? ((keptRows / totalRows) * 100).toFixed(1) : '0.0';
  console.log(`[TWL] ${bookCode.toUpperCase()}: kept ${keptRows}/${totalRows} (${pct}%), dropped ${droppedRows}, disambiguated ${multiDisambRows}`);
  if (noMatchSamples.length) {
    console.log(`[TWL] ${bookCode.toUpperCase()}: no-match samples (up to 8):`);
    for (const s of noMatchSamples) console.log(` ${s}`);
  }

  const matchedTsv = outRows.join('\n');
  const noMatchTsv = noMatchRows.join('\n');
  return { matchedTsv, noMatchTsv };
}