twl-generator 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,395 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+
4
/**
 * Generate simple English morphological variants of a single term.
 *
 * The rules are purely suffix-based heuristics (plural, possessive, -ed/-ing);
 * they intentionally over-generate, because the results are only used as
 * additional lookup keys. The input term itself is always the first element.
 *
 * @param {string} term - The base term (expected to be a single word).
 * @returns {string[]} Unique variants in insertion order, starting with `term`.
 */
function generateVariants(term) {
  const variants = new Set([term]);

  // Plural <-> singular by adding/removing a trailing "s".
  if (term.endsWith('s') && term.length > 2) {
    variants.add(term.slice(0, -1)); // dogs -> dog
  } else {
    variants.add(term + 's'); // dog -> dogs, horse -> horses
  }

  // "-es" plurals. (Terms ending in "e" already received "+s" above.)
  if (term.endsWith('es') && term.length > 3) {
    variants.add(term.slice(0, -2)); // churches -> church
  } else if (!term.endsWith('e') && /[sxz]$|[cs]h$/.test(term)) {
    variants.add(term + 'es'); // church -> churches
  }

  // consonant + "y" <-> "ies".
  if (term.endsWith('ies') && term.length > 4) {
    variants.add(term.slice(0, -3) + 'y'); // cities -> city
  } else if (term.endsWith('y') && term.length > 2 && !/[aeiou]y$/.test(term)) {
    variants.add(term.slice(0, -1) + 'ies'); // city -> cities
  }

  // Possessive forms: both "'s" and a bare trailing apostrophe.
  variants.add(term + "'s");
  variants.add(term + "'");

  // Strip -ed / -ing to recover a base form (basic).
  if (term.endsWith('ed') && term.length > 3) {
    variants.add(term.slice(0, -2)); // walked -> walk
  }
  if (term.endsWith('ing') && term.length > 4) {
    variants.add(term.slice(0, -3)); // walking -> walk
  }

  // Consonant doubling before -ed/-ing for consonant-vowel-consonant endings.
  // A final w, x or y is never doubled in English (show -> showed, not
  // "showwed"), so those letters are excluded from the final position.
  if (/[bcdfghjklmnpqrstvwxyz][aeiou][bcdfghjklmnpqrstvz]$/.test(term)) {
    const doubled = term + term.slice(-1);
    variants.add(doubled + 'ed'); // stop -> stopped
    variants.add(doubled + 'ing'); // stop -> stopping
  }

  // Regular -ed/-ing; a silent trailing "e" is dropped first.
  const stem = term.endsWith('e') ? term.slice(0, -1) : term;
  variants.add(stem + 'ed'); // love -> loved, walk -> walked
  variants.add(stem + 'ing'); // love -> loving, walk -> walking

  return Array.from(variants);
}
65
+
66
/**
 * Prefix trie for term lookup with a case-sensitive pass and a
 * case-insensitive fallback.
 *
 * Two tries are maintained: one keyed by the term exactly as inserted and one
 * keyed by its lowercase form. Lookups try the exact-case trie first and only
 * consult the lowercase trie when that yields nothing. Both insertion and
 * lookup index strings by UTF-16 code unit so the two always agree.
 */
class PrefixTrie {
  constructor() {
    this.exactCaseRoot = {}; // For exact case matches
    this.lowerCaseRoot = {}; // For case-insensitive fallback

    // One-entry memo so the same text is not re-lowercased for every start
    // position probed by the caller.
    this._lowerCacheSource = null;
    this._lowerCacheValue = '';
  }

  /**
   * Register a term (or a variant of it) in both tries.
   *
   * @param {string} term - The text to match (original term or a variant).
   * @param {string} originalTerm - The term this entry resolves to.
   * @param {*} articles - Payload stored with the entry and returned on match.
   * @param {boolean} [isOriginal=true] - False for generated variants; they
   *   get a lower sort priority than original terms.
   */
  insert(term, originalTerm, articles, isOriginal = true) {
    this._insertIntoTree(this.exactCaseRoot, term, originalTerm, articles, isOriginal, true);
    this._insertIntoTree(this.lowerCaseRoot, term.toLowerCase(), originalTerm, articles, isOriginal, false);
  }

  /**
   * Walk/create the path for `term` under `root` and append the entry data to
   * the terminal node's `_terms` list.
   */
  _insertIntoTree(root, term, originalTerm, articles, isOriginal, isExactCase) {
    let node = root;

    // Index by UTF-16 code unit (not code point) because lookups read
    // text[pos]; iterating code points here would make terms containing
    // astral characters unreachable.
    for (let i = 0; i < term.length; i++) {
      const char = term[i];
      if (!node[char]) {
        node[char] = {};
      }
      node = node[char];
    }

    if (!node._terms) {
      node._terms = [];
    }

    node._terms.push({
      term: originalTerm,
      articles,
      matchedText: term,
      priority: isOriginal ? 0 : 1,
      isExactCase
    });
  }

  /**
   * Find every registered term that starts at `startPos` in `text`.
   *
   * @param {string} text - The text being scanned.
   * @param {number} startPos - Index in `text` where a match must begin.
   * @returns {Array<{term: string, articles: *, matchedText: string,
   *   length: number, priority: number, isExactCase: boolean}>} Matches sorted
   *   longest first, then originals before variants. Empty if none.
   */
  findMatches(text, startPos) {
    const exactMatches = this._findMatchesInTree(this.exactCaseRoot, text, startPos, true);
    if (exactMatches.length > 0) {
      return exactMatches;
    }

    if (this._lowerCacheSource !== text) {
      this._lowerCacheSource = text;
      this._lowerCacheValue = text.toLowerCase();
    }
    const lowered = this._lowerCacheValue;
    const fallbackMatches = this._findMatchesInTree(this.lowerCaseRoot, lowered, startPos, false);

    // Report the text as it appears in the input, not its lowercased form.
    // Lowercasing can change the string length for a few characters; in that
    // case offsets no longer line up, so the lowercased slice is kept.
    if (lowered.length === text.length) {
      for (const match of fallbackMatches) {
        match.matchedText = text.substring(startPos, startPos + match.length);
      }
    }

    return fallbackMatches;
  }

  /**
   * Collect matches from a single trie, following `text` from `startPos` for
   * as long as the trie has a path.
   */
  _findMatchesInTree(root, text, startPos, isExactCase) {
    // The `u` flag is required: without it `\p{P}` is not a Unicode property
    // escape and the class would match the literal characters p, {, P and }.
    const separator = /[\s\p{P}]/u;
    const wordChar = /\w/;

    const matches = [];
    let node = root;
    let currentPos = startPos;

    while (currentPos < text.length) {
      const next = node[text[currentPos]];
      if (!next) {
        break; // No longer term can match from here
      }

      node = next;
      currentPos++;

      if (!node._terms) {
        continue;
      }

      const matchLength = currentPos - startPos;
      const following = text[currentPos];

      // A term only counts when it ends at the end of the text or right
      // before whitespace, punctuation or any other non-word character.
      const isWordBoundary = currentPos >= text.length ||
        separator.test(following) ||
        !wordChar.test(following);

      // Single-character terms are accepted regardless of what follows.
      if (isWordBoundary || matchLength === 1) {
        const matchedText = text.substring(startPos, currentPos);
        for (const termData of node._terms) {
          matches.push({
            term: termData.term,
            articles: termData.articles,
            matchedText,
            length: matchLength,
            priority: termData.priority,
            isExactCase
          });
        }
      }
    }

    // Longer first, then originals before variants, then exact-case first.
    return matches.sort((a, b) => {
      if (b.length !== a.length) {
        return b.length - a.length;
      }
      if (a.priority !== b.priority) {
        return a.priority - b.priority;
      }
      if (a.isExactCase !== b.isExactCase) {
        return a.isExactCase ? -1 : 1;
      }
      return 0;
    });
  }
}
176
+
177
/**
 * Build the lookup trie for a term dictionary.
 *
 * Every key of `twTerms` is inserted as an original term. Keys without a space
 * additionally get each of their generated variants inserted (flagged as
 * non-original); multi-word keys are skipped for variant generation to keep
 * the trie from growing excessively.
 *
 * @param {Object} twTerms - Map of term -> articles payload.
 * @returns {PrefixTrie} The populated trie.
 */
function createOptimizedTermMap(twTerms) {
  const trie = new PrefixTrie();
  let inserted = 0;

  console.log('Building optimized term map...');

  Object.entries(twTerms).forEach(([originalTerm, articles]) => {
    trie.insert(originalTerm, originalTerm, articles, true);
    inserted += 1;

    const isMultiWord = originalTerm.includes(' ');
    if (isMultiWord) {
      return;
    }

    generateVariants(originalTerm)
      .filter((variant) => variant !== originalTerm)
      .forEach((variant) => {
        trie.insert(variant, originalTerm, articles, false);
        inserted += 1;
      });
  });

  console.log(`Term map built with ${inserted} terms and variants`);
  return trie;
}
206
+
207
/**
 * Scan a verse left to right and collect the best term match at each word.
 *
 * The text is first normalized (long dashes become spaces, curly quotes become
 * straight quotes). At every word start the trie is asked for candidates and
 * the first one (the trie returns them best-first) is taken. When nothing
 * matches, the scanner skips to the next whitespace/punctuation character, or
 * to the end of the text if there is none, so terms are never matched starting
 * in the middle of a word.
 *
 * @param {string} verseText - Raw verse text.
 * @param {{findMatches: function(string, number): Array}} termTrie - Trie (or
 *   any object with a compatible `findMatches(text, pos)` method).
 * @returns {Array<{term: string, articles: *, matchedText: string,
 *   context: string, priority: number}>} Matches in text order; `context` is
 *   the normalized verse with the matched text wrapped in square brackets.
 */
function findMatches(verseText, termTrie) {
  const separator = /[\s\p{P}]/u;
  const matches = [];

  // Unicode escapes are used so the normalization cannot silently degrade
  // into a no-op if the file's quote characters get re-encoded.
  const normalizedText = verseText
    .replace(/[\u2013\u2014\u2015]/g, ' ') // en dash, em dash, horizontal bar
    .replace(/[\u201C\u201D]/g, '"') // curly double quotes
    .replace(/[\u2018\u2019]/g, "'"); // curly single quotes / apostrophe

  const textLength = normalizedText.length;
  let currentPos = 0;
  let processedText = '';

  while (currentPos < textLength) {
    // Skip leading whitespace and punctuation, but stop at apostrophes so
    // they remain available to the matcher (e.g. for words like don't).
    while (
      currentPos < textLength &&
      normalizedText[currentPos] !== "'" &&
      separator.test(normalizedText[currentPos])
    ) {
      processedText += normalizedText[currentPos];
      currentPos++;
    }

    if (currentPos >= textLength) break;

    // Candidates come back sorted best-first (longest, then by priority).
    const candidateMatches = termTrie.findMatches(normalizedText, currentPos);
    const bestMatch = candidateMatches.length > 0 ? candidateMatches[0] : null;

    // A zero-length match would never advance the cursor; treat it as a miss.
    if (bestMatch && bestMatch.length > 0) {
      const matchedText = bestMatch.matchedText;
      const context = processedText + '[' + matchedText + ']' +
        normalizedText.substring(currentPos + bestMatch.length);

      matches.push({
        term: bestMatch.term,
        articles: bestMatch.articles,
        matchedText: matchedText,
        context: context,
        priority: bestMatch.priority
      });

      processedText += matchedText;
      currentPos += bestMatch.length;
    } else {
      // No match: consume at least one character, then everything up to the
      // next separator. If no separator remains this runs to the end of the
      // text, so the final word is skipped as a whole like any other word.
      let nextPos = currentPos + 1;
      while (nextPos < textLength && !separator.test(normalizedText[nextPos])) {
        nextPos++;
      }

      processedText += normalizedText.substring(currentPos, nextPos);
      currentPos = nextPos;
    }
  }

  return matches;
}
267
+
268
/**
 * Produce a random 4-character lowercase hex string whose first character is
 * a letter (a-f). Uses Math.random, so IDs are neither unique nor secure.
 *
 * @returns {string} e.g. "c3f0"
 */
function generateId() {
  const pickFrom = (alphabet) => alphabet[Math.floor(Math.random() * alphabet.length)];
  const hexDigits = '0123456789abcdef';

  const leadingLetter = pickFrom('abcdef');
  const tail = Array.from({ length: 3 }, () => pickFrom(hexDigits));

  return leadingLetter + tail.join('');
}
280
+
281
/**
 * Map an article path to the value used in the Tags column.
 *
 * @param {string} articlePath - Article path such as "kt/god.md".
 * @returns {string} "keyterm" for paths under kt/, "name" for paths under
 *   names/, otherwise an empty string.
 */
function getArticleCategory(articlePath) {
  const tagByPrefix = [
    ['kt/', 'keyterm'],
    ['names/', 'name'],
  ];

  const hit = tagByPrefix.find(([prefix]) => articlePath.startsWith(prefix));
  return hit ? hit[1] : '';
}
289
+
290
/**
 * Build the rc:// link for an article path.
 *
 * Only a trailing ".md" extension is removed; a ".md" occurring elsewhere in
 * the path is left untouched.
 *
 * @param {string} articlePath - Article path such as "kt/god.md".
 * @returns {string} e.g. "rc://*&#47;tw/dict/bible/kt/god" (with a literal "*").
 */
function createTWLink(articlePath) {
  const withoutExtension = articlePath.replace(/\.md$/, '');
  return `rc://*/tw/dict/bible/${withoutExtension}`;
}
296
+
297
/**
 * Build the Disambiguation column value for a match.
 *
 * @param {string[]} articles - Article paths the matched term maps to.
 * @returns {string} An empty string when there is at most one article (or the
 *   input is not an array); otherwise the extension-less paths, sorted and
 *   comma-separated inside parentheses, e.g. "(kt/a, kt/b)".
 */
function createDisambiguation(articles) {
  if (!Array.isArray(articles) || articles.length <= 1) return '';

  // Strip only a trailing ".md"; map() returns a new array, so the sort does
  // not reorder the caller's list.
  const paths = articles
    .map((articlePath) => articlePath.replace(/\.md$/, ''))
    .sort();

  return `(${paths.join(', ')})`;
}
305
+
306
/**
 * Run term matching over every verse and render the result as TSV text.
 *
 * @param {Object} twTerms - Map of term -> list of article paths.
 * @param {Object} verses - Object keyed by chapter, each value an object
 *   keyed by verse holding the verse text.
 * @returns {string} TSV document: a header line followed by one line per
 *   match, lines joined with "\n". Progress is logged to the console.
 */
export function generateTWLMatches(twTerms, verses) {
  const termTrie = createOptimizedTermMap(twTerms);
  const tsvRows = ['Reference\tID\tTags\tOrigWords\tOccurrence\tTWLink\tDisambiguation\tContext'];

  // Total verse count, used only for progress output.
  const totalVerses = Object.values(verses).reduce(
    (sum, chapter) => sum + Object.keys(chapter).length,
    0
  );

  console.log(`Processing ${totalVerses} verses...`);

  // Orders rows by where the bracketed match sits in the context string;
  // rows without a bracket go last.
  const byBracketPosition = (a, b) => {
    const aMissing = a.bracketPosition === -1;
    const bMissing = b.bracketPosition === -1;
    if (aMissing && bMissing) return 0;
    if (aMissing) return 1;
    if (bMissing) return -1;
    return a.bracketPosition - b.bracketPosition;
  };

  let processedVerses = 0;

  Object.entries(verses).forEach(([chapterNum, chapter]) => {
    Object.entries(chapter).forEach(([verseNum, verseText]) => {
      const reference = `${chapterNum}:${verseNum}`;

      // Running count per exact matched text (case-sensitive) in this verse.
      const seenCounts = new Map();

      const verseRows = findMatches(verseText, termTrie).map((match) => {
        const origWords = match.matchedText;
        const occurrence = (seenCounts.get(origWords) || 0) + 1;
        seenCounts.set(origWords, occurrence);

        const id = generateId();
        const primaryArticle = match.articles[0];
        const context = match.context;

        return {
          reference,
          id,
          tags: getArticleCategory(primaryArticle),
          origWords,
          occurrence,
          twLink: createTWLink(primaryArticle),
          disambiguation: createDisambiguation(match.articles),
          context,
          bracketPosition: context.indexOf('[')
        };
      });

      verseRows.sort(byBracketPosition);

      verseRows.forEach((row) => {
        const cells = [
          row.reference,
          row.id,
          row.tags,
          row.origWords,
          row.occurrence,
          row.twLink,
          row.disambiguation,
          row.context
        ];
        tsvRows.push(cells.join('\t'));
      });

      // Report progress every 100 verses and once at the very end.
      processedVerses += 1;
      const isCheckpoint = processedVerses % 100 === 0 || processedVerses === totalVerses;
      if (isCheckpoint) {
        console.log(`Progress: ${processedVerses}/${totalVerses} verses (${Math.round(processedVerses / totalVerses * 100)}%)`);
      }
    });
  });

  return tsvRows.join('\n');
}
@@ -0,0 +1,104 @@
1
+ /* eslint-disable no-async-promise-executor, no-throw-literal */
2
+
3
+ import fetch from 'node-fetch';
4
+ import { BibleBookData } from '../common/books.js';
5
+
6
+ // Note: This version doesn't use usfm-js to avoid external dependencies
7
+ // It implements a simple USFM alignment remover for the specific case
8
+
9
/**
 * Strip alignment and formatting markup from USFM text, leaving one line per
 * \c / \v marker.
 *
 * The substitutions run strictly in the order listed; several of them depend
 * on the output of earlier ones (e.g. newlines are flattened to spaces before
 * \v and \c are moved onto their own lines).
 *
 * @param {string} usfmContent - Raw USFM text.
 * @returns {string} Cleaned text starting at the first line containing "\c"
 *   (when such a line exists after the first line), trimmed. Empty string for
 *   empty/missing input.
 */
export const removeAllTagsExceptChapterVerse = (usfmContent) => {
  if (!usfmContent) return '';

  // [pattern, replacement] pairs, applied top to bottom.
  const substitutions = [
    // Word-level alignment markers like \w word|lemma="lemma" strong="H1234"\w*
    [/\\w\s+([^|\\]+)\|[^\\]*\\w\*/g, '$1'],
    // Milestone markers like \zaln-s | \zaln-e\*
    [/\\zaln-[se][^\\]*\\?\*?/g, ''],
    // Other alignment-related markers
    [/\\k-[se][^\\]*\\?\*?/g, ''],
    // Runs of blank lines left behind by marker removal
    [/\n\s*\n\s*\n/g, '\n\n'],
    // Any remaining |attribute text up to the next backslash
    [/\|[^\\]*(?=\\)/g, ''],
    // Flatten to a single line, then break before each \v and \c
    [/\n/g, ' '],
    [/ +\\v +/g, '\n\\v '],
    [/ +\\c +/g, '\n\\c '],
    // Poetry / paragraph / chunk markers become a single space
    [/ *(\\q\d*|\\p|\\ts\\\*) */g, ' '],
    [/ +/g, ' '],
    [/^ +$/g, ''],
    // Footnotes
    [/\\f .*?\\f\*/g, ' '],
    // Curly braces
    [/[\{\}]/g, ''],
  ];

  const stripped = substitutions.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    usfmContent
  );

  // Drop everything before the first line that contains a \c marker.
  const allLines = stripped.split('\n');
  const chapterStart = allLines.findIndex((line) => line.includes('\\c'));
  const kept = chapterStart > 0 ? allLines.slice(chapterStart).join('\n') : stripped;

  return kept.trim();
};
47
+
48
/**
 * Fetch a book's USFM file from the en_ult repository on git.door43.org,
 * strip its markup and parse it into chapters and verses.
 *
 * @param {string} book - The book identifier (a key of BibleBookData).
 * @return {Promise<Object>} - Object keyed by chapter, then verse.
 * @throws {Error} If the book is unknown or the download response is not OK.
 */
export async function processUsfmForBook(book) {
  const bookInfo = BibleBookData[book];
  if (!bookInfo) {
    throw new Error(`Unknown book: ${book}`);
  }

  const response = await fetch(
    `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${bookInfo.usfm}.usfm?ref=master`
  );
  if (!response.ok) {
    throw new Error(`Failed to download USFM file for ${book}`);
  }

  // The JSON payload's `content` field is decoded here as base64.
  const payload = await response.json();
  const rawUsfm = Buffer.from(payload.content, 'base64').toString('utf-8');

  // Strip alignments, then split into chapters and verses.
  return parseUsfmToVerses(removeAllTagsExceptChapterVerse(rawUsfm));
}
68
+
69
/**
 * Parse clean USFM content into a chapters/verses object.
 *
 * The text is split on \c and \v markers. Text following a \c marker is
 * ignored; text following a \v marker becomes that verse's content (with
 * whitespace collapsed). Verses that appear before any \c marker are filed
 * under chapter 1. A verse bridge such as "\v 2-3" is stored under its first
 * verse number, and the "-3" part is not included in the verse text.
 *
 * @param {string} usfm - Clean USFM content
 * @return {Object} - Object keyed by chapter number, then verse number.
 *   Empty object for empty/missing input.
 */
export function parseUsfmToVerses(usfm) {
  const versesObj = {};
  if (!usfm) return versesObj;

  let currentChapter = 1;

  // split() with two capture groups yields [lead, tag, number, text, tag,
  // number, text, ...]; the optional bridge suffix is matched but not captured.
  const parts = usfm.split(/\\([cv])\s*(\d+)(?:-\d+)?/);

  for (let i = 1; i < parts.length; i += 3) {
    const tag = parts[i]; // 'c' or 'v'
    const number = Number.parseInt(parts[i + 1], 10);
    const text = parts[i + 2] || '';

    if (tag === 'c') {
      currentChapter = number;
    }

    if (!versesObj[currentChapter]) {
      versesObj[currentChapter] = {};
    }

    if (tag === 'v') {
      // Collapse runs of whitespace/newlines; skip verses with no text.
      const cleanText = text.replace(/\s+/g, ' ').trim();
      if (cleanText) {
        versesObj[currentChapter][number] = cleanText;
      }
    }
  }

  return versesObj;
}