twl-generator 1.4.1 → 1.4.3

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "twl-generator",
3
- "version": "1.4.1",
3
+ "version": "1.4.3",
4
4
  "description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -46,9 +46,10 @@
46
46
  "node": ">=18.0.0"
47
47
  },
48
48
  "dependencies": {
49
+ "compromise": "^14.14.2",
49
50
  "csv-parse": "^5.5.6",
50
51
  "csv-stringify": "^6.5.0",
51
- "compromise": "^14.14.2",
52
+ "jszip": "^3.10.1",
52
53
  "tsv-quote-converters": "^1.1.13"
53
54
  },
54
55
  "peerDependencies": {
@@ -59,4 +60,4 @@
59
60
  "optional": true
60
61
  }
61
62
  }
62
- }
63
+ }
package/src/cli.js CHANGED
@@ -15,7 +15,7 @@ async function readBooksJs() {
15
15
  }
16
16
 
17
17
  function parseArgs(argv) {
18
- const args = { book: '', out: '', outDir: '', all: false, useCompromise: false };
18
+ const args = { book: '', out: '', outDir: '', all: false, useCompromise: false, dcsHost: 'https://git.door43.org' };
19
19
  for (let i = 2; i < argv.length; i++) {
20
20
  const a = argv[i];
21
21
  if (a === '--book' || a === '-b') { args.book = argv[++i] || ''; }
@@ -23,21 +23,22 @@ function parseArgs(argv) {
23
23
  else if (a === '--out-dir' || a === '-O') { args.outDir = argv[++i] || ''; }
24
24
  else if (a === '--all' || a === '-A') { args.all = true; }
25
25
  else if (a === '--use-compromise') { args.useCompromise = true; }
26
+ else if (a === '--dcs') { args.dcsHost = argv[++i] || 'https://git.door43.org'; }
26
27
  }
27
28
  return args;
28
29
  }
29
30
 
30
31
  async function main() {
31
- const { book, out, outDir, all, useCompromise } = parseArgs(process.argv);
32
+ const { book, out, outDir, all, useCompromise, dcsHost } = parseArgs(process.argv);
32
33
  if (all || (book && book.toLowerCase() === 'all')) {
33
34
  const books = await readBooksJs();
34
35
  const codes = Object.keys(books);
35
36
  const destDir = outDir ? path.resolve(outDir) : path.resolve(THIS_DIR, '..'); // default to twl-generator dir
36
37
  await fs.mkdir(destDir, { recursive: true });
37
- console.error(`Generating TWL for ${codes.length} books to ${destDir} (useCompromise=${useCompromise})`);
38
+ console.error(`Generating TWL for ${codes.length} books to ${destDir} (useCompromise=${useCompromise}, dcsHost=${dcsHost})`);
38
39
  for (const code of codes) {
39
40
  try {
40
- const { matchedTsv, noMatchTsv } = await generateTwlByBook(code, { useCompromise });
41
+ const { matchedTsv, noMatchTsv } = await generateTwlByBook(code, { useCompromise, dcsHost });
41
42
  const fname = `${code.toLowerCase()}.twl.tsv`;
42
43
  const outPath = path.join(destDir, fname);
43
44
  await fs.writeFile(outPath, matchedTsv, 'utf8');
@@ -52,11 +53,12 @@ async function main() {
52
53
  }
53
54
 
54
55
  if (!book) {
55
- console.error('Usage: generate-twl --book <code>|all [--out <file.tsv> | --out-dir <dir>] [--use-compromise]');
56
+ console.error('Usage: generate-twl --book <code>|all [--out <file.tsv> | --out-dir <dir>] [--use-compromise] [--dcs <host>]');
57
+ console.error(' --dcs defaults to https://git.door43.org');
56
58
  process.exit(1);
57
59
  }
58
60
 
59
- const { matchedTsv, noMatchTsv } = await generateTwlByBook(book, { useCompromise });
61
+ const { matchedTsv, noMatchTsv } = await generateTwlByBook(book, { useCompromise, dcsHost });
60
62
  if (out) {
61
63
  const outPath = path.resolve(out);
62
64
  await fs.writeFile(outPath, matchedTsv, 'utf8');
package/src/index.js CHANGED
@@ -1,7 +1,6 @@
1
1
  import { BibleBookData } from './common/books.js';
2
2
 
3
3
  const isBrowser = typeof window !== 'undefined';
4
- const TW_JSON_URL = new URL('../tw_strongs_list.json', import.meta.url);
5
4
 
6
5
  async function readBooks() {
7
6
  // Build a simple CODE -> { usfm, testament } map from the local BibleBookData
@@ -20,29 +19,11 @@ function findBookMeta(bookMap, code) {
20
19
  return { key, ...meta };
21
20
  }
22
21
 
23
- async function fetchUsfm(usfmCode, testament) {
24
- const repo = testament === 'old' ? 'hbo_uhb' : 'el-x-koine_ugnt';
25
- const url = `https://git.door43.org/api/v1/repos/unfoldingWord/${repo}/contents/${usfmCode}.usfm`;
26
- const res = await fetch(url);
27
- if (!res.ok) throw new Error(`Failed to fetch USFM: ${res.status} ${res.statusText}`);
28
- const json = await res.json();
29
- const b64 = json.content || '';
30
-
31
- if (isBrowser) {
32
- // Browser: use atob and TextDecoder
33
- const binary = atob(b64);
34
- const bytes = Uint8Array.from(binary, c => c.charCodeAt(0));
35
- const decoder = new TextDecoder('utf-8');
36
- return decoder.decode(bytes);
37
- } else {
38
- // Node.js: use Buffer
39
- const { Buffer } = await import('node:buffer');
40
- const buf = Buffer.from(b64, 'base64');
41
- return buf.toString('utf8');
42
- }
43
- }
44
-
45
- function pivotByStrong(twMap) {
22
+ async function loadTermsFromEnTw(dcsHost = 'https://git.door43.org') {
23
+ // Use the updated zipProcessor that accepts dcsHost
24
+ const { generateTWTerms } = await import('./utils/zipProcessor.js');
25
+ return await generateTWTerms(dcsHost);
26
+ } function pivotByStrong(twMap) {
46
27
  // Build two structures:
47
28
  // 1) singles: strong -> Set(articles) including base (strip letter suffix)
48
29
  // 2) seqFirst: base-first-strong -> [{ article, seqBase, len }] preserving order in twMap
@@ -210,23 +191,6 @@ function buildInitialTsv(usfm, strongPivot, bookCode) {
210
191
  return tsv;
211
192
  }
212
193
 
213
- async function loadTwJsonLocal() {
214
- if (isBrowser) {
215
- // In browser, try to fetch from public path
216
- const url = '/tw_strongs_list.json';
217
- const res = await fetch(url);
218
- if (!res.ok) throw new Error(`Failed to fetch tw_strongs_list.json: ${res.status}`);
219
- return await res.json();
220
- } else {
221
- // In Node.js, read from file system
222
- const fs = await import('node:fs/promises');
223
- const { fileURLToPath } = await import('node:url');
224
- const filePath = fileURLToPath(TW_JSON_URL);
225
- const raw = await fs.readFile(filePath, 'utf8');
226
- return JSON.parse(raw);
227
- }
228
- }
229
-
230
194
  function buildArticleTermMap(twMap) {
231
195
  // Normalize helper: remove only trailing parenthetical notes and collapse whitespace
232
196
  const stripParensTrim = (s) => String(s || '').replace(/\s*\([^)]*\)\s*$/, '').replace(/\s+/g, ' ').trim();
@@ -863,20 +827,11 @@ function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twMap, opts
863
827
  }
864
828
 
865
829
  export async function generateTwlByBook(bookCode, options = {}) {
866
- // New: English-first matching (no Strong's), using ULT USFM verses
867
- // Build term -> [articles] from local tw_strongs_list.json (terms only; ignore Strong's)
868
- const twJson = await loadTwJsonLocal();
869
- const termToArticles = {};
870
- for (const [article, val] of Object.entries(twJson)) {
871
- const terms = (val && val.article && Array.isArray(val.article.terms)) ? val.article.terms : [];
872
- for (const raw of terms) {
873
- const term = String(raw || '').replace(/\s*\([^)]*\)\s*$/, '').trim();
874
- if (!term) continue;
875
- if (!termToArticles[term]) termToArticles[term] = [];
876
- // Use slug as-is (e.g., kt/grace)
877
- termToArticles[term].push(article);
878
- }
879
- }
830
+ // Extract dcsHost option with default
831
+ const dcsHost = options.dcsHost || 'https://git.door43.org';
832
+
833
+ // Load terms from en_tw zip file instead of local tw_strongs_list.json
834
+ const termToArticles = await loadTermsFromEnTw(dcsHost);
880
835
 
881
836
  // Build trie for fast scanning
882
837
  const { buildTermTrie, scanVerseMatches } = await import('./utils/twl-matcher.js');
@@ -887,7 +842,7 @@ export async function generateTwlByBook(bookCode, options = {}) {
887
842
  const bibleData = await readBooks();
888
843
  const meta = findBookMeta(bibleData, bookCode);
889
844
  if (!meta) throw new Error(`Unknown book code: ${bookCode}`);
890
- const versesByChapter = await processUsfmForBook(meta.key);
845
+ const versesByChapter = await processUsfmForBook(meta.key, dcsHost);
891
846
 
892
847
  // Header without Strongs; keep GLQuote/GLOccurrence and add Variant of, Disambiguation
893
848
  const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Variant of', 'Disambiguation'];
@@ -295,7 +295,6 @@ function createOptimizedTermMap(twTerms) {
295
295
  let variants = new Set([originalTerm]);
296
296
  const isName = articles[0].startsWith('names/') || articles[1]?.startsWith('names/')
297
297
  variants = generateVariants(originalTerm, isName);
298
- console.log(variants)
299
298
  for (const variant of variants) {
300
299
  if (variant !== originalTerm) {
301
300
  trie.insert(variant, originalTerm, articles, false);
@@ -68,13 +68,13 @@ export const removeAllTagsExceptChapterVerse = (usfmContent) => {
68
68
  * @param {string} book - The book identifier
69
69
  * @return {Promise<Object>} - Object with chapters and verses
70
70
  */
71
- export async function processUsfmForBook(book) {
71
+ export async function processUsfmForBook(book, dcsHost = 'https://git.door43.org') {
72
72
  // Normalize book key to lowercase to match BibleBookData keys
73
73
  const key = String(book || '').toLowerCase();
74
74
  if (!BibleBookData[key]) throw new Error(`Unknown book: ${book}`);
75
75
 
76
76
  const fetch = await getFetch();
77
- const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[key].usfm}.usfm?ref=master`;
77
+ const usfmUrl = `${dcsHost}/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[key].usfm}.usfm?ref=master`;
78
78
  const usfmRes = await fetch(usfmUrl);
79
79
  if (!usfmRes.ok) throw new Error(`Failed to download USFM file for ${book}`);
80
80
  const usfmData = await usfmRes.json();
@@ -1,11 +1,11 @@
1
1
  /**
2
2
  * Universal TWL zipProcessor - Works in both Node.js and Browser environments
3
3
  *
4
- * Caches the raw ZIP file and processes term headers on-demand
4
+ * Downloads and processes the en_tw ZIP archive on demand; caching was intentionally removed (per user request)
5
5
  *
6
6
  * Usage in React.js:
7
7
  * import { generateTWTerms } from './utils/zipProcessor.js';
8
- * const terms = await generateTWTerms();
8
+ * const terms = await generateTWTerms('https://git.door43.org');
9
9
  */
10
10
  import JSZip from "jszip";
11
11
 
@@ -13,141 +13,6 @@ import JSZip from "jszip";
13
13
  const isNode = typeof process !== 'undefined' && process.versions?.node;
14
14
  const isBrowser = typeof window !== 'undefined';
15
15
 
16
- const ZIP_URL = 'https://git.door43.org/unfoldingWord/en_tw/archive/master.zip';
17
- const CACHE_KEY = 'twl_zip_cache';
18
- const CACHE_VERSION = '1.0';
19
-
20
- // In-memory cache for processed terms (per session)
21
- let processedTermsCache = null;
22
-
23
- async function getCachedZip() {
24
- if (isBrowser) {
25
- // Browser: Use localStorage for ZIP cache
26
- try {
27
- const cached = localStorage.getItem(CACHE_KEY);
28
- if (cached) {
29
- const data = JSON.parse(cached);
30
- // Only use cache if version matches and cache is less than 5 minutes old
31
- const FIVE_MINUTES = 5 * 60 * 1000;
32
- if (
33
- data.version === CACHE_VERSION &&
34
- data.timestamp &&
35
- (Date.now() - data.timestamp) < FIVE_MINUTES
36
- ) {
37
- console.log('Using cached ZIP from browser storage');
38
- return new Uint8Array(data.zipData);
39
- } else {
40
- localStorage.removeItem(CACHE_KEY);
41
- }
42
- }
43
- } catch (error) {
44
- console.log('Browser ZIP cache corrupted, re-downloading...');
45
- try { localStorage.removeItem(CACHE_KEY); } catch (e) { }
46
- }
47
- }
48
- // Note: In Node.js we could cache to filesystem, but fresh download is fine for CLI usage
49
-
50
- return null;
51
- }
52
-
53
- /**
54
- * Cache ZIP data in appropriate storage
55
- */
56
- async function cacheZip(zipBuffer) {
57
- if (isBrowser) {
58
- try {
59
- const cacheData = {
60
- version: CACHE_VERSION,
61
- timestamp: Date.now(),
62
- zipData: Array.from(new Uint8Array(zipBuffer))
63
- };
64
- localStorage.setItem(CACHE_KEY, JSON.stringify(cacheData));
65
- console.log('ZIP cached in browser storage');
66
- } catch (error) {
67
- console.warn('Failed to cache ZIP in browser:', error.message);
68
- }
69
- }
70
- }
71
-
72
- /**
73
- * Get browser storage (localStorage or sessionStorage)
74
- */
75
- function getBrowserStorage() {
76
- if (!isBrowser) return null;
77
-
78
- try {
79
- return localStorage || sessionStorage || null;
80
- } catch (e) {
81
- console.warn('Browser storage not available:', e.message);
82
- return null;
83
- }
84
- }
85
-
86
- /**
87
- * Get cached terms from appropriate storage
88
- */
89
- async function getCachedTerms() {
90
- // Check in-memory cache first (fastest)
91
- if (memoryCache) {
92
- console.log('Using in-memory cached article terms');
93
- return memoryCache;
94
- }
95
-
96
- if (isBrowser) {
97
- // Browser caching with localStorage/sessionStorage
98
- const storage = getBrowserStorage();
99
- if (storage) {
100
- try {
101
- const cached = storage.getItem(CACHE_KEY);
102
- if (cached) {
103
- const data = JSON.parse(cached);
104
- if (data.version === CACHE_VERSION) {
105
- console.log('Using browser cached article terms');
106
- memoryCache = data.terms;
107
- return data.terms;
108
- } else {
109
- console.log('Browser cache version mismatch, regenerating...');
110
- storage.removeItem(CACHE_KEY);
111
- }
112
- }
113
- } catch (error) {
114
- console.log('Browser cache corrupted, regenerating...');
115
- try {
116
- storage.removeItem(CACHE_KEY);
117
- } catch (e) { /* ignore cleanup errors */ }
118
- }
119
- }
120
- }
121
-
122
- return null;
123
- }
124
-
125
- /**
126
- * Cache terms in appropriate storage
127
- */
128
- async function cacheTerms(termMap) {
129
- // Always cache in memory for this session
130
- memoryCache = termMap;
131
-
132
- if (isBrowser) {
133
- // Browser caching
134
- const storage = getBrowserStorage();
135
- if (storage) {
136
- try {
137
- const cacheData = {
138
- version: CACHE_VERSION,
139
- timestamp: Date.now(),
140
- terms: termMap
141
- };
142
- storage.setItem(CACHE_KEY, JSON.stringify(cacheData));
143
- console.log('Article terms cached in browser storage');
144
- } catch (error) {
145
- console.warn('Failed to cache in browser storage:', error.message);
146
- }
147
- }
148
- }
149
- }
150
-
151
16
  /**
152
17
  * Process ZIP buffer and extract term mappings
153
18
  */
@@ -173,7 +38,7 @@ async function processZipBuffer(zipBuffer) {
173
38
  const content = await entry.getData(); // Await the async string content
174
39
  const firstLine = content.split('\n')[0];
175
40
  const terms = firstLine.replace(/^#/, '').trim().split(',').map(t => t.trim()).filter(Boolean);
176
- const truncated = entry.entryName.replace('en_tw/bible/', '');
41
+ const truncated = entry.entryName.replace('en_tw/bible/', '').replace(/\.md$/, '');
177
42
 
178
43
  for (const term of terms) {
179
44
  // Normalize terms by removing parentheses and spaces before them
@@ -195,28 +60,15 @@ async function processZipBuffer(zipBuffer) {
195
60
  return termMap;
196
61
  }
197
62
 
198
- export async function generateTWTerms() {
199
- // Check if we already processed terms this session
200
- if (processedTermsCache) {
201
- console.log('Using in-memory processed terms');
202
- return processedTermsCache;
203
- }
204
-
205
- // Try to get cached ZIP first
206
- let zipBuffer = await getCachedZip();
63
+ export async function generateTWTerms(dcsHost = 'https://git.door43.org') {
64
+ // Always download fresh ZIP (no caching per user request)
65
+ const zipUrl = `${dcsHost}/unfoldingWord/en_tw/archive/master.zip`;
66
+ console.log(`Downloading TW archive from ${zipUrl}...`);
207
67
 
208
- if (!zipBuffer) {
209
- // Download fresh ZIP
210
- console.log('Downloading TW archive...');
68
+ const res = await fetch(zipUrl);
69
+ if (!res.ok) throw new Error(`Failed to download ZIP: ${res.status} ${res.statusText}`);
211
70
 
212
- const res = await fetch(ZIP_URL);
213
- if (!res.ok) throw new Error(`Failed to download ZIP: ${res.status} ${res.statusText}`);
214
-
215
- zipBuffer = await res.arrayBuffer();
216
-
217
- // Cache the ZIP for next time
218
- await cacheZip(zipBuffer);
219
- }
71
+ const zipBuffer = await res.arrayBuffer();
220
72
 
221
73
  // Process ZIP to extract terms
222
74
  console.log('Processing TW articles...');
@@ -224,64 +76,16 @@ export async function generateTWTerms() {
224
76
 
225
77
  console.log(`Generated ${Object.keys(termMap).length} terms from TW archive`);
226
78
 
227
- // Cache processed terms for this session
228
- processedTermsCache = termMap;
229
-
230
79
  return termMap;
231
80
  }
232
81
 
233
82
  /**
234
- * Clear cache - useful for forcing refresh
83
+ * Get information about the current environment for debugging
235
84
  */
236
- export async function clearCache() {
237
- // Clear in-memory cache
238
- processedTermsCache = null;
239
-
240
- if (isBrowser) {
241
- try {
242
- localStorage.removeItem(CACHE_KEY);
243
- console.log('Browser ZIP cache cleared');
244
- return true;
245
- } catch (error) {
246
- console.warn('Failed to clear browser cache:', error.message);
247
- return false;
248
- }
249
- }
250
-
251
- console.log('Memory cache cleared');
252
- return true;
253
- }
254
-
255
- /**
256
- * Get cache information for debugging
257
- */
258
- export function getCacheInfo() {
259
- const info = {
85
+ export function getEnvironmentInfo() {
86
+ return {
260
87
  environment: isNode ? 'Node.js' : (isBrowser ? 'Browser' : 'Unknown'),
261
- hasProcessedTerms: !!processedTermsCache,
262
- hasZipCache: false,
263
- termCount: 0,
264
- cacheVersion: CACHE_VERSION
88
+ hasFetch: typeof fetch !== 'undefined',
89
+ hasJSZip: typeof JSZip !== 'undefined'
265
90
  };
266
-
267
- // Check processed terms
268
- if (processedTermsCache) {
269
- info.termCount = Object.keys(processedTermsCache).length;
270
- }
271
-
272
- // Check ZIP cache in browser
273
- if (isBrowser) {
274
- try {
275
- const cached = localStorage.getItem(CACHE_KEY);
276
- if (cached) {
277
- const data = JSON.parse(cached);
278
- info.hasZipCache = true;
279
- info.timestamp = data.timestamp ? new Date(data.timestamp) : null;
280
- }
281
- } catch (error) {
282
- // Ignore parse errors
283
- }
284
- }
285
-
286
- return info;
287
91
  }