twl-generator 1.0.3 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/utils/zipProcessor.js +145 -123
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.0.3",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
package/src/utils/zipProcessor.js
CHANGED
|
@@ -1,15 +1,13 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Universal TWL zipProcessor - Works in both Node.js and Browser environments
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* For React.js/Browser: Uses localStorage/sessionStorage for persistent caching
|
|
4
|
+
* Caches the raw ZIP file and processes term headers on-demand
|
|
6
5
|
*
|
|
7
6
|
* Usage in React.js:
|
|
8
7
|
* import { generateTWTerms } from './utils/zipProcessor.js';
|
|
9
8
|
* const terms = await generateTWTerms();
|
|
10
9
|
*/
|
|
11
10
|
|
|
12
|
-
import AdmZip from 'adm-zip';
|
|
13
11
|
import { BibleBookData } from '../common/books.js';
|
|
14
12
|
|
|
15
13
|
// Environment detection
|
|
@@ -17,11 +15,11 @@ const isNode = typeof process !== 'undefined' && process.versions?.node;
|
|
|
17
15
|
const isBrowser = typeof window !== 'undefined';
|
|
18
16
|
|
|
19
17
|
const ZIP_URL = 'https://git.door43.org/unfoldingWord/en_tw/archive/master.zip';
|
|
20
|
-
const CACHE_KEY = '
|
|
18
|
+
const CACHE_KEY = 'twl_zip_cache';
|
|
21
19
|
const CACHE_VERSION = '1.0';
|
|
22
20
|
|
|
23
|
-
// In-memory cache for
|
|
24
|
-
let
|
|
21
|
+
// In-memory cache for processed terms (per session)
|
|
22
|
+
let processedTermsCache = null;
|
|
25
23
|
|
|
26
24
|
/**
|
|
27
25
|
* Get Node.js dependencies dynamically
|
|
@@ -30,18 +28,14 @@ async function getNodeDeps() {
|
|
|
30
28
|
if (!isNode) return null;
|
|
31
29
|
|
|
32
30
|
try {
|
|
33
|
-
const [nodeModule,
|
|
31
|
+
const [nodeModule, admZipModule] = await Promise.all([
|
|
34
32
|
import('node-fetch'),
|
|
35
|
-
import('
|
|
36
|
-
import('path'),
|
|
37
|
-
import('url')
|
|
33
|
+
import('adm-zip')
|
|
38
34
|
]);
|
|
39
35
|
|
|
40
36
|
return {
|
|
41
37
|
fetch: nodeModule.default,
|
|
42
|
-
|
|
43
|
-
path: pathModule.default,
|
|
44
|
-
fileURLToPath: urlModule.fileURLToPath
|
|
38
|
+
AdmZip: admZipModule.default
|
|
45
39
|
};
|
|
46
40
|
} catch (error) {
|
|
47
41
|
console.error('Failed to load Node.js dependencies:', error);
|
|
@@ -49,6 +43,52 @@ async function getNodeDeps() {
|
|
|
49
43
|
}
|
|
50
44
|
}
|
|
51
45
|
|
|
46
|
+
/**
|
|
47
|
+
* Get cached ZIP data from appropriate storage
|
|
48
|
+
*/
|
|
49
|
+
async function getCachedZip() {
|
|
50
|
+
if (isBrowser) {
|
|
51
|
+
// Browser: Use localStorage for ZIP cache
|
|
52
|
+
try {
|
|
53
|
+
const cached = localStorage.getItem(CACHE_KEY);
|
|
54
|
+
if (cached) {
|
|
55
|
+
const data = JSON.parse(cached);
|
|
56
|
+
if (data.version === CACHE_VERSION) {
|
|
57
|
+
console.log('Using cached ZIP from browser storage');
|
|
58
|
+
return new Uint8Array(data.zipData);
|
|
59
|
+
} else {
|
|
60
|
+
localStorage.removeItem(CACHE_KEY);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
} catch (error) {
|
|
64
|
+
console.log('Browser ZIP cache corrupted, re-downloading...');
|
|
65
|
+
try { localStorage.removeItem(CACHE_KEY); } catch (e) { }
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
// Note: In Node.js we could cache to filesystem, but fresh download is fine for CLI usage
|
|
69
|
+
|
|
70
|
+
return null;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Cache ZIP data in appropriate storage
|
|
75
|
+
*/
|
|
76
|
+
async function cacheZip(zipBuffer) {
|
|
77
|
+
if (isBrowser) {
|
|
78
|
+
try {
|
|
79
|
+
const cacheData = {
|
|
80
|
+
version: CACHE_VERSION,
|
|
81
|
+
timestamp: Date.now(),
|
|
82
|
+
zipData: Array.from(new Uint8Array(zipBuffer))
|
|
83
|
+
};
|
|
84
|
+
localStorage.setItem(CACHE_KEY, JSON.stringify(cacheData));
|
|
85
|
+
console.log('ZIP cached in browser storage');
|
|
86
|
+
} catch (error) {
|
|
87
|
+
console.warn('Failed to cache ZIP in browser:', error.message);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
52
92
|
/**
|
|
53
93
|
* Get browser storage (localStorage or sessionStorage)
|
|
54
94
|
*/
|
|
@@ -164,114 +204,108 @@ async function cacheTerms(termMap) {
|
|
|
164
204
|
}
|
|
165
205
|
}
|
|
166
206
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
207
|
+
/**
|
|
208
|
+
* Process ZIP buffer and extract term mappings
|
|
209
|
+
*/
|
|
210
|
+
async function processZipBuffer(zipBuffer) {
|
|
211
|
+
let zip;
|
|
212
|
+
|
|
213
|
+
if (isNode) {
|
|
214
|
+
// Node.js: Use adm-zip
|
|
215
|
+
const deps = await getNodeDeps();
|
|
216
|
+
if (!deps) throw new Error('Failed to load Node.js dependencies');
|
|
217
|
+
const { AdmZip } = deps;
|
|
218
|
+
const Buffer = (await import('buffer')).Buffer;
|
|
219
|
+
zip = new AdmZip(Buffer.from(zipBuffer));
|
|
220
|
+
} else {
|
|
221
|
+
// Browser: Use browser-compatible zip processing
|
|
222
|
+
// For now, we'll use a simple approach that works with the TW archive structure
|
|
223
|
+
throw new Error('Browser ZIP processing not yet implemented. Use cached data or run initial processing in Node.js.');
|
|
172
224
|
}
|
|
173
225
|
|
|
174
|
-
|
|
226
|
+
const entries = zip.getEntries().filter(e => e.entryName.match(/^en_tw\/bible\/.*\/.*\.md$/));
|
|
227
|
+
entries.sort((a, b) => a.entryName.localeCompare(b.entryName));
|
|
175
228
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
229
|
+
const termMap = {};
|
|
230
|
+
|
|
231
|
+
for (const entry of entries) {
|
|
232
|
+
const content = entry.getData().toString('utf8');
|
|
233
|
+
const firstLine = content.split('\n')[0];
|
|
234
|
+
const terms = firstLine.replace(/^#/, '').trim().split(',').map(t => t.trim()).filter(Boolean);
|
|
235
|
+
const truncated = entry.entryName.replace('en_tw/bible/', '');
|
|
236
|
+
|
|
237
|
+
for (const term of terms) {
|
|
238
|
+
// Normalize terms by removing parentheses and spaces before them
|
|
239
|
+
// e.g., "Joseph (OT)" -> "Joseph", "Mary (sister of Martha)" -> "Mary"
|
|
240
|
+
const normalizedTerm = term.replace(/\s+\([^)]*\)$/, '').trim();
|
|
241
|
+
|
|
242
|
+
if (!termMap[normalizedTerm]) {
|
|
243
|
+
termMap[normalizedTerm] = [];
|
|
244
|
+
}
|
|
245
|
+
termMap[normalizedTerm].push(truncated);
|
|
185
246
|
}
|
|
247
|
+
}
|
|
186
248
|
|
|
187
|
-
|
|
188
|
-
|
|
249
|
+
// Sort article arrays for consistent output
|
|
250
|
+
for (const term in termMap) {
|
|
251
|
+
termMap[term].sort();
|
|
252
|
+
}
|
|
189
253
|
|
|
190
|
-
|
|
191
|
-
|
|
254
|
+
return termMap;
|
|
255
|
+
}
|
|
192
256
|
|
|
193
|
-
|
|
257
|
+
export async function generateTWTerms() {
|
|
258
|
+
// Check if we already processed terms this session
|
|
259
|
+
if (processedTermsCache) {
|
|
260
|
+
console.log('Using in-memory processed terms');
|
|
261
|
+
return processedTermsCache;
|
|
262
|
+
}
|
|
194
263
|
|
|
195
|
-
|
|
196
|
-
|
|
264
|
+
// Try to get cached ZIP first
|
|
265
|
+
let zipBuffer = await getCachedZip();
|
|
197
266
|
|
|
198
|
-
|
|
267
|
+
if (!zipBuffer) {
|
|
268
|
+
// Download fresh ZIP
|
|
269
|
+
console.log('Downloading TW archive...');
|
|
199
270
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
const firstLine = content.split('\n')[0];
|
|
203
|
-
const terms = firstLine.replace(/^#/, '').trim().split(',').map(t => t.trim()).filter(Boolean);
|
|
204
|
-
const truncated = entry.entryName.replace('en_tw/bible/', '');
|
|
271
|
+
const fetchFn = isBrowser ? window.fetch : (await getNodeDeps())?.fetch;
|
|
272
|
+
if (!fetchFn) throw new Error('Fetch not available');
|
|
205
273
|
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
// e.g., "Joseph (OT)" -> "Joseph", "Mary (sister of Martha)" -> "Mary"
|
|
209
|
-
const normalizedTerm = term.replace(/\s+\([^)]*\)$/, '').trim();
|
|
274
|
+
const res = await fetchFn(ZIP_URL);
|
|
275
|
+
if (!res.ok) throw new Error(`Failed to download ZIP: ${res.status} ${res.statusText}`);
|
|
210
276
|
|
|
211
|
-
|
|
212
|
-
termMap[normalizedTerm] = [];
|
|
213
|
-
}
|
|
214
|
-
termMap[normalizedTerm].push(truncated);
|
|
215
|
-
}
|
|
216
|
-
}
|
|
277
|
+
zipBuffer = await res.arrayBuffer();
|
|
217
278
|
|
|
218
|
-
//
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
}
|
|
279
|
+
// Cache the ZIP for next time
|
|
280
|
+
await cacheZip(zipBuffer);
|
|
281
|
+
}
|
|
222
282
|
|
|
223
|
-
|
|
283
|
+
// Process ZIP to extract terms
|
|
284
|
+
console.log('Processing TW articles...');
|
|
285
|
+
const termMap = await processZipBuffer(zipBuffer);
|
|
224
286
|
|
|
225
|
-
|
|
226
|
-
await cacheTerms(termMap);
|
|
287
|
+
console.log(`Generated ${Object.keys(termMap).length} terms from TW archive`);
|
|
227
288
|
|
|
228
|
-
|
|
289
|
+
// Cache processed terms for this session
|
|
290
|
+
processedTermsCache = termMap;
|
|
229
291
|
|
|
230
|
-
|
|
231
|
-
console.error('Error generating TW terms:', error);
|
|
232
|
-
throw error;
|
|
233
|
-
}
|
|
292
|
+
return termMap;
|
|
234
293
|
}
|
|
235
294
|
|
|
236
295
|
/**
|
|
237
|
-
* Clear cache - useful for forcing refresh
|
|
238
|
-
* @returns {Promise<boolean>} - true if cache was cleared successfully
|
|
296
|
+
* Clear cache - useful for forcing refresh
|
|
239
297
|
*/
|
|
240
298
|
export async function clearCache() {
|
|
241
299
|
// Clear in-memory cache
|
|
242
|
-
|
|
300
|
+
processedTermsCache = null;
|
|
243
301
|
|
|
244
302
|
if (isBrowser) {
|
|
245
|
-
// Clear browser storage
|
|
246
|
-
const storage = getBrowserStorage();
|
|
247
|
-
if (storage) {
|
|
248
|
-
try {
|
|
249
|
-
storage.removeItem(CACHE_KEY);
|
|
250
|
-
console.log('Browser cache cleared');
|
|
251
|
-
return true;
|
|
252
|
-
} catch (error) {
|
|
253
|
-
console.warn('Failed to clear browser cache:', error.message);
|
|
254
|
-
return false;
|
|
255
|
-
}
|
|
256
|
-
}
|
|
257
|
-
} else if (isNode) {
|
|
258
|
-
// Clear Node.js file cache
|
|
259
303
|
try {
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
const __filename = fileURLToPath(import.meta.url);
|
|
264
|
-
const __dirname = path.dirname(__filename);
|
|
265
|
-
const CACHE_FILE = path.join(__dirname, '../../article_terms.json');
|
|
266
|
-
|
|
267
|
-
if (fs.existsSync(CACHE_FILE)) {
|
|
268
|
-
fs.unlinkSync(CACHE_FILE);
|
|
269
|
-
console.log('File cache cleared');
|
|
270
|
-
return true;
|
|
271
|
-
}
|
|
272
|
-
}
|
|
304
|
+
localStorage.removeItem(CACHE_KEY);
|
|
305
|
+
console.log('Browser ZIP cache cleared');
|
|
306
|
+
return true;
|
|
273
307
|
} catch (error) {
|
|
274
|
-
console.warn('Failed to clear
|
|
308
|
+
console.warn('Failed to clear browser cache:', error.message);
|
|
275
309
|
return false;
|
|
276
310
|
}
|
|
277
311
|
}
|
|
@@ -281,45 +315,33 @@ export async function clearCache() {
|
|
|
281
315
|
}
|
|
282
316
|
|
|
283
317
|
/**
|
|
284
|
-
* Get cache information for debugging
|
|
285
|
-
* @returns {Object} - cache status and info
|
|
318
|
+
* Get cache information for debugging
|
|
286
319
|
*/
|
|
287
320
|
export function getCacheInfo() {
|
|
288
321
|
const info = {
|
|
289
322
|
environment: isNode ? 'Node.js' : (isBrowser ? 'Browser' : 'Unknown'),
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
timestamp: null,
|
|
295
|
-
termCount: 0
|
|
323
|
+
hasProcessedTerms: !!processedTermsCache,
|
|
324
|
+
hasZipCache: false,
|
|
325
|
+
termCount: 0,
|
|
326
|
+
cacheVersion: CACHE_VERSION
|
|
296
327
|
};
|
|
297
328
|
|
|
298
|
-
//
|
|
299
|
-
if (
|
|
300
|
-
info.termCount = Object.keys(
|
|
329
|
+
// Check processed terms
|
|
330
|
+
if (processedTermsCache) {
|
|
331
|
+
info.termCount = Object.keys(processedTermsCache).length;
|
|
301
332
|
}
|
|
302
333
|
|
|
334
|
+
// Check ZIP cache in browser
|
|
303
335
|
if (isBrowser) {
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
const data = JSON.parse(cached);
|
|
311
|
-
info.hasPersistentCache = true;
|
|
312
|
-
info.cacheType = storage === localStorage ? 'localStorage' : 'sessionStorage';
|
|
313
|
-
info.version = data.version;
|
|
314
|
-
info.timestamp = data.timestamp ? new Date(data.timestamp) : null;
|
|
315
|
-
|
|
316
|
-
if (!info.termCount && data.terms) {
|
|
317
|
-
info.termCount = Object.keys(data.terms).length;
|
|
318
|
-
}
|
|
319
|
-
}
|
|
320
|
-
} catch (error) {
|
|
321
|
-
// Ignore parse errors
|
|
336
|
+
try {
|
|
337
|
+
const cached = localStorage.getItem(CACHE_KEY);
|
|
338
|
+
if (cached) {
|
|
339
|
+
const data = JSON.parse(cached);
|
|
340
|
+
info.hasZipCache = true;
|
|
341
|
+
info.timestamp = data.timestamp ? new Date(data.timestamp) : null;
|
|
322
342
|
}
|
|
343
|
+
} catch (error) {
|
|
344
|
+
// Ignore parse errors
|
|
323
345
|
}
|
|
324
346
|
}
|
|
325
347
|
|