twl-generator 1.0.4 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/utils/zipProcessor.js +141 -141
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.0.4",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
package/src/utils/zipProcessor.js
CHANGED
@@ -1,8 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Universal TWL zipProcessor - Works in both Node.js and Browser environments
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
* For React.js/Browser: Uses localStorage/sessionStorage for persistent caching
|
|
4
|
+
* Caches the raw ZIP file and processes term headers on-demand
|
|
6
5
|
*
|
|
7
6
|
* Usage in React.js:
|
|
8
7
|
* import { generateTWTerms } from './utils/zipProcessor.js';
|
|
@@ -16,11 +15,11 @@ const isNode = typeof process !== 'undefined' && process.versions?.node;
|
|
|
16
15
|
const isBrowser = typeof window !== 'undefined';
|
|
17
16
|
|
|
18
17
|
const ZIP_URL = 'https://git.door43.org/unfoldingWord/en_tw/archive/master.zip';
|
|
19
|
-
const CACHE_KEY = '
|
|
18
|
+
const CACHE_KEY = 'twl_zip_cache';
|
|
20
19
|
const CACHE_VERSION = '1.0';
|
|
21
20
|
|
|
22
|
-
// In-memory cache for
|
|
23
|
-
let
|
|
21
|
+
// In-memory cache for processed terms (per session)
|
|
22
|
+
let processedTermsCache = null;
|
|
24
23
|
|
|
25
24
|
/**
|
|
26
25
|
* Get Node.js dependencies dynamically
|
|
@@ -29,19 +28,13 @@ async function getNodeDeps() {
|
|
|
29
28
|
if (!isNode) return null;
|
|
30
29
|
|
|
31
30
|
try {
|
|
32
|
-
const [nodeModule, fsModule, pathModule, urlModule, admZipModule] = await Promise.all([
|
|
31
|
+
const [nodeModule, admZipModule] = await Promise.all([
|
|
33
32
|
import('node-fetch'),
|
|
34
|
-
import('fs'),
|
|
35
|
-
import('path'),
|
|
36
|
-
import('url'),
|
|
37
33
|
import('adm-zip')
|
|
38
34
|
]);
|
|
39
35
|
|
|
40
36
|
return {
|
|
41
37
|
fetch: nodeModule.default,
|
|
42
|
-
fs: fsModule.default,
|
|
43
|
-
path: pathModule.default,
|
|
44
|
-
fileURLToPath: urlModule.fileURLToPath,
|
|
45
38
|
AdmZip: admZipModule.default
|
|
46
39
|
};
|
|
47
40
|
} catch (error) {
|
|
@@ -50,6 +43,52 @@ async function getNodeDeps() {
|
|
|
50
43
|
}
|
|
51
44
|
}
|
|
52
45
|
|
|
46
|
+
/**
|
|
47
|
+
* Get cached ZIP data from appropriate storage
|
|
48
|
+
*/
|
|
49
|
+
async function getCachedZip() {
|
|
50
|
+
if (isBrowser) {
|
|
51
|
+
// Browser: Use localStorage for ZIP cache
|
|
52
|
+
try {
|
|
53
|
+
const cached = localStorage.getItem(CACHE_KEY);
|
|
54
|
+
if (cached) {
|
|
55
|
+
const data = JSON.parse(cached);
|
|
56
|
+
if (data.version === CACHE_VERSION) {
|
|
57
|
+
console.log('Using cached ZIP from browser storage');
|
|
58
|
+
return new Uint8Array(data.zipData);
|
|
59
|
+
} else {
|
|
60
|
+
localStorage.removeItem(CACHE_KEY);
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
} catch (error) {
|
|
64
|
+
console.log('Browser ZIP cache corrupted, re-downloading...');
|
|
65
|
+
try { localStorage.removeItem(CACHE_KEY); } catch (e) { }
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
// Note: In Node.js we could cache to filesystem, but fresh download is fine for CLI usage
|
|
69
|
+
|
|
70
|
+
return null;
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Cache ZIP data in appropriate storage
|
|
75
|
+
*/
|
|
76
|
+
async function cacheZip(zipBuffer) {
|
|
77
|
+
if (isBrowser) {
|
|
78
|
+
try {
|
|
79
|
+
const cacheData = {
|
|
80
|
+
version: CACHE_VERSION,
|
|
81
|
+
timestamp: Date.now(),
|
|
82
|
+
zipData: Array.from(new Uint8Array(zipBuffer))
|
|
83
|
+
};
|
|
84
|
+
localStorage.setItem(CACHE_KEY, JSON.stringify(cacheData));
|
|
85
|
+
console.log('ZIP cached in browser storage');
|
|
86
|
+
} catch (error) {
|
|
87
|
+
console.warn('Failed to cache ZIP in browser:', error.message);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
|
|
53
92
|
/**
|
|
54
93
|
* Get browser storage (localStorage or sessionStorage)
|
|
55
94
|
*/
|
|
@@ -166,134 +205,107 @@ async function cacheTerms(termMap) {
|
|
|
166
205
|
}
|
|
167
206
|
|
|
168
207
|
/**
|
|
169
|
-
* Process
|
|
208
|
+
* Process ZIP buffer and extract term mappings
|
|
170
209
|
*/
|
|
171
|
-
function
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
210
|
+
async function processZipBuffer(zipBuffer) {
|
|
211
|
+
let zip;
|
|
212
|
+
|
|
213
|
+
if (isNode) {
|
|
214
|
+
// Node.js: Use adm-zip
|
|
215
|
+
const deps = await getNodeDeps();
|
|
216
|
+
if (!deps) throw new Error('Failed to load Node.js dependencies');
|
|
217
|
+
const { AdmZip } = deps;
|
|
218
|
+
const Buffer = (await import('buffer')).Buffer;
|
|
219
|
+
zip = new AdmZip(Buffer.from(zipBuffer));
|
|
220
|
+
} else {
|
|
221
|
+
// Browser: Use browser-compatible zip processing
|
|
222
|
+
// For now, we'll use a simple approach that works with the TW archive structure
|
|
223
|
+
throw new Error('Browser ZIP processing not yet implemented. Use cached data or run initial processing in Node.js.');
|
|
224
|
+
}
|
|
175
225
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
*/
|
|
179
|
-
async function processBrowserZip(buffer) {
|
|
180
|
-
// For browser compatibility, we'll throw an error for now
|
|
181
|
-
// This would require a browser-compatible zip library or different approach
|
|
182
|
-
throw new Error('ZIP processing in browser requires cached terms. Please ensure article_terms.json is available or use Node.js environment for initial processing.');
|
|
183
|
-
}
|
|
226
|
+
const entries = zip.getEntries().filter(e => e.entryName.match(/^en_tw\/bible\/.*\/.*\.md$/));
|
|
227
|
+
entries.sort((a, b) => a.entryName.localeCompare(b.entryName));
|
|
184
228
|
|
|
185
|
-
|
|
186
|
-
// Try to get cached terms first
|
|
187
|
-
const cachedTerms = await getCachedTerms();
|
|
188
|
-
if (cachedTerms) {
|
|
189
|
-
return cachedTerms;
|
|
190
|
-
}
|
|
229
|
+
const termMap = {};
|
|
191
230
|
|
|
192
|
-
|
|
231
|
+
for (const entry of entries) {
|
|
232
|
+
const content = entry.getData().toString('utf8');
|
|
233
|
+
const firstLine = content.split('\n')[0];
|
|
234
|
+
const terms = firstLine.replace(/^#/, '').trim().split(',').map(t => t.trim()).filter(Boolean);
|
|
235
|
+
const truncated = entry.entryName.replace('en_tw/bible/', '');
|
|
193
236
|
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
const deps = await getNodeDeps();
|
|
204
|
-
if (!deps) throw new Error('Failed to load Node.js dependencies');
|
|
205
|
-
fetchFn = deps.fetch;
|
|
206
|
-
const AdmZip = deps.AdmZip;
|
|
207
|
-
const Buffer = (await import('buffer')).Buffer;
|
|
208
|
-
processZip = (buffer) => processNodeZip(AdmZip, Buffer, buffer);
|
|
237
|
+
for (const term of terms) {
|
|
238
|
+
// Normalize terms by removing parentheses and spaces before them
|
|
239
|
+
// e.g., "Joseph (OT)" -> "Joseph", "Mary (sister of Martha)" -> "Mary"
|
|
240
|
+
const normalizedTerm = term.replace(/\s+\([^)]*\)$/, '').trim();
|
|
241
|
+
|
|
242
|
+
if (!termMap[normalizedTerm]) {
|
|
243
|
+
termMap[normalizedTerm] = [];
|
|
244
|
+
}
|
|
245
|
+
termMap[normalizedTerm].push(truncated);
|
|
209
246
|
}
|
|
247
|
+
}
|
|
210
248
|
|
|
211
|
-
|
|
212
|
-
|
|
249
|
+
// Sort article arrays for consistent output
|
|
250
|
+
for (const term in termMap) {
|
|
251
|
+
termMap[term].sort();
|
|
252
|
+
}
|
|
213
253
|
|
|
214
|
-
|
|
215
|
-
|
|
254
|
+
return termMap;
|
|
255
|
+
}
|
|
216
256
|
|
|
217
|
-
|
|
218
|
-
|
|
257
|
+
export async function generateTWTerms() {
|
|
258
|
+
// Check if we already processed terms this session
|
|
259
|
+
if (processedTermsCache) {
|
|
260
|
+
console.log('Using in-memory processed terms');
|
|
261
|
+
return processedTermsCache;
|
|
262
|
+
}
|
|
219
263
|
|
|
220
|
-
|
|
264
|
+
// Try to get cached ZIP first
|
|
265
|
+
let zipBuffer = await getCachedZip();
|
|
221
266
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
const terms = firstLine.replace(/^#/, '').trim().split(',').map(t => t.trim()).filter(Boolean);
|
|
226
|
-
const truncated = entry.entryName.replace('en_tw/bible/', '');
|
|
267
|
+
if (!zipBuffer) {
|
|
268
|
+
// Download fresh ZIP
|
|
269
|
+
console.log('Downloading TW archive...');
|
|
227
270
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
// e.g., "Joseph (OT)" -> "Joseph", "Mary (sister of Martha)" -> "Mary"
|
|
231
|
-
const normalizedTerm = term.replace(/\s+\([^)]*\)$/, '').trim();
|
|
271
|
+
const fetchFn = isBrowser ? window.fetch : (await getNodeDeps())?.fetch;
|
|
272
|
+
if (!fetchFn) throw new Error('Fetch not available');
|
|
232
273
|
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
}
|
|
236
|
-
termMap[normalizedTerm].push(truncated);
|
|
237
|
-
}
|
|
238
|
-
}
|
|
274
|
+
const res = await fetchFn(ZIP_URL);
|
|
275
|
+
if (!res.ok) throw new Error(`Failed to download ZIP: ${res.status} ${res.statusText}`);
|
|
239
276
|
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
277
|
+
zipBuffer = await res.arrayBuffer();
|
|
278
|
+
|
|
279
|
+
// Cache the ZIP for next time
|
|
280
|
+
await cacheZip(zipBuffer);
|
|
281
|
+
}
|
|
244
282
|
|
|
245
|
-
|
|
283
|
+
// Process ZIP to extract terms
|
|
284
|
+
console.log('Processing TW articles...');
|
|
285
|
+
const termMap = await processZipBuffer(zipBuffer);
|
|
246
286
|
|
|
247
|
-
|
|
248
|
-
await cacheTerms(termMap);
|
|
287
|
+
console.log(`Generated ${Object.keys(termMap).length} terms from TW archive`);
|
|
249
288
|
|
|
250
|
-
|
|
289
|
+
// Cache processed terms for this session
|
|
290
|
+
processedTermsCache = termMap;
|
|
251
291
|
|
|
252
|
-
|
|
253
|
-
console.error('Error generating TW terms:', error);
|
|
254
|
-
throw error;
|
|
255
|
-
}
|
|
292
|
+
return termMap;
|
|
256
293
|
}
|
|
257
294
|
|
|
258
295
|
/**
|
|
259
|
-
* Clear cache - useful for forcing refresh
|
|
260
|
-
* @returns {Promise<boolean>} - true if cache was cleared successfully
|
|
296
|
+
* Clear cache - useful for forcing refresh
|
|
261
297
|
*/
|
|
262
298
|
export async function clearCache() {
|
|
263
299
|
// Clear in-memory cache
|
|
264
|
-
|
|
300
|
+
processedTermsCache = null;
|
|
265
301
|
|
|
266
302
|
if (isBrowser) {
|
|
267
|
-
// Clear browser storage
|
|
268
|
-
const storage = getBrowserStorage();
|
|
269
|
-
if (storage) {
|
|
270
|
-
try {
|
|
271
|
-
storage.removeItem(CACHE_KEY);
|
|
272
|
-
console.log('Browser cache cleared');
|
|
273
|
-
return true;
|
|
274
|
-
} catch (error) {
|
|
275
|
-
console.warn('Failed to clear browser cache:', error.message);
|
|
276
|
-
return false;
|
|
277
|
-
}
|
|
278
|
-
}
|
|
279
|
-
} else if (isNode) {
|
|
280
|
-
// Clear Node.js file cache
|
|
281
303
|
try {
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
const __filename = fileURLToPath(import.meta.url);
|
|
286
|
-
const __dirname = path.dirname(__filename);
|
|
287
|
-
const CACHE_FILE = path.join(__dirname, '../../article_terms.json');
|
|
288
|
-
|
|
289
|
-
if (fs.existsSync(CACHE_FILE)) {
|
|
290
|
-
fs.unlinkSync(CACHE_FILE);
|
|
291
|
-
console.log('File cache cleared');
|
|
292
|
-
return true;
|
|
293
|
-
}
|
|
294
|
-
}
|
|
304
|
+
localStorage.removeItem(CACHE_KEY);
|
|
305
|
+
console.log('Browser ZIP cache cleared');
|
|
306
|
+
return true;
|
|
295
307
|
} catch (error) {
|
|
296
|
-
console.warn('Failed to clear
|
|
308
|
+
console.warn('Failed to clear browser cache:', error.message);
|
|
297
309
|
return false;
|
|
298
310
|
}
|
|
299
311
|
}
|
|
@@ -303,45 +315,33 @@ export async function clearCache() {
|
|
|
303
315
|
}
|
|
304
316
|
|
|
305
317
|
/**
|
|
306
|
-
* Get cache information for debugging
|
|
307
|
-
* @returns {Object} - cache status and info
|
|
318
|
+
* Get cache information for debugging
|
|
308
319
|
*/
|
|
309
320
|
export function getCacheInfo() {
|
|
310
321
|
const info = {
|
|
311
322
|
environment: isNode ? 'Node.js' : (isBrowser ? 'Browser' : 'Unknown'),
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
timestamp: null,
|
|
317
|
-
termCount: 0
|
|
323
|
+
hasProcessedTerms: !!processedTermsCache,
|
|
324
|
+
hasZipCache: false,
|
|
325
|
+
termCount: 0,
|
|
326
|
+
cacheVersion: CACHE_VERSION
|
|
318
327
|
};
|
|
319
328
|
|
|
320
|
-
//
|
|
321
|
-
if (
|
|
322
|
-
info.termCount = Object.keys(
|
|
329
|
+
// Check processed terms
|
|
330
|
+
if (processedTermsCache) {
|
|
331
|
+
info.termCount = Object.keys(processedTermsCache).length;
|
|
323
332
|
}
|
|
324
333
|
|
|
334
|
+
// Check ZIP cache in browser
|
|
325
335
|
if (isBrowser) {
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
const data = JSON.parse(cached);
|
|
333
|
-
info.hasPersistentCache = true;
|
|
334
|
-
info.cacheType = storage === localStorage ? 'localStorage' : 'sessionStorage';
|
|
335
|
-
info.version = data.version;
|
|
336
|
-
info.timestamp = data.timestamp ? new Date(data.timestamp) : null;
|
|
337
|
-
|
|
338
|
-
if (!info.termCount && data.terms) {
|
|
339
|
-
info.termCount = Object.keys(data.terms).length;
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
} catch (error) {
|
|
343
|
-
// Ignore parse errors
|
|
336
|
+
try {
|
|
337
|
+
const cached = localStorage.getItem(CACHE_KEY);
|
|
338
|
+
if (cached) {
|
|
339
|
+
const data = JSON.parse(cached);
|
|
340
|
+
info.hasZipCache = true;
|
|
341
|
+
info.timestamp = data.timestamp ? new Date(data.timestamp) : null;
|
|
344
342
|
}
|
|
343
|
+
} catch (error) {
|
|
344
|
+
// Ignore parse errors
|
|
345
345
|
}
|
|
346
346
|
}
|
|
347
347
|
|