twl-generator 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +155 -0
- package/package.json +60 -0
- package/src/cli.js +86 -0
- package/src/common/books.js +180 -0
- package/src/index.js +31 -0
- package/src/utils/twl-matcher.js +395 -0
- package/src/utils/usfm-alignment-remover.js +104 -0
- package/src/utils/zipProcessor.js +329 -0
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Universal TWL zipProcessor - Works in both Node.js and Browser environments
|
|
3
|
+
*
|
|
4
|
+
* For Node.js (CLI): Uses file system caching with article_terms.json
|
|
5
|
+
* For React.js/Browser: Uses localStorage/sessionStorage for persistent caching
|
|
6
|
+
*
|
|
7
|
+
* Usage in React.js:
|
|
8
|
+
* import { generateTWTerms } from './utils/zipProcessor.js';
|
|
9
|
+
* const terms = await generateTWTerms('JHN');
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import AdmZip from 'adm-zip';
|
|
13
|
+
import { BibleBookData } from '../common/books.js';
|
|
14
|
+
|
|
15
|
+
// Runtime detection.
// `isNode` ends up as the Node.js version string under Node (falsy elsewhere);
// it is only ever used for its truthiness. `isBrowser` is true whenever a
// global `window` exists.
const isNode = typeof process === 'undefined' ? false : process.versions?.node;
const isBrowser = !(typeof window === 'undefined');

// Archive that generateTWTerms() downloads and scans for articles.
const ZIP_URL = 'https://git.door43.org/unfoldingWord/en_tw/archive/master.zip';

// Key and format version of the entry kept in browser storage. A stored entry
// whose version differs from CACHE_VERSION is discarded and regenerated.
const CACHE_KEY = 'twl_article_terms';
const CACHE_VERSION = '1.0';

// Term map kept in memory for the lifetime of this module instance.
let memoryCache = null;
|
|
25
|
+
|
|
26
|
+
/**
 * Resolve a `fetch` implementation for Node.js.
 *
 * `node-fetch` is preferred (as before). If it cannot be imported, the
 * runtime's global `fetch` is used when one exists.
 *
 * @returns {Promise<Function|undefined>} a fetch function, or undefined when none is available
 */
async function loadNodeFetch() {
  try {
    const nodeFetchModule = await import('node-fetch');
    return nodeFetchModule.default;
  } catch (error) {
    if (typeof globalThis.fetch === 'function') {
      return globalThis.fetch.bind(globalThis);
    }
    console.error('Failed to load node-fetch and no global fetch is available:', error);
    return undefined;
  }
}

/**
 * Get Node.js dependencies dynamically.
 *
 * Dynamic `import()` keeps `fs`, `path`, `url` and `node-fetch` out of the
 * module's static imports. The fetch implementation is resolved separately
 * from the built-in modules so that a missing `node-fetch` install no longer
 * makes this function return null (which also disabled file-system caching).
 *
 * @returns {Promise<{fetch: (Function|undefined), fs: object, path: object, fileURLToPath: Function}|null>}
 *   the loaded dependencies; null outside Node.js or when a built-in module
 *   fails to load. `fetch` is undefined when no implementation could be found.
 */
async function getNodeDeps() {
  if (!isNode) return null;

  try {
    const [fetchImpl, fsModule, pathModule, urlModule] = await Promise.all([
      loadNodeFetch(),
      import('fs'),
      import('path'),
      import('url'),
    ]);

    return {
      fetch: fetchImpl,
      fs: fsModule.default,
      path: pathModule.default,
      fileURLToPath: urlModule.fileURLToPath,
    };
  } catch (error) {
    console.error('Failed to load Node.js dependencies:', error);
    return null;
  }
}
|
|
51
|
+
|
|
52
|
+
/**
 * Get browser storage (localStorage, falling back to sessionStorage).
 *
 * Each store is probed on its own: merely reading `localStorage` can throw
 * (for example when storage access is blocked), and in the previous
 * `localStorage || sessionStorage` expression that exception skipped the
 * sessionStorage fallback entirely.
 *
 * @returns {Storage|null} the first accessible storage object, or null when
 *   not running in a browser or when neither store can be accessed
 */
function getBrowserStorage() {
  if (!isBrowser) return null;

  for (const storeName of ['localStorage', 'sessionStorage']) {
    try {
      const storage = globalThis[storeName];
      if (storage) return storage;
    } catch (e) {
      console.warn('Browser storage not available:', e.message);
    }
  }

  return null;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Check that a cached payload has the shape of a term map: a non-null,
 * non-array object.
 *
 * @param {*} value - parsed cache content
 * @returns {boolean} true when `value` can be used as a term map
 */
function isTermMapShape(value) {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Get cached terms from appropriate storage.
 *
 * Lookup order: the in-memory cache, then browser storage when running in a
 * browser, otherwise the `article_terms.json` file two directories above this
 * module when running under Node.js. A successful persistent hit also fills
 * the in-memory cache.
 *
 * Cached payloads are shape-checked before use. A malformed browser entry is
 * removed, and a malformed file is ignored, so the caller regenerates the
 * terms instead of receiving unusable data.
 *
 * @returns {Promise<Object|null>} the cached term map, or null when nothing usable is cached
 */
async function getCachedTerms() {
  // Check in-memory cache first (fastest)
  if (memoryCache) {
    console.log('Using in-memory cached article terms');
    return memoryCache;
  }

  if (isBrowser) {
    const storage = getBrowserStorage();
    if (!storage) return null;

    try {
      const cached = storage.getItem(CACHE_KEY);
      if (!cached) return null;

      const data = JSON.parse(cached);
      if (data.version !== CACHE_VERSION) {
        console.log('Browser cache version mismatch, regenerating...');
        storage.removeItem(CACHE_KEY);
        return null;
      }

      // A matching version with missing or malformed terms is treated as corruption.
      if (!isTermMapShape(data.terms)) {
        throw new Error('cached terms are missing or malformed');
      }

      console.log('Using browser cached article terms');
      memoryCache = data.terms;
      return data.terms;
    } catch (error) {
      console.log('Browser cache corrupted, regenerating...', error.message);
      try {
        storage.removeItem(CACHE_KEY);
      } catch (e) { /* ignore cleanup errors */ }
    }

    return null;
  }

  if (isNode) {
    try {
      const deps = await getNodeDeps();
      if (!deps) return null;

      const { fs, path, fileURLToPath } = deps;
      const moduleDir = path.dirname(fileURLToPath(import.meta.url));
      const cacheFile = path.join(moduleDir, '../../article_terms.json');

      if (fs.existsSync(cacheFile)) {
        const cachedData = JSON.parse(fs.readFileSync(cacheFile, 'utf8'));
        if (!isTermMapShape(cachedData)) {
          throw new Error('article_terms.json does not contain a term map');
        }

        console.log('Using cached article terms from article_terms.json');
        memoryCache = cachedData;
        return cachedData;
      }
    } catch (error) {
      console.log('File cache corrupted, regenerating...', error.message);
    }
  }

  return null;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Persist a freshly generated term map.
 *
 * The map is always kept in memory. In a browser it is additionally written
 * to browser storage, wrapped with the cache version and a timestamp. Under
 * Node.js it is written as pretty-printed JSON to `article_terms.json`, two
 * directories above this module. Persistence failures are reported with a
 * warning and never thrown.
 *
 * @param {Object} termMap - term map to cache
 * @returns {Promise<void>}
 */
async function cacheTerms(termMap) {
  memoryCache = termMap;

  if (isBrowser) {
    const storage = getBrowserStorage();
    if (!storage) return;

    try {
      const payload = JSON.stringify({
        version: CACHE_VERSION,
        timestamp: Date.now(),
        terms: termMap,
      });
      storage.setItem(CACHE_KEY, payload);
      console.log('Article terms cached in browser storage');
    } catch (error) {
      console.warn('Failed to cache in browser storage:', error.message);
    }
    return;
  }

  if (!isNode) return;

  try {
    const deps = await getNodeDeps();
    if (!deps) return;

    const { fs, path, fileURLToPath } = deps;
    const moduleDir = path.dirname(fileURLToPath(import.meta.url));
    const cacheFile = path.join(moduleDir, '../../article_terms.json');
    const serialized = JSON.stringify(termMap, null, 2);

    fs.writeFileSync(cacheFile, serialized, 'utf8');
    console.log('Article terms cached to article_terms.json');
  } catch (error) {
    console.warn('Failed to cache article terms to file:', error.message);
  }
}
|
|
166
|
+
|
|
167
|
+
/**
 * Extract the terms declared on the first line of an article.
 *
 * The first line is expected to be a heading holding a comma-separated list
 * of terms. A trailing parenthesised qualifier is dropped from each term,
 * e.g. "Joseph (OT)" -> "Joseph", "Mary (sister of Martha)" -> "Mary".
 *
 * @param {string} content - full text of one article
 * @returns {string[]} normalized terms in heading order (may contain repeats)
 */
function parseArticleTerms(content) {
  const [heading] = content.split('\n');
  return heading
    .replace(/^#/, '')
    .trim()
    .split(',')
    .map((term) => term.trim())
    .filter(Boolean)
    .map((term) => term.replace(/\s+\([^)]*\)$/, '').trim());
}

/**
 * Build (or load from cache) the map of terms to the articles that define them.
 *
 * `book` is only validated against `BibleBookData`; the returned map is built
 * from every matching article in the archive and does not depend on the book.
 *
 * Cached terms are returned when available. Otherwise the archive at ZIP_URL
 * is downloaded, every `en_tw/bible/<dir>/<name>.md` entry is scanned, and the
 * result is cached before being returned.
 *
 * @param {string} book - book identifier; must be a key of `BibleBookData`
 * @returns {Promise<Object<string, string[]>>} map from term to the sorted
 *   article paths (relative to `en_tw/bible/`) whose heading lists that term
 * @throws {Error} if `book` is unknown, no fetch implementation is available,
 *   the download fails, or the archive cannot be processed
 */
export async function generateTWTerms(book) {
  if (!BibleBookData[book]) throw new Error(`Unknown book: ${book}`);

  // Try to get cached terms first
  const cachedTerms = await getCachedTerms();
  if (cachedTerms) {
    return cachedTerms;
  }

  console.log('Downloading TW archive...');

  try {
    let fetchFn;
    if (isBrowser) {
      // A detached `window.fetch` throws "Illegal invocation" when called,
      // so keep it bound to `window`.
      if (typeof window.fetch === 'function') {
        fetchFn = window.fetch.bind(window);
      }
    } else if (isNode) {
      const deps = await getNodeDeps();
      fetchFn = deps?.fetch;
      if (!fetchFn) throw new Error('Failed to load Node.js dependencies');
    }
    if (!fetchFn) {
      throw new Error('No fetch implementation is available in this environment');
    }

    const res = await fetchFn(ZIP_URL);
    if (!res.ok) throw new Error(`Failed to download zip: ${res.status} ${res.statusText}`);

    const buffer = await res.arrayBuffer();
    // NOTE(review): `Buffer` is a Node.js global; browser builds presumably
    // rely on a bundler-provided polyfill - confirm.
    const zip = new AdmZip(Buffer.from(buffer));

    console.log('Processing TW articles...');

    const entries = zip.getEntries().filter((e) => e.entryName.match(/^en_tw\/bible\/.*\/.*\.md$/));
    entries.sort((a, b) => a.entryName.localeCompare(b.entryName));

    // Collect into a Map so terms that collide with Object.prototype members
    // (e.g. "constructor") are handled like any other key.
    const articlesByTerm = new Map();

    for (const entry of entries) {
      const articlePath = entry.entryName.replace('en_tw/bible/', '');
      const content = entry.getData().toString('utf8');

      // The Set keeps an article from being listed twice under one term when
      // several heading terms normalize to the same string.
      for (const term of new Set(parseArticleTerms(content))) {
        const articles = articlesByTerm.get(term);
        if (articles) {
          articles.push(articlePath);
        } else {
          articlesByTerm.set(term, [articlePath]);
        }
      }
    }

    // Sort article arrays for consistent output
    const termMap = Object.fromEntries(
      [...articlesByTerm].map(([term, articles]) => [term, articles.sort()]),
    );

    console.log(`Generated ${Object.keys(termMap).length} terms from TW archive`);

    // Cache the results
    await cacheTerms(termMap);

    return termMap;
  } catch (error) {
    console.error('Error generating TW terms:', error);
    throw error;
  }
}
|
|
237
|
+
|
|
238
|
+
/**
 * Remove the cached entry from browser storage.
 *
 * @returns {boolean|null} true when removed, false when removal failed,
 *   null when no storage is available
 */
function removeBrowserCacheEntry() {
  const storage = getBrowserStorage();
  if (!storage) return null;

  try {
    storage.removeItem(CACHE_KEY);
    console.log('Browser cache cleared');
    return true;
  } catch (error) {
    console.warn('Failed to clear browser cache:', error.message);
    return false;
  }
}

/**
 * Delete the `article_terms.json` cache file used under Node.js.
 *
 * @returns {Promise<boolean|null>} true when deleted, false when deletion
 *   failed, null when there was nothing to delete
 */
async function removeCacheFile() {
  try {
    const deps = await getNodeDeps();
    if (!deps) return null;

    const { fs, path, fileURLToPath } = deps;
    const moduleDir = path.dirname(fileURLToPath(import.meta.url));
    const cacheFile = path.join(moduleDir, '../../article_terms.json');
    if (!fs.existsSync(cacheFile)) return null;

    fs.unlinkSync(cacheFile);
    console.log('File cache cleared');
    return true;
  } catch (error) {
    console.warn('Failed to clear file cache:', error.message);
    return false;
  }
}

/**
 * Drop every cached copy of the term map so the next call regenerates it.
 *
 * The in-memory cache is always reset. The persistent cache for the current
 * environment (browser storage, or the Node.js cache file) is then removed
 * when one exists.
 *
 * @returns {Promise<boolean>} false only when removing the persistent cache
 *   failed; true otherwise
 */
export async function clearCache() {
  memoryCache = null;

  let outcome = null;
  if (isBrowser) {
    outcome = removeBrowserCacheEntry();
  } else if (isNode) {
    outcome = await removeCacheFile();
  }

  if (outcome !== null) return outcome;

  console.log('Memory cache cleared');
  return true;
}
|
|
284
|
+
|
|
285
|
+
/**
 * Describe the current cache state, for debugging.
 *
 * Persistent-cache details are only gathered from browser storage; the
 * Node.js cache file is not inspected here. `termCount` comes from the
 * in-memory cache when it has entries, otherwise from the stored browser
 * entry. Errors while reading or parsing the stored entry are ignored, and
 * the fields filled in up to that point are kept.
 *
 * @returns {Object} cache status: `environment`, `hasMemoryCache`,
 *   `hasPersistentCache`, `cacheType`, `version`, `timestamp`, `termCount`
 */
export function getCacheInfo() {
  let environment = 'Unknown';
  if (isNode) {
    environment = 'Node.js';
  } else if (isBrowser) {
    environment = 'Browser';
  }

  const info = {
    environment,
    hasMemoryCache: Boolean(memoryCache),
    hasPersistentCache: false,
    cacheType: null,
    version: null,
    timestamp: null,
    termCount: memoryCache ? Object.keys(memoryCache).length : 0,
  };

  const storage = isBrowser ? getBrowserStorage() : null;
  if (!storage) return info;

  try {
    const raw = storage.getItem(CACHE_KEY);
    if (raw) {
      const stored = JSON.parse(raw);

      // Keep this assignment order: a later property access may throw, and
      // the fields set before that point must still be reported.
      info.hasPersistentCache = true;
      if (storage === localStorage) {
        info.cacheType = 'localStorage';
      } else {
        info.cacheType = 'sessionStorage';
      }
      info.version = stored.version;
      info.timestamp = stored.timestamp ? new Date(stored.timestamp) : null;

      if (!info.termCount && stored.terms) {
        info.termCount = Object.keys(stored.terms).length;
      }
    }
  } catch (error) {
    // Ignore parse errors
  }

  return info;
}
|