twl-generator 1.3.0 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +279 -40
- package/package.json +1 -1
- package/src/index.js +5 -5
package/README.md
CHANGED
|
@@ -1,91 +1,330 @@
|
|
|
1
1
|
# TWL Generator
|
|
2
2
|
|
|
3
|
-
A Node.js library and CLI tool for generating Translation Word Links (TWL) TSV files from Door43 USFM data and Translation Words (TW) metadata.
|
|
3
|
+
A Node.js library and CLI tool for generating Translation Word Links (TWL) TSV files from Door43 USFM data and Translation Words (TW) metadata. This tool intelligently matches biblical terms with their corresponding Translation Words articles using Strong's numbers, morphological analysis, and contextual matching.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
7
|
-
### Global CLI
|
|
7
|
+
### Global CLI Installation
|
|
8
8
|
```bash
|
|
9
9
|
npm install -g twl-generator
|
|
10
10
|
```
|
|
11
11
|
|
|
12
|
-
### Library
|
|
12
|
+
### Library Installation
|
|
13
13
|
```bash
|
|
14
14
|
npm install twl-generator
|
|
15
15
|
```
|
|
16
16
|
|
|
17
17
|
## Usage
|
|
18
18
|
|
|
19
|
-
### Command Line
|
|
19
|
+
### Command Line Interface
|
|
20
|
+
|
|
20
21
|
Generate TWL for a specific book:
|
|
21
22
|
```bash
|
|
22
|
-
|
|
23
|
+
twl-generator --book rut
|
|
24
|
+
# Creates: rut.twl.tsv and rut.no-match.twl.tsv
|
|
23
25
|
```
|
|
24
26
|
|
|
25
27
|
Generate TWL for all books:
|
|
26
28
|
```bash
|
|
27
|
-
|
|
29
|
+
twl-generator --all --out-dir ./output
|
|
30
|
+
# Creates TWL files for all 66 biblical books
|
|
28
31
|
```
|
|
29
32
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
33
|
+
Specify custom output location:
|
|
34
|
+
```bash
|
|
35
|
+
twl-generator --book mat --out matthew.twl.tsv
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Enable advanced verb conjugation matching:
|
|
39
|
+
```bash
|
|
40
|
+
twl-generator --book jhn --use-compromise
|
|
41
|
+
# Uses compromise.js for better verb form detection
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
#### CLI Options
|
|
45
|
+
- `--book <code>`: Book code (e.g., gen, exo, mat, mrk, jhn, etc.)
|
|
46
|
+
- `--all`: Generate TWL files for all biblical books
|
|
47
|
+
- `--out <file>`: Specify output file path
|
|
48
|
+
- `--out-dir <dir>`: Output directory (for --all option)
|
|
49
|
+
- `--use-compromise`: Enable advanced morphological analysis using compromise.js
|
|
36
50
|
|
|
37
51
|
### Library Usage
|
|
52
|
+
|
|
53
|
+
#### Basic Usage
|
|
38
54
|
```javascript
|
|
39
55
|
import { generateTwlByBook } from 'twl-generator';
|
|
40
56
|
|
|
41
|
-
|
|
42
|
-
|
|
57
|
+
// Generate TWL for Ruth
|
|
58
|
+
const result = await generateTwlByBook('rut');
|
|
59
|
+
console.log(result.matchedTsv); // Main TWL output
|
|
60
|
+
console.log(result.noMatchTsv); // Unmatched entries for analysis
|
|
43
61
|
```
|
|
44
62
|
|
|
45
|
-
|
|
63
|
+
#### With Advanced Options
|
|
64
|
+
```javascript
|
|
65
|
+
import { generateTwlByBook } from 'twl-generator';
|
|
66
|
+
|
|
67
|
+
// Use advanced morphological analysis
|
|
68
|
+
const result = await generateTwlByBook('jhn', {
|
|
69
|
+
useCompromise: true // Enable compromise.js for better verb matching
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
// Save to files
|
|
73
|
+
import fs from 'fs/promises';
|
|
74
|
+
await fs.writeFile('john.twl.tsv', result.matchedTsv);
|
|
75
|
+
await fs.writeFile('john.no-match.tsv', result.noMatchTsv);
|
|
76
|
+
```
|
|
46
77
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
- **Browser Compatible**: Core library works in modern browsers
|
|
51
|
-
- **CLI Ready**: Global command-line tool for batch processing
|
|
78
|
+
#### Integration Example
|
|
79
|
+
```javascript
|
|
80
|
+
import { generateTwlByBook } from 'twl-generator';
|
|
52
81
|
|
|
53
|
-
|
|
82
|
+
async function processBibleBook(bookCode) {
|
|
83
|
+
try {
|
|
84
|
+
const { matchedTsv, noMatchTsv } = await generateTwlByBook(bookCode);
|
|
85
|
+
|
|
86
|
+
// Process the TSV data
|
|
87
|
+
const lines = matchedTsv.split('\n');
|
|
88
|
+
const header = lines[0];
|
|
89
|
+
const rows = lines.slice(1).filter(Boolean);
|
|
90
|
+
|
|
91
|
+
console.log(`Generated ${rows.length} TWL entries for ${bookCode.toUpperCase()}`);
|
|
92
|
+
|
|
93
|
+
// Further processing...
|
|
94
|
+
return { success: true, entries: rows.length };
|
|
95
|
+
} catch (error) {
|
|
96
|
+
console.error(`Failed to process ${bookCode}:`, error);
|
|
97
|
+
return { success: false, error: error.message };
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
```
|
|
54
101
|
|
|
55
|
-
|
|
102
|
+
## How It Works
|
|
56
103
|
|
|
57
|
-
|
|
58
|
-
2. **Case-insensitive word boundary**: Flexible case matching with boundaries
|
|
59
|
-
3. **Case-sensitive substring**: Exact substring matching
|
|
60
|
-
4. **Case-insensitive stripped forms**: Controlled morphological variants
|
|
104
|
+
The TWL Generator uses a sophisticated multi-stage process to create Translation Word Links:
|
|
61
105
|
|
|
62
|
-
|
|
106
|
+
### 1. **Data Sources**
|
|
107
|
+
- **Original Language USFM**: Hebrew (hbo_uhb) and Greek (el-x-koine_ugnt) texts from Door43
|
|
108
|
+
- **English Bible**: unfoldingWord Literal Text (en_ult) for context matching
|
|
109
|
+
- **Translation Words**: Local `tw_strongs_list.json` containing Strong's mappings and term definitions
|
|
110
|
+
- **Strong's Numbers**: Links between original language words and semantic concepts
|
|
63
111
|
|
|
64
|
-
|
|
65
|
-
- **Translation Words**: Local tw_strongs_list.json with Strong's mappings and term lists
|
|
66
|
-
- **English Bible**: Uses unfoldingWord/en_ult for GLQuote generation
|
|
112
|
+
### 2. **Processing Pipeline**
|
|
67
113
|
|
|
68
|
-
|
|
114
|
+
#### Stage 1: Extract Strong's Data
|
|
115
|
+
- Parses USFM `\w` tags to extract Strong's numbers from original language texts
|
|
116
|
+
- Builds initial TSV with Reference, Strong's ID, and surface words
|
|
117
|
+
- Handles multi-word phrases that share Strong's number sequences
|
|
69
118
|
|
|
70
|
-
|
|
71
|
-
-
|
|
119
|
+
#### Stage 2: Generate English Context
|
|
120
|
+
- Uses `tsv-quote-converters` to find corresponding English text (GLQuote) in ULT
|
|
121
|
+
- Adds GLQuote and GLOccurrence columns for contextual matching
|
|
122
|
+
- Converts to OrigWords/Occurrence format for processing
|
|
123
|
+
|
|
124
|
+
#### Stage 3: Intelligent Article Selection
|
|
125
|
+
For each Strong's number and its English context, the system:
|
|
126
|
+
|
|
127
|
+
1. **Prioritizes candidate articles** based on:
|
|
128
|
+
- Articles whose slug appears in the GLQuote text
|
|
129
|
+
- Article type preference: kt/ (key terms) → names/ → other/
|
|
130
|
+
- Alphabetical sorting within each category
|
|
131
|
+
|
|
132
|
+
2. **Performs 4-stage matching** (best match wins):
|
|
133
|
+
- **Stage 1**: Case-sensitive word boundary matching
|
|
134
|
+
- **Stage 2**: Case-insensitive word boundary matching
|
|
135
|
+
- **Stage 3**: Case-sensitive substring matching
|
|
136
|
+
- **Stage 4**: Case-insensitive morphological variants
|
|
137
|
+
|
|
138
|
+
3. **Morphological analysis** includes:
|
|
139
|
+
- Pluralization (dog → dogs, man → men)
|
|
140
|
+
- Verb conjugation (-ing, -ed forms)
|
|
141
|
+
- Irregular verb forms (go → went, see → saw)
|
|
142
|
+
- Optional advanced analysis with compromise.js
|
|
143
|
+
|
|
144
|
+
#### Stage 4: Quality Assurance
|
|
145
|
+
- Generates disambiguation info when multiple articles could match
|
|
146
|
+
- Marks entries as "Variant of" when morphological variants are used
|
|
147
|
+
- Creates separate files for matched and unmatched entries
|
|
148
|
+
- Provides detailed statistics and sample unmatched entries
|
|
149
|
+
|
|
150
|
+
### 3. **Output Format**
|
|
151
|
+
|
|
152
|
+
The generated TSV contains these columns:
|
|
153
|
+
|
|
154
|
+
| Column | Description |
|
|
155
|
+
|--------|-------------|
|
|
156
|
+
| Reference | Chapter:verse (e.g., "1:1") |
|
|
157
|
+
| ID | Random 4-character ID starting with a letter |
|
|
158
|
+
| Tags | "keyterm", "name", or empty based on article type |
|
|
159
|
+
| OrigWords | The matched word(s) from the text |
|
|
160
|
+
| Occurrence | Which occurrence of this word in the verse |
|
|
161
|
+
| TWLink | Link to Translation Words article (rc://*/tw/dict/bible/...) |
|
|
162
|
+
| GLQuote | English text context from ULT |
|
|
163
|
+
| GLOccurrence | Occurrence number in English context |
|
|
164
|
+
| Strongs | Original Strong's number |
|
|
165
|
+
| Variant of | Original term if morphological variant was used |
|
|
166
|
+
| Disambiguation | List of other possible articles |
|
|
167
|
+
|
|
168
|
+
### 4. **Matching Examples**
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
Reference OrigWords GLQuote TWLink Variant of
|
|
172
|
+
1:17 grace grace and truth rc://*/tw/dict/bible/kt/grace
|
|
173
|
+
1:17 gracious gracious God rc://*/tw/dict/bible/kt/grace grace
|
|
174
|
+
2:3 men wise men came rc://*/tw/dict/bible/other/man man
|
|
175
|
+
2:3 wisdom with great wisdom rc://*/tw/dict/bible/kt/wise wise
|
|
176
|
+
```
|
|
72
177
|
|
|
73
178
|
## Development
|
|
74
179
|
|
|
180
|
+
### Prerequisites
|
|
181
|
+
- Node.js 18+ (uses native fetch)
|
|
182
|
+
- Git access to Door43 repositories
|
|
183
|
+
|
|
184
|
+
### Setup
|
|
75
185
|
```bash
|
|
76
|
-
|
|
186
|
+
git clone https://github.com/unfoldingWord/node-twl-generator.git
|
|
187
|
+
cd node-twl-generator
|
|
77
188
|
npm install
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Testing
|
|
192
|
+
```bash
|
|
193
|
+
# Test single book generation
|
|
194
|
+
npm test
|
|
78
195
|
|
|
196
|
+
# Test specific book
|
|
197
|
+
npm run cli -- --book rut
|
|
198
|
+
|
|
199
|
+
# Test with advanced morphology
|
|
200
|
+
npm run cli -- --book jhn --use-compromise
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Local Development
|
|
204
|
+
```bash
|
|
79
205
|
# Run CLI locally
|
|
80
|
-
|
|
206
|
+
node src/cli.js --book gen --out test-output.tsv
|
|
207
|
+
|
|
208
|
+
# Test library integration
|
|
209
|
+
node -e "import('./src/index.js').then(m => m.generateTwlByBook('rut').then(console.log))"
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
### Project Structure
|
|
213
|
+
```
|
|
214
|
+
src/
|
|
215
|
+
├── cli.js # Command line interface
|
|
216
|
+
├── index.js # Main library exports
|
|
217
|
+
├── common/
|
|
218
|
+
│ └── books.js # Bible book metadata
|
|
219
|
+
└── utils/
|
|
220
|
+
├── twl-matcher.js # Term matching algorithms (legacy)
|
|
221
|
+
├── zipProcessor.js # TW archive processing (legacy)
|
|
222
|
+
└── usfm-alignment-remover.js # USFM parsing (legacy)
|
|
223
|
+
tw_strongs_list.json # Translation Words database
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Data Files
|
|
227
|
+
|
|
228
|
+
### `tw_strongs_list.json`
|
|
229
|
+
This file contains the core mapping between Strong's numbers and Translation Words articles:
|
|
230
|
+
|
|
231
|
+
```jsonc
|
|
232
|
+
{
|
|
233
|
+
"kt/god": {
|
|
234
|
+
"article": {
|
|
235
|
+
"terms": ["God", "god", "deity", "divine"]
|
|
236
|
+
},
|
|
237
|
+
"strongs": [
|
|
238
|
+
["H430"], // Single Strong's number
|
|
239
|
+
["H410"],
|
|
240
|
+
["G2316", "G2318"] // Multiple Strong's for compound concepts
|
|
241
|
+
]
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Contributing
|
|
247
|
+
|
|
248
|
+
We welcome contributions! Here's how you can help:
|
|
249
|
+
|
|
250
|
+
### Reporting Issues
|
|
251
|
+
- **Missing matches**: If legitimate biblical terms aren't being matched
|
|
252
|
+
- **False positives**: If non-terms are being incorrectly matched
|
|
253
|
+
- **Performance issues**: Slow processing or memory problems
|
|
254
|
+
- **Data quality**: Incorrect Strong's mappings or term definitions
|
|
81
255
|
|
|
82
|
-
|
|
83
|
-
|
|
256
|
+
### Enhancement Ideas
|
|
257
|
+
- **Better morphological analysis**: Improve verb conjugation and irregular forms
|
|
258
|
+
- **Multi-language support**: Extend beyond English GLQuotes
|
|
259
|
+
- **Contextual disambiguation**: Use surrounding words for better article selection
|
|
260
|
+
- **Performance optimization**: Faster processing for large corpora
|
|
84
261
|
|
|
85
|
-
|
|
86
|
-
|
|
262
|
+
### Development Workflow
|
|
263
|
+
1. Fork the repository
|
|
264
|
+
2. Create a feature branch: `git checkout -b feature-name`
|
|
265
|
+
3. Make your changes with tests
|
|
266
|
+
4. Run the test suite: `npm test`
|
|
267
|
+
5. Submit a pull request with a detailed description
|
|
268
|
+
|
|
269
|
+
### Testing Your Changes
|
|
270
|
+
```bash
|
|
271
|
+
# Test various scenarios
|
|
272
|
+
npm run cli -- --book psa --use-compromise # Large book with advanced features
|
|
273
|
+
npm run cli -- --book phm # Short book for quick testing
|
|
274
|
+
npm run cli -- --book rev # Symbolic language testing
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
## Browser Compatibility
|
|
278
|
+
|
|
279
|
+
While primarily designed for Node.js, core functionality works in modern browsers:
|
|
280
|
+
|
|
281
|
+
```javascript
|
|
282
|
+
// React/Browser usage example
|
|
283
|
+
import { generateTwlByBook } from 'twl-generator';
|
|
284
|
+
|
|
285
|
+
const MyComponent = () => {
|
|
286
|
+
const [tsvData, setTsvData] = useState(null);
|
|
287
|
+
|
|
288
|
+
const generateTWL = async () => {
|
|
289
|
+
try {
|
|
290
|
+
const result = await generateTwlByBook('mat');
|
|
291
|
+
setTsvData(result.matchedTsv);
|
|
292
|
+
} catch (error) {
|
|
293
|
+
console.error('TWL generation failed:', error);
|
|
294
|
+
}
|
|
295
|
+
};
|
|
296
|
+
|
|
297
|
+
return (
|
|
298
|
+
<div>
|
|
299
|
+
<button onClick={generateTWL}>Generate TWL for Matthew</button>
|
|
300
|
+
{tsvData && <pre>{tsvData}</pre>}
|
|
301
|
+
</div>
|
|
302
|
+
);
|
|
303
|
+
};
|
|
87
304
|
```
|
|
88
305
|
|
|
306
|
+
## Performance
|
|
307
|
+
|
|
308
|
+
Typical processing times:
|
|
309
|
+
- **Short books** (Philemon, 2-3 John): < 5 seconds
|
|
310
|
+
- **Medium books** (Ruth, Ephesians): 5-15 seconds
|
|
311
|
+
- **Large books** (Psalms, Matthew): 30-60 seconds
|
|
312
|
+
- **All books**: 15-30 minutes depending on network speed
|
|
313
|
+
|
|
314
|
+
Memory usage scales with book size, typically 50-200MB peak.
|
|
315
|
+
|
|
89
316
|
## License
|
|
90
317
|
|
|
91
|
-
MIT
|
|
318
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
319
|
+
|
|
320
|
+
## Support
|
|
321
|
+
|
|
322
|
+
- **Issues**: https://github.com/unfoldingWord/node-twl-generator/issues
|
|
323
|
+
- **Discussions**: https://github.com/unfoldingWord/node-twl-generator/discussions
|
|
324
|
+
- **Documentation**: https://github.com/unfoldingWord/node-twl-generator/wiki
|
|
325
|
+
|
|
326
|
+
## Related Projects
|
|
327
|
+
|
|
328
|
+
- [tsv-quote-converters](https://www.npmjs.com/package/tsv-quote-converters) - GLQuote generation
|
|
329
|
+
- [compromise](https://www.npmjs.com/package/compromise) - Advanced morphological analysis
|
|
330
|
+
- [Door43 Content](https://git.door43.org/unfoldingWord) - Source biblical texts and resources
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.3.0",
|
|
3
|
+
"version": "1.3.2",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
package/src/index.js
CHANGED
|
@@ -713,7 +713,7 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
713
713
|
};
|
|
714
714
|
|
|
715
715
|
// New header order: Reference, ID, Tags, OrigWords, Occurrence, TWLink, GLQuote, GLOccurrence, Strongs
|
|
716
|
-
const finalHeaderBase = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'Strongs', 'GLQuote', 'GLOccurrence'];
|
|
716
|
+
const finalHeaderBase = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Strongs'];
|
|
717
717
|
const usedIds = new Set();
|
|
718
718
|
const genId = () => {
|
|
719
719
|
const letters = 'abcdefghijklmnopqrstuvwxyz';
|
|
@@ -741,9 +741,9 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
741
741
|
c[A.OrigWords],
|
|
742
742
|
c[A.Occurrence],
|
|
743
743
|
c[A.TWLink],
|
|
744
|
-
strongsVal,
|
|
745
744
|
c[A.GLQuote],
|
|
746
745
|
c[A.GLOccurrence],
|
|
746
|
+
strongsVal,
|
|
747
747
|
];
|
|
748
748
|
preparedRows.push(newRow);
|
|
749
749
|
}
|
|
@@ -756,9 +756,9 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
756
756
|
OrigWords: 3,
|
|
757
757
|
Occurrence: 4,
|
|
758
758
|
TWLink: 5,
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
759
|
+
GLQuote: 6,
|
|
760
|
+
GLOccurrence: 7,
|
|
761
|
+
Strongs: 8,
|
|
762
762
|
};
|
|
763
763
|
|
|
764
764
|
// 5) pick best TWLink based on GLQuote terms using Strongs column; include Variant of column
|