twl-generator 1.2.15 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -67
- package/package.json +5 -2
- package/src/cli.js +72 -74
- package/src/index.js +807 -27
package/README.md
CHANGED
|
@@ -1,104 +1,90 @@
|
|
|
1
|
-
#
|
|
1
|
+
# TWL Generator
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
A Node.js library and CLI tool for generating Translation Word Links (TWL) TSV files from Door43 USFM data and Translation Words (TW) metadata.
|
|
4
4
|
|
|
5
|
-
##
|
|
6
|
-
|
|
7
|
-
- ✅ **Universal**: Works in Node.js and browser environments
|
|
8
|
-
- ✅ **Smart Caching**: File system (Node.js) or localStorage/sessionStorage (browser)
|
|
9
|
-
- ✅ **Performance**: Optimized matching with PrefixTrie algorithm
|
|
10
|
-
- ✅ **Case Sensitivity**: Proper God/god distinction (God→kt/god, god→kt/falsegod)
|
|
11
|
-
- ✅ **Morphological Variants**: Handles plurals, possessives, verb forms
|
|
12
|
-
- ✅ **Parentheses Normalization**: "Joseph (OT)" → "Joseph" for better coverage
|
|
13
|
-
|
|
14
|
-
---
|
|
15
|
-
|
|
16
|
-
## Usage
|
|
17
|
-
|
|
18
|
-
### CLI
|
|
19
|
-
|
|
20
|
-
Install globally:
|
|
5
|
+
## Installation
|
|
21
6
|
|
|
7
|
+
### Global CLI
|
|
22
8
|
```bash
|
|
23
9
|
npm install -g twl-generator
|
|
24
10
|
```
|
|
25
11
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
```bash
|
|
29
|
-
twl-generator --book rut
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
Generate a TWL TSV from a local USFM file:
|
|
33
|
-
|
|
12
|
+
### As a Library Dependency
|
|
34
13
|
```bash
|
|
35
|
-
twl-generator
|
|
14
|
+
npm install twl-generator
|
|
36
15
|
```
|
|
37
16
|
|
|
38
|
-
|
|
17
|
+
## Usage
|
|
39
18
|
|
|
19
|
+
### Command Line
|
|
20
|
+
Generate TWL for a specific book:
|
|
40
21
|
```bash
|
|
41
|
-
twl
|
|
22
|
+
generate-twl --book deu --out deuteronomy.twl.tsv
|
|
42
23
|
```
|
|
43
24
|
|
|
44
|
-
|
|
45
|
-
|
|
25
|
+
Generate TWL for all books:
|
|
46
26
|
```bash
|
|
47
|
-
twl
|
|
27
|
+
generate-twl --all --out-dir ./output
|
|
48
28
|
```
|
|
49
29
|
|
|
50
|
-
|
|
30
|
+
Options:
|
|
31
|
+
- `--book <code>`: Book code (e.g., gen, deu, mat, etc.)
|
|
32
|
+
- `--all`: Generate for all books
|
|
33
|
+
- `--out <file>`: Output file path
|
|
34
|
+
- `--out-dir <dir>`: Output directory for all books
|
|
35
|
+
- `--use-compromise`: Enable advanced verb conjugation matching
|
|
51
36
|
|
|
52
|
-
###
|
|
37
|
+
### Library Usage
|
|
38
|
+
```javascript
|
|
39
|
+
import { generateTwlByBook } from 'twl-generator';
|
|
53
40
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
```bash
|
|
57
|
-
npm install twl-generator
|
|
41
|
+
const { matchedTsv, noMatchTsv } = await generateTwlByBook('deu');
|
|
42
|
+
console.log(matchedTsv); // TSV string with matched Translation Word links
|
|
58
43
|
```
|
|
59
44
|
|
|
60
|
-
|
|
45
|
+
## Features
|
|
61
46
|
|
|
62
|
-
|
|
63
|
-
|
|
47
|
+
- **Smart Matching**: Multi-stage matching algorithm with word boundaries, case sensitivity, and morphological variants
|
|
48
|
+
- **Morphological Support**: Handles plurals, verb conjugations, and irregular forms
|
|
49
|
+
- **Variant Detection**: Identifies when terms are matched via substring or truncation
|
|
50
|
+
- **Browser Compatible**: Core library works in modern browsers
|
|
51
|
+
- **CLI Ready**: Global command-line tool for batch processing
|
|
64
52
|
|
|
65
|
-
|
|
66
|
-
const usfmContent = `
|
|
67
|
-
\\id MAT
|
|
68
|
-
\\c 1
|
|
69
|
-
\\v 1 In the beginning...
|
|
70
|
-
`;
|
|
53
|
+
## Matching Algorithm
|
|
71
54
|
|
|
72
|
-
|
|
55
|
+
The TWL generator uses a sophisticated 4-stage matching process:
|
|
73
56
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
57
|
+
1. **Case-sensitive word boundary**: Exact matches with word boundaries
|
|
58
|
+
2. **Case-insensitive word boundary**: Flexible case matching with boundaries
|
|
59
|
+
3. **Case-sensitive substring**: Exact substring matching
|
|
60
|
+
4. **Case-insensitive stripped forms**: Controlled morphological variants
|
|
78
61
|
|
|
79
|
-
|
|
62
|
+
## Data Sources
|
|
80
63
|
|
|
81
|
-
|
|
82
|
-
|
|
64
|
+
- **USFM**: Fetched from Door43 repositories (unfoldingWord/hbo_uhb, unfoldingWord/el-x-koine_ugnt)
|
|
65
|
+
- **Translation Words**: Local tw_strongs_list.json with Strong's mappings and term lists
|
|
66
|
+
- **English Bible**: Uses unfoldingWord/en_ult for GLQuote generation
|
|
83
67
|
|
|
84
|
-
|
|
68
|
+
## Output Format
|
|
85
69
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
console.log(tsv);
|
|
89
|
-
```
|
|
70
|
+
The generated TSV includes these columns:
|
|
71
|
+
- Reference, ID, Tags, OrigWords, Occurrence, TWLink, Strongs, GLQuote, GLOccurrence, Variant of, Disambiguation
|
|
90
72
|
|
|
91
|
-
|
|
73
|
+
## Development
|
|
92
74
|
|
|
93
|
-
|
|
75
|
+
```bash
|
|
76
|
+
# Install dependencies
|
|
77
|
+
npm install
|
|
94
78
|
|
|
95
|
-
|
|
79
|
+
# Run CLI locally
|
|
80
|
+
npm start -- --book gen
|
|
96
81
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
- **Returns:** `Promise<string>` — TSV string of TWL matches.
|
|
82
|
+
# Run browser demo
|
|
83
|
+
npm run styleguide
|
|
100
84
|
|
|
101
|
-
|
|
85
|
+
# Build for production
|
|
86
|
+
npm run styleguide:build
|
|
87
|
+
```
|
|
102
88
|
|
|
103
89
|
## License
|
|
104
90
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -46,7 +46,10 @@
|
|
|
46
46
|
"node": ">=18.0.0"
|
|
47
47
|
},
|
|
48
48
|
"dependencies": {
|
|
49
|
-
"
|
|
49
|
+
"csv-parse": "^5.5.6",
|
|
50
|
+
"csv-stringify": "^6.5.0",
|
|
51
|
+
"compromise": "^14.14.2",
|
|
52
|
+
"tsv-quote-converters": "^1.1.13"
|
|
50
53
|
},
|
|
51
54
|
"peerDependencies": {
|
|
52
55
|
"react": ">=16.8.0"
|
package/src/cli.js
CHANGED
|
@@ -1,86 +1,84 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
import
|
|
2
|
+
import fs from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { generateTwlByBook } from '../src/index.js';
import { BibleBookData } from '../src/common/books.js';
|
|
5
6
|
|
|
6
|
-
const
|
|
7
|
+
// Directory containing this script. fileURLToPath() decodes percent-escapes
// (e.g. spaces) and yields a valid drive-letter path on Windows, which the raw
// URL `pathname` does not.
const THIS_DIR = path.dirname(fileURLToPath(import.meta.url));
|
|
7
8
|
|
|
8
|
-
function
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
--book <book> Specify the Bible book (e.g., rut)
|
|
13
|
-
--usfm <path> Path to USFM file to process
|
|
14
|
-
--output <path> Path to output TSV file
|
|
15
|
-
--help Show this help message
|
|
16
|
-
|
|
17
|
-
Examples:
|
|
18
|
-
generate-twls --book rut
|
|
19
|
-
generate-twls --usfm ./41-MAT.usfm --output ./mat_twl.tsv
|
|
20
|
-
generate-twls --usfm ./file.usfm --book rut`);
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
let book = null;
|
|
24
|
-
let usfmPath = null;
|
|
25
|
-
let outputPath = null;
|
|
26
|
-
|
|
27
|
-
for (let i = 0; i < args.length; i++) {
|
|
28
|
-
if (args[i] === '--book' && args[i + 1]) {
|
|
29
|
-
book = args[i + 1].toLowerCase();
|
|
30
|
-
i++;
|
|
31
|
-
} else if (args[i] === '--usfm' && args[i + 1]) {
|
|
32
|
-
usfmPath = args[i + 1];
|
|
33
|
-
i++;
|
|
34
|
-
} else if (args[i] === '--output' && args[i + 1]) {
|
|
35
|
-
outputPath = args[i + 1];
|
|
36
|
-
i++;
|
|
37
|
-
} else if (args[i] === '--help') {
|
|
38
|
-
printHelp();
|
|
39
|
-
process.exit(0);
|
|
9
|
+
/**
 * Build a lookup of upper-cased book code -> { usfm, testament } from the
 * imported BibleBookData table.
 *
 * @returns {Promise<Object>} Plain object keyed by upper-cased book code.
 */
async function readBooksJs() {
  return Object.entries(BibleBookData).reduce((byCode, [bookCode, info]) => {
    byCode[bookCode.toUpperCase()] = { usfm: info.usfm, testament: info.testament };
    return byCode;
  }, {});
}
|
|
42
16
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
if (
|
|
51
|
-
|
|
52
|
-
|
|
17
|
+
/**
 * Parse the CLI argument vector.
 *
 * Recognized value flags: --book/-b, --out/-o, --out-dir/-O.
 * Recognized boolean flags: --all/-A, --use-compromise.
 * Unrecognized tokens are ignored.
 *
 * A value flag only consumes the following token when that token is not
 * itself a recognized flag, so `--book --all` no longer treats "--all" as the
 * book code; the value is left as '' instead.
 *
 * @param {string[]} argv - Full process argv (the first two entries are skipped).
 * @returns {{book: string, out: string, outDir: string, all: boolean, useCompromise: boolean}}
 */
function parseArgs(argv) {
  const args = { book: '', out: '', outDir: '', all: false, useCompromise: false };
  const valueFlags = new Map([
    ['--book', 'book'], ['-b', 'book'],
    ['--out', 'out'], ['-o', 'out'],
    ['--out-dir', 'outDir'], ['-O', 'outDir'],
  ]);
  const booleanFlags = new Map([
    ['--all', 'all'], ['-A', 'all'],
    ['--use-compromise', 'useCompromise'],
  ]);
  const isFlag = (token) => valueFlags.has(token) || booleanFlags.has(token);

  for (let i = 2; i < argv.length; i++) {
    const token = argv[i];
    if (valueFlags.has(token)) {
      const next = argv[i + 1];
      if (next !== undefined && !isFlag(next)) {
        args[valueFlags.get(token)] = next || '';
        i++; // the value token has been consumed
      } else {
        // Missing value: leave the option empty and let the next flag be parsed normally.
        args[valueFlags.get(token)] = '';
      }
    } else if (booleanFlags.has(token)) {
      args[booleanFlags.get(token)] = true;
    }
  }
  return args;
}
|
|
54
29
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
const
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
filename = 'output.tsv';
|
|
30
|
+
/**
 * CLI entry point.
 *
 * Modes:
 *  - `--all` (or `--book all`): generate matched and no-match TSVs for every
 *    book into `--out-dir` (default: the parent of this script's directory).
 *    A failing book is reported on stderr and the run continues; the process
 *    exit code is set to 1 if any book failed.
 *  - `--book <code> --out <file>`: write the matched TSV to <file> and the
 *    no-match TSV next to it.
 *  - `--book <code> --out-dir <dir>`: write `<code>.twl.tsv` and
 *    `<code>.no-match.twl.tsv` into <dir>.
 *  - `--book <code>` alone: print the matched TSV to stdout.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const { book, out, outDir, all, useCompromise } = parseArgs(process.argv);
  if (all || (book && book.toLowerCase() === 'all')) {
    const books = await readBooksJs();
    const codes = Object.keys(books);
    const destDir = outDir ? path.resolve(outDir) : path.resolve(THIS_DIR, '..'); // default to twl-generator dir
    await fs.mkdir(destDir, { recursive: true });
    console.error(`Generating TWL for ${codes.length} books to ${destDir} (useCompromise=${useCompromise})`);
    let failures = 0;
    for (const code of codes) {
      try {
        const { matchedTsv, noMatchTsv } = await generateTwlByBook(code, { useCompromise });
        const fname = `${code.toLowerCase()}.twl.tsv`;
        const outPath = path.join(destDir, fname);
        await fs.writeFile(outPath, matchedTsv, 'utf8');
        const nmPath = path.join(destDir, `${code.toLowerCase()}.no-match.twl.tsv`);
        await fs.writeFile(nmPath, noMatchTsv, 'utf8');
        console.error(` ✓ ${code} -> ${fname}`);
      } catch (err) {
        failures += 1;
        console.error(` ✗ ${code} failed:`, err.message || err);
      }
    }
    // Keep going after a failed book, but do not report overall success.
    if (failures > 0) process.exitCode = 1;
    return;
  }

  if (!book) {
    console.error('Usage: generate-twl --book <code>|all [--out <file.tsv> | --out-dir <dir>] [--use-compromise]');
    process.exit(1);
  }

  const { matchedTsv, noMatchTsv } = await generateTwlByBook(book, { useCompromise });
  if (out) {
    const outPath = path.resolve(out);
    await fs.writeFile(outPath, matchedTsv, 'utf8');
    console.log(`Wrote ${out}`);
    const nmPath = noMatchPathFor(outPath);
    await fs.writeFile(nmPath, noMatchTsv, 'utf8');
    console.log(`Wrote ${nmPath}`);
  } else if (outDir) {
    const destDir = path.resolve(outDir);
    await fs.mkdir(destDir, { recursive: true });
    const outPath = path.join(destDir, `${book.toLowerCase()}.twl.tsv`);
    await fs.writeFile(outPath, matchedTsv, 'utf8');
    const nmPath = path.join(destDir, `${book.toLowerCase()}.no-match.twl.tsv`);
    await fs.writeFile(nmPath, noMatchTsv, 'utf8');
    console.log(`Wrote ${outPath}`);
    console.log(`Wrote ${nmPath}`);
  } else {
    // When writing to stdout, output only the matched TSV to avoid mixing tables
    process.stdout.write(matchedTsv);
  }
}

// Derive the companion "no-match" file path for a user-supplied --out path.
// `x.twl.tsv` -> `x.no-match.twl.tsv` (unchanged behavior). Any other name gets
// `.no-match` inserted before its extension, so the result can never equal
// outPath and overwrite the matched TSV.
function noMatchPathFor(outPath) {
  const dir = path.dirname(outPath);
  const base = path.basename(outPath);
  if (/\.twl\.tsv$/i.test(base)) {
    return path.join(dir, base.replace(/\.twl\.tsv$/i, '.no-match.twl.tsv'));
  }
  const ext = path.extname(base);
  const stem = base.slice(0, base.length - ext.length);
  return path.join(dir, `${stem}.no-match${ext}`);
}
|
|
83
|
+
|
|
84
|
+
main().catch(err => { console.error(err); process.exit(1); });
|
package/src/index.js
CHANGED
|
@@ -1,32 +1,812 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
const
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
if (
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
1
|
+
import { BibleBookData } from './common/books.js';
|
|
2
|
+
|
|
3
|
+
// URL of tw_strongs_list.json, resolved one directory above this module.
const TW_JSON_URL = new URL('../tw_strongs_list.json', import.meta.url);

// True when a global `window` exists; used to pick browser vs. Node.js code paths.
const isBrowser = !(typeof window === 'undefined');
|
|
5
|
+
|
|
6
|
+
/**
 * Produce a CODE -> { usfm, testament } lookup from the imported BibleBookData
 * table, with every book code upper-cased.
 *
 * @returns {Promise<Object>} Plain object keyed by upper-cased book code.
 */
async function readBooks() {
  const byCode = {};
  Object.entries(BibleBookData).forEach(([bookCode, info]) => {
    byCode[bookCode.toUpperCase()] = { usfm: info.usfm, testament: info.testament };
  });
  return byCode;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Look up a book's metadata by code, ignoring case.
 *
 * @param {Object} bookMap - Map of book code -> { usfm, testament }.
 * @param {string} code - Book code to find (any letter case).
 * @returns {Object|null} `{ key, ...meta }` where `key` is the matching key as
 *   stored in bookMap, or null when the code is not a string, is unknown, or
 *   the entry lacks `usfm` or `testament`.
 */
function findBookMeta(bookMap, code) {
  // A missing or non-string code is treated as "not found" instead of throwing.
  if (!bookMap || typeof code !== 'string') return null;
  const wanted = code.toLowerCase(); // computed once, not once per key
  const key = Object.keys(bookMap).find((k) => k.toLowerCase() === wanted);
  if (!key) return null;
  const meta = bookMap[key];
  if (!meta || !meta.usfm || !meta.testament) return null;
  return { key, ...meta };
}
|
|
22
|
+
|
|
23
|
+
/**
 * Download a book's USFM text through the git.door43.org contents API.
 *
 * The `hbo_uhb` repo is used when `testament === 'old'`, otherwise
 * `el-x-koine_ugnt`. The response JSON's `content` field is decoded from
 * base64 as UTF-8.
 *
 * @param {string} usfmCode - File stem, used as `${usfmCode}.usfm`.
 * @param {string} testament - 'old' selects the Hebrew repo; anything else the Greek repo.
 * @returns {Promise<string>} Decoded USFM text.
 * @throws {Error} When the HTTP response is not ok, or the payload has no content.
 */
async function fetchUsfm(usfmCode, testament) {
  const repo = testament === 'old' ? 'hbo_uhb' : 'el-x-koine_ugnt';
  const url = `https://git.door43.org/api/v1/repos/unfoldingWord/${repo}/contents/${usfmCode}.usfm`;
  const res = await fetch(url);
  if (!res.ok) throw new Error(`Failed to fetch USFM: ${res.status} ${res.statusText}`);
  const json = await res.json();
  const b64 = json && typeof json.content === 'string' ? json.content : '';
  // An absent or empty payload would otherwise decode to '' and produce a
  // header-only TSV that looks like a successful run.
  if (!b64) throw new Error(`Failed to fetch USFM: no content returned for ${usfmCode}.usfm`);

  if (isBrowser) {
    // Browser: use atob and TextDecoder
    const binary = atob(b64);
    const bytes = Uint8Array.from(binary, (c) => c.charCodeAt(0));
    return new TextDecoder('utf-8').decode(bytes);
  }
  // Node.js: use Buffer
  const { Buffer } = await import('node:buffer');
  return Buffer.from(b64, 'base64').toString('utf8');
}
|
|
44
|
+
|
|
45
|
+
/**
 * Invert the article -> strongs listing into Strong's-keyed lookups.
 *
 * The returned plain object maps every Strong's id seen (as written, plus its
 * base form without a trailing a-f letter) to the articles that list it, in
 * first-seen order. An extra `__seqFirst` property maps the base form of the
 * first id of each multi-id sequence to `{ article, seqBase, len }` records,
 * longest sequence first.
 *
 * @param {Object} twMap - article -> entry; `entry.strongs` is read as an array of id arrays.
 * @returns {Object} strong -> string[] of articles, plus `__seqFirst`.
 */
function pivotByStrong(twMap) {
  const articlesBySid = new Map();
  const sequencesByFirst = new Map();

  // "H1234a" -> "H1234"; anything that is not an H/G id yields ''.
  const baseOf = (sid) => {
    const parsed = String(sid || '').match(/^([HG])(\d+)([a-f])?$/i);
    return parsed ? `${parsed[1].toUpperCase()}${parsed[2]}` : '';
  };

  const register = (sid, article) => {
    if (!sid) return;
    let bucket = articlesBySid.get(sid);
    if (!bucket) {
      bucket = new Set();
      articlesBySid.set(sid, bucket);
    }
    bucket.add(article);
  };

  Object.entries(twMap).forEach(([article, entry]) => {
    const sequences = entry && Array.isArray(entry.strongs) ? entry.strongs : [];
    sequences.forEach((rawSeq) => {
      const seq = Array.isArray(rawSeq) ? rawSeq.filter(Boolean) : [];
      if (seq.length === 0) return;

      // Every id in the sequence, and its base form, points back at the article.
      seq.forEach((sid) => {
        register(sid, article);
        register(baseOf(sid), article);
      });

      // Multi-id sequences are additionally indexed by the base of their first id.
      if (seq.length <= 1) return;
      const firstBase = baseOf(seq[0]);
      if (!firstBase) return;
      if (!sequencesByFirst.has(firstBase)) sequencesByFirst.set(firstBase, []);
      sequencesByFirst.get(firstBase).push({ article, seqBase: seq.map(baseOf), len: seq.length });
    });
  });

  // Flatten the Maps/Sets into plain objects and arrays.
  const pivot = {};
  articlesBySid.forEach((articles, sid) => {
    pivot[sid] = Array.from(articles);
  });
  const seqFirst = {};
  sequencesByFirst.forEach((records, firstBase) => {
    seqFirst[firstBase] = records.slice().sort((x, y) => y.len - x.len);
  });
  // The sequence index rides along on the same object under `__seqFirst`.
  pivot.__seqFirst = seqFirst;
  return pivot;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Extract `\w surface|attrs\w*` word tokens from USFM text, tagging each with
 * the chapter and verse markers (`\c N`, `\v N`) most recently seen before it.
 *
 * The verse number is reset to 0 whenever a new chapter marker is read, so a
 * token that appears between `\c` and the chapter's first `\v` is reported
 * with `v: 0` rather than with the previous chapter's last verse number.
 *
 * @param {string} usfm - Raw USFM text.
 * @returns {Array<{c: number, v: number, surface: string, attrs: string}>}
 *   Tokens in document order; `c`/`v` are 0 until the first marker is seen.
 */
function parseWTokens(usfm) {
  const out = [];
  let curC = 0;
  let curV = 0;
  // One pass over the text: chapter marker | verse marker | word token.
  const re = /(\\c\s+(\d+))|(\\v\s+(\d+))|\\w\s+([^|\s][^|]*?)\|([^\\]*?)\\w\*/g;
  for (const m of String(usfm || '').matchAll(re)) {
    if (m[2]) {
      curC = Number.parseInt(m[2], 10);
      curV = 0; // verse numbering restarts with each chapter
      continue;
    }
    if (m[4]) {
      curV = Number.parseInt(m[4], 10);
      continue;
    }
    if (m[5]) {
      out.push({ c: curC, v: curV, surface: m[5], attrs: m[6] || '' });
    }
  }
  return out;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Pull normalized Strong's ids out of a `\w` attribute string.
 *
 * Reads the first `strong="..."` (or `x-strong="..."`) attribute, splits its
 * value on whitespace and `|`, keeps the text after the last `:` of each part,
 * and accepts parts shaped like H/G + digits + optional a-f letter. The prefix
 * letter is upper-cased and the suffix letter lower-cased.
 *
 * @param {string} attrText - Attribute text of one word token.
 * @returns {string[]} Ids in attribute order; [] when the input is not a
 *   non-empty string or has no strong attribute.
 */
function extractStrongIds(attrText) {
  // Tolerate tokens without attribute text instead of throwing on .match().
  if (typeof attrText !== 'string' || attrText === '') return [];
  const sm = attrText.match(/(?:x-)?strong="([^"]+)"/);
  if (!sm) return [];
  const out = [];
  for (const part of sm[1].split(/[\s|]+/)) {
    // Drop any "prefix:" segments; only the text after the last colon is the id.
    const core = part.split(':').pop().trim();
    const m = core.match(/^([HG])(\d+)([a-f]?)$/i);
    if (!m) continue;
    out.push(`${m[1].toUpperCase()}${m[2]}${(m[3] || '').toLowerCase()}`);
  }
  return out;
}
|
|
28
122
|
|
|
29
|
-
|
|
30
|
-
const
|
|
123
|
+
/**
 * Produce the first-pass TWL table (as TSV text) for one book.
 *
 * Word tokens are walked in order. At each token the longest multi-Strong's
 * sequence from `strongPivot.__seqFirst` that matches consecutive tokens of
 * the same verse is preferred and emitted as one row covering the whole
 * phrase. Otherwise one row is emitted per Strong's id of the token that has
 * at least one article, linking to the first listed article.
 *
 * @param {string} usfm - Raw USFM text of the book.
 * @param {Object} strongPivot - strong -> articles lookup with optional `__seqFirst`.
 * @param {string} bookCode - Not used by this function.
 * @returns {string} TSV with header Reference, ID, Tags, OrigWords, Occurrence, TWLink.
 */
function buildInitialTsv(usfm, strongPivot, bookCode) {
  const columns = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink'];

  // "H1234a" -> "H1234"; '' for anything that is not an H/G id.
  const baseOf = (sid) => {
    const parsed = String(sid || '').match(/^([HG])(\d+)([a-f])?$/i);
    return parsed ? `${parsed[1].toUpperCase()}${parsed[2]}` : '';
  };
  // Articles for an id, falling back to the un-suffixed id when none are listed.
  const articlesFor = (sid) => {
    const direct = strongPivot[sid];
    if (direct && direct.length) return direct;
    return /^(H|G)\d+[a-f]$/.test(sid) ? strongPivot[sid.slice(0, -1)] : direct;
  };
  const tagFor = (article) => {
    if (article.startsWith('kt/')) return 'kt';
    return article.startsWith('names/') ? 'names' : '';
  };
  const carriesBase = (tok, base) => Boolean(base) && (tok.sids || []).some((s) => baseOf(s) === base);
  // Increment and return the running count of `text` within one verse.
  const bump = (counts, text) => {
    const next = (counts.get(text) || 0) + 1;
    counts.set(text, next);
    return next;
  };

  const tokens = parseWTokens(usfm).map((tok) => ({ ...tok, sids: extractStrongIds(tok.attrs) }));
  const sequencesByFirst = strongPivot.__seqFirst || {};
  const countsByVerse = new Map(); // `${c}:${v}` -> Map(phrase -> count)
  const rows = [];

  // True when every id of the candidate lines up with consecutive tokens of the anchor's verse.
  const sequenceFits = (cand, start, anchor) => cand.seqBase.every((base, offset) => {
    const tok = tokens[start + offset];
    return Boolean(tok) && tok.c === anchor.c && tok.v === anchor.v && carriesBase(tok, base);
  });

  let idx = 0;
  while (idx < tokens.length) {
    const tok = tokens[idx];
    // Tokens seen before any chapter/verse marker carry 0 and are skipped.
    if (!tok.c || !tok.v) {
      idx += 1;
      continue;
    }
    const ref = `${tok.c}:${tok.v}`;
    let counts = countsByVerse.get(ref);
    if (!counts) {
      counts = new Map();
      countsByVerse.set(ref, counts);
    }

    // Longest matching sequence starting here; on equal length the first one found wins.
    let longest = null;
    for (const firstBase of (tok.sids || []).map(baseOf).filter(Boolean)) {
      for (const cand of sequencesByFirst[firstBase] || []) {
        if (sequenceFits(cand, idx, tok) && (!longest || cand.len > longest.len)) {
          longest = { ...cand };
        }
      }
    }

    if (longest) {
      const span = longest.len;
      const phrase = tokens.slice(idx, idx + span).map((x) => x.surface.trim()).join(' ');
      const occurrence = bump(counts, phrase);
      // The row ID is the token's own first id when present, else the sequence's first base id.
      const id = (tok.sids && tok.sids[0]) ? tok.sids[0] : longest.seqBase[0];
      const article = longest.article;
      rows.push([ref, id, tagFor(article), phrase, String(occurrence), `rc://*/tw/dict/bible/${article}`]);
      idx += span; // the whole phrase has been consumed
      continue;
    }

    // Single-token path: the surface is counted even when the token has no ids.
    const surface = tok.surface.trim();
    const occurrence = bump(counts, surface);
    for (const sid of tok.sids || []) {
      const arts = articlesFor(sid);
      if (!arts || !arts.length) continue;
      const primary = arts[0];
      rows.push([ref, sid, tagFor(primary), surface, String(occurrence), `rc://*/tw/dict/bible/${primary}`]);
    }
    idx += 1;
  }

  const lines = [columns.join('\t')];
  for (const row of rows) lines.push(row.join('\t'));
  const tsv = lines.join('\n');
  return tsv;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Load the tw_strongs_list.json data.
 *
 * When `isBrowser` is false the file at TW_JSON_URL is read from disk and
 * parsed; otherwise it is fetched from `/tw_strongs_list.json`.
 *
 * @returns {Promise<Object>} Parsed JSON content.
 * @throws {Error} In the browser path, when the HTTP response is not ok.
 */
async function loadTwJsonLocal() {
  if (!isBrowser) {
    // Node.js: resolve the module-relative URL to a path and read it.
    const { readFile } = await import('node:fs/promises');
    const { fileURLToPath } = await import('node:url');
    const text = await readFile(fileURLToPath(TW_JSON_URL), 'utf8');
    return JSON.parse(text);
  }

  // Browser: the JSON is expected at the site root.
  const response = await fetch('/tw_strongs_list.json');
  if (!response.ok) {
    throw new Error(`Failed to fetch tw_strongs_list.json: ${response.status}`);
  }
  const parsed = await response.json();
  return parsed;
}
|
|
229
|
+
|
|
230
|
+
/**
 * Build, for every article, its cleaned list of terms.
 *
 * Each term has a trailing parenthetical note removed and whitespace
 * collapsed; empty results are dropped; duplicates (compared in lower case)
 * keep only their first occurrence. Terms are then ordered longest first,
 * with equal lengths kept in their de-duplicated order.
 *
 * @param {Object} twMap - article -> entry; terms are read from `entry.article.terms`.
 * @returns {Map<string, Array<{orig: string, lower: string, ord: number}>>}
 *   `ord` is the term's position after de-duplication, before sorting.
 */
function buildArticleTermMap(twMap) {
  const cleanTerm = (raw) => String(raw || '')
    .replace(/\s*\([^)]*\)\s*$/, '')
    .replace(/\s+/g, ' ')
    .trim();
  const longestFirst = (x, y) => (y.orig.length - x.orig.length) || (x.ord - y.ord);

  const termsByArticle = new Map();
  for (const [article, entry] of Object.entries(twMap)) {
    const meta = entry && entry.article;
    const rawTerms = (meta && typeof meta === 'object' && Array.isArray(meta.terms)) ? meta.terms : [];

    const seenLower = new Set();
    const unique = [];
    for (const raw of rawTerms) {
      const orig = cleanTerm(raw);
      const lower = orig.toLowerCase();
      if (!orig || seenLower.has(lower)) continue;
      seenLower.add(lower);
      unique.push({ orig, lower, ord: unique.length });
    }

    termsByArticle.set(article, unique.sort(longestFirst));
  }
  return termsByArticle;
}
|
|
261
|
+
|
|
262
|
+
// Build prioritized candidate list for a given strongId and GLQuote
|
|
263
|
+
/**
 * Order the candidate articles of a Strong's id for a given GLQuote.
 *
 * Candidates come from `strongPivot[strongId]`, or from the id without its
 * trailing a-f letter when that list is empty. Articles whose slug (the text
 * after the last '/') occurs in the lower-cased quote come first, longer slugs
 * first. The remainder follows grouped kt/, then names/, then everything
 * else, each group ordered by slug using localeCompare.
 *
 * @param {string} glq - Gateway-language quote text.
 * @param {string} strongId - Strong's id to look up.
 * @param {Object} strongPivot - strong -> articles lookup.
 * @returns {string[]} New array of article paths; [] when there are no candidates.
 */
function prioritizeArticles(glq, strongId, strongPivot) {
  const slugOf = (art) => (art.includes('/') ? art.split('/').pop() : art).toLowerCase();
  const copyOf = (id) => (strongPivot[id] || []).slice();
  const groupOf = (art) => {
    if (art.startsWith('kt/')) return 0;
    return art.startsWith('names/') ? 1 : 2;
  };

  let pool = copyOf(strongId);
  if (!pool.length && /^(H|G)\d+[a-f]$/.test(strongId)) {
    pool = copyOf(strongId.slice(0, -1));
  }
  if (!pool.length) return [];

  // Split the pool by whether the article slug is mentioned in the quote.
  const haystack = String(glq || '').toLowerCase();
  const mentioned = [];
  const unmentioned = [];
  for (const art of pool) {
    (haystack.includes(slugOf(art)) ? mentioned : unmentioned).push(art);
  }

  mentioned.sort((x, y) => slugOf(y).length - slugOf(x).length);
  unmentioned.sort((x, y) => {
    const byGroup = groupOf(x) - groupOf(y);
    return byGroup !== 0 ? byGroup : slugOf(x).localeCompare(slugOf(y));
  });

  return [...mentioned, ...unmentioned];
}
|
|
291
|
+
|
|
292
|
+
function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, opts = {}) {
|
|
293
|
+
const useCompromise = !!opts.useCompromise;
|
|
294
|
+
const nlp = opts.nlp;
|
|
295
|
+
const prioritized = prioritizeArticles(glq, strongId, strongPivot);
|
|
296
|
+
if (!prioritized.length) return null;
|
|
297
|
+
const textOrig = String(glq || '');
|
|
298
|
+
const textLower = textOrig.toLowerCase();
|
|
299
|
+
const escapeRegExp = s => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
300
|
+
|
|
301
|
+
// Utility: split a term into head (all but last word) and last word.
|
|
302
|
+
// head has no trailing space, last has no leading space. Rejoin with (head ? head+" " : "") + last
|
|
303
|
+
const splitHeadLast = (term) => {
|
|
304
|
+
const parts = String(term || '').trim().split(/\s+/);
|
|
305
|
+
if (parts.length <= 1) return { head: '', last: parts[0] || '' };
|
|
306
|
+
const last = parts.pop();
|
|
307
|
+
return { head: parts.join(' '), last };
|
|
308
|
+
};
|
|
309
|
+
|
|
310
|
+
// Basic pluralization helper for English terms. Handles common endings and a few irregulars.
|
|
311
|
+
const pluralizeTerm = (term) => {
|
|
312
|
+
const out = new Set();
|
|
313
|
+
const add = (s) => { const v = s.trim(); if (v) out.add(v); };
|
|
314
|
+
const irregular = {
|
|
315
|
+
man: 'men', woman: 'women', person: 'people', child: 'children',
|
|
316
|
+
foot: 'feet', tooth: 'teeth', goose: 'geese', mouse: 'mice', ox: 'oxen',
|
|
317
|
+
};
|
|
318
|
+
const pluralizeWord = (w) => {
|
|
319
|
+
const lw = w.toLowerCase();
|
|
320
|
+
if (irregular[lw]) return irregular[lw];
|
|
321
|
+
// endings
|
|
322
|
+
if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ies');
|
|
323
|
+
if (/(s|x|z|ch|sh)$/i.test(w)) return w + 'es';
|
|
324
|
+
if (/f$/i.test(w) && !/(roof|belief|chief|proof)$/i.test(w)) return w.replace(/f$/i, 'ves');
|
|
325
|
+
if (/fe$/i.test(w)) return w.replace(/fe$/i, 'ves');
|
|
326
|
+
if (/o$/i.test(w)) return w + 'es';
|
|
327
|
+
return w + 's';
|
|
328
|
+
};
|
|
329
|
+
const parts = term.split(/\s+/);
|
|
330
|
+
if (parts.length === 1) {
|
|
331
|
+
add(pluralizeWord(term));
|
|
332
|
+
} else {
|
|
333
|
+
const last = parts.pop();
|
|
334
|
+
const pl = pluralizeWord(last);
|
|
335
|
+
add([...parts, pl].join(' '));
|
|
336
|
+
}
|
|
337
|
+
// also the simple +s as fallback
|
|
338
|
+
add(term + 's');
|
|
339
|
+
return Array.from(out);
|
|
340
|
+
};
|
|
341
|
+
|
|
342
|
+
// Helpers to form -ing and -ed variants for a single word
|
|
343
|
+
const isVowel = (ch) => /[aeiou]/i.test(ch);
|
|
344
|
+
const isConsonant = (ch) => /[a-z]/i.test(ch) && !isVowel(ch);
|
|
345
|
+
const endsWithCVC = (w) => {
|
|
346
|
+
if (w.length < 3) return false;
|
|
347
|
+
const a = w[w.length - 3], b = w[w.length - 2], c = w[w.length - 1];
|
|
348
|
+
if (!isConsonant(a) || !isVowel(b) || !isConsonant(c)) return false;
|
|
349
|
+
// don't double for w, x, y
|
|
350
|
+
if (/[wxy]/i.test(c)) return false;
|
|
351
|
+
return true;
|
|
352
|
+
};
|
|
353
|
+
const presentParticipleWord = (w) => {
|
|
354
|
+
if (/ie$/i.test(w)) return w.replace(/ie$/i, 'ying'); // tie -> tying
|
|
355
|
+
if (/ee$/i.test(w)) return w + 'ing'; // see -> seeing
|
|
356
|
+
if (/e$/i.test(w)) return w.replace(/e$/i, 'ing'); // make -> making
|
|
357
|
+
if (endsWithCVC(w)) return w + w[w.length - 1] + 'ing'; // run -> running
|
|
358
|
+
return w + 'ing';
|
|
359
|
+
};
|
|
360
|
+
const pastTenseWord = (w) => {
|
|
361
|
+
if (/e$/i.test(w)) return w + 'd'; // move -> moved
|
|
362
|
+
if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ied'); // carry -> carried
|
|
363
|
+
if (endsWithCVC(w)) return w + w[w.length - 1] + 'ed'; // stop -> stopped
|
|
364
|
+
return w + 'ed';
|
|
365
|
+
};
|
|
366
|
+
const ingEdFormsForTerm = (term) => {
|
|
367
|
+
const forms = new Set();
|
|
368
|
+
const parts = term.split(/\s+/);
|
|
369
|
+
if (parts.length === 1) {
|
|
370
|
+
forms.add(presentParticipleWord(term));
|
|
371
|
+
forms.add(pastTenseWord(term));
|
|
372
|
+
} else {
|
|
373
|
+
const last = parts.pop();
|
|
374
|
+
const base = parts.join(' ');
|
|
375
|
+
forms.add((base ? base + ' ' : '') + presentParticipleWord(last));
|
|
376
|
+
forms.add((base ? base + ' ' : '') + pastTenseWord(last));
|
|
377
|
+
}
|
|
378
|
+
return Array.from(forms);
|
|
379
|
+
};
|
|
380
|
+
|
|
381
|
+
// Irregular verb support: small curated map plus reverse lookup
|
|
382
|
+
const irregularVerbMap = {
|
|
383
|
+
be: ['am', 'is', 'are', 'was', 'were', 'been', 'being', 'be'],
|
|
384
|
+
do: ['did', 'done', 'doing', 'does'],
|
|
385
|
+
go: ['went', 'gone', 'going', 'goes'],
|
|
386
|
+
have: ['had', 'having', 'has'],
|
|
387
|
+
say: ['said', 'saying', 'says'],
|
|
388
|
+
see: ['saw', 'seen', 'seeing', 'sees'],
|
|
389
|
+
get: ['got', 'gotten', 'getting', 'gets'],
|
|
390
|
+
make: ['made', 'making', 'makes'],
|
|
391
|
+
take: ['took', 'taken', 'taking', 'takes'],
|
|
392
|
+
come: ['came', 'coming', 'comes'],
|
|
393
|
+
know: ['knew', 'known', 'knowing', 'knows'],
|
|
394
|
+
give: ['gave', 'given', 'giving', 'gives'],
|
|
395
|
+
find: ['found', 'finding', 'finds'],
|
|
396
|
+
think: ['thought', 'thinking', 'thinks'],
|
|
397
|
+
tell: ['told', 'telling', 'tells'],
|
|
398
|
+
become: ['became', 'become', 'becoming', 'becomes'],
|
|
399
|
+
show: ['showed', 'shown', 'showing', 'shows'],
|
|
400
|
+
leave: ['left', 'leaving', 'leaves'],
|
|
401
|
+
feel: ['felt', 'feeling', 'feels'],
|
|
402
|
+
put: ['put', 'putting', 'puts'],
|
|
403
|
+
bring: ['brought', 'bringing', 'brings'],
|
|
404
|
+
begin: ['began', 'begun', 'beginning', 'begins'],
|
|
405
|
+
keep: ['kept', 'keeping', 'keeps'],
|
|
406
|
+
hold: ['held', 'holding', 'holds'],
|
|
407
|
+
write: ['wrote', 'written', 'writing', 'writes'],
|
|
408
|
+
stand: ['stood', 'standing', 'stands'],
|
|
409
|
+
hear: ['heard', 'hearing', 'hears'],
|
|
410
|
+
let: ['let', 'letting', 'lets'],
|
|
411
|
+
mean: ['meant', 'meaning', 'means'],
|
|
412
|
+
set: ['set', 'setting', 'sets'],
|
|
413
|
+
meet: ['met', 'meeting', 'meets'],
|
|
414
|
+
run: ['ran', 'running', 'runs'],
|
|
415
|
+
pay: ['paid', 'paying', 'pays'],
|
|
416
|
+
sit: ['sat', 'sitting', 'sits'],
|
|
417
|
+
speak: ['spoke', 'spoken', 'speaking', 'speaks'],
|
|
418
|
+
lie: ['lay', 'lain', 'lying', 'lies'],
|
|
419
|
+
lead: ['led', 'leading', 'leads'],
|
|
420
|
+
read: ['read', 'reading', 'reads'],
|
|
421
|
+
grow: ['grew', 'grown', 'growing', 'grows'],
|
|
422
|
+
fall: ['fell', 'fallen', 'falling', 'falls'],
|
|
423
|
+
send: ['sent', 'sending', 'sends'],
|
|
424
|
+
build: ['built', 'building', 'builds'],
|
|
425
|
+
understand: ['understood', 'understanding', 'understands'],
|
|
426
|
+
draw: ['drew', 'drawn', 'drawing', 'draws'],
|
|
427
|
+
break: ['broke', 'broken', 'breaking', 'breaks'],
|
|
428
|
+
spend: ['spent', 'spending', 'spends'],
|
|
429
|
+
cut: ['cut', 'cutting', 'cuts'],
|
|
430
|
+
rise: ['rose', 'risen', 'rising', 'rises'],
|
|
431
|
+
drive: ['drove', 'driven', 'driving', 'drives'],
|
|
432
|
+
buy: ['bought', 'buying', 'buys'],
|
|
433
|
+
wear: ['wore', 'worn', 'wearing', 'wears'],
|
|
434
|
+
swear: ['swore', 'sworn', 'swearing', 'swears'],
|
|
435
|
+
drink: ['drank', 'drunk', 'drinking', 'drinks'],
|
|
436
|
+
eat: ['ate', 'eaten', 'eating', 'eats'],
|
|
437
|
+
choose: ['chose', 'chosen', 'choosing', 'chooses'],
|
|
438
|
+
};
|
|
439
|
+
// Reverse lookup over irregularVerbMap: lower-cased base verb or inflected form -> base key.
// Pairs are inserted in irregularVerbMap order (base first, then its forms), so if two
// bases share a spelling the later entry overwrites the earlier one.
const irregularReverse = new Map(
  Object.entries(irregularVerbMap).flatMap(([baseVerb, inflections]) => [
    [baseVerb.toLowerCase(), baseVerb],
    ...Array.from(inflections, (form) => [String(form).toLowerCase(), baseVerb]),
  ])
);
|
|
447
|
+
/**
 * Expand a term through the irregular-verb table, touching only its last word.
 *
 * The term is split with splitHeadLast; the last word is looked up
 * (case-insensitively) in irregularReverse. When it is a known base verb or
 * inflection, the result holds the head followed by the base verb and by each
 * form listed for that base in irregularVerbMap, without duplicates.
 *
 * @param {string} term - Single- or multi-word term.
 * @returns {string[]} Full-term variants; empty when the last word is not in the table.
 */
const irregularFormsForTerm = (term) => {
  const parts = splitHeadLast(term);
  const baseVerb = irregularReverse.get(String(parts.last).toLowerCase());
  if (!baseVerb) return [];

  const lead = parts.head ? parts.head + ' ' : '';
  const inflections = irregularVerbMap[baseVerb] || [];
  // Base verb first, then its forms; the Set drops repeats such as put -> 'put'.
  const variants = [baseVerb, ...inflections].map((word) => lead + word);
  return [...new Set(variants)];
};
|
|
459
|
+
|
|
460
|
+
/**
 * Ask the optional `compromise` NLP instance for conjugations of a term's last word.
 *
 * Only the last word (per splitHeadLast) is conjugated; every returned string is
 * the unchanged head followed by one conjugated form. Nothing is produced when
 * conjugation is disabled (`useCompromise` falsy), no `nlp` instance is available,
 * or the last word is not recognised as a verb.
 *
 * @param {string} term - Single- or multi-word term.
 * @returns {string[]} De-duplicated full-term variants (possibly empty).
 */
const conjugationsForTerm = (term) => {
  const parts = splitHeadLast(term);
  if (!useCompromise || !nlp) return [];

  const verbMatches = nlp(parts.last).verbs();
  if (!verbMatches.found) return [];

  const tenseKeys = ['PastTense', 'PresentTense', 'Infinitive', 'Gerund', 'Participle'];
  const conjugated = verbMatches.conjugate() || [];
  const lead = parts.head ? parts.head + ' ' : '';
  const variants = new Set();

  for (const entry of conjugated) {
    for (const key of tenseKeys) {
      const raw = entry[key];
      // A tense slot may hold one value or a list; treat both uniformly and skip empties.
      const values = Array.isArray(raw) ? raw : [raw];
      for (const value of values) {
        if (value) variants.add(lead + String(value));
      }
    }
  }
  return [...variants];
};
|
|
480
|
+
|
|
481
|
+
// Compute earliest stage match per article, then choose best stage overall with priority tie-breaker
|
|
482
|
+
const perArticleMatches = [];
|
|
483
|
+
|
|
484
|
+
for (const art of prioritized) {
|
|
485
|
+
const terms = termMap.get(art) || [];
|
|
486
|
+
let stage = 0;
|
|
487
|
+
let termHit = '';
|
|
488
|
+
let truncated = false;
|
|
489
|
+
|
|
490
|
+
// Stage 1: case-sensitive, word-boundary
|
|
491
|
+
if (stage === 0) {
|
|
492
|
+
for (const tobj of terms) {
|
|
493
|
+
const termOrig = tobj.orig;
|
|
494
|
+
const alts = new Set([termOrig]);
|
|
495
|
+
for (const a of pluralizeTerm(termOrig)) alts.add(a);
|
|
496
|
+
// add irregular forms for last word; and conjugations when enabled
|
|
497
|
+
for (const a of irregularFormsForTerm(termOrig)) alts.add(a);
|
|
498
|
+
for (const a of conjugationsForTerm(termOrig)) alts.add(a);
|
|
499
|
+
for (const alt of alts) {
|
|
500
|
+
const re1 = new RegExp(`\\b${escapeRegExp(alt)}\\b`);
|
|
501
|
+
if (re1.test(textOrig)) { stage = 1; termHit = termOrig; break; }
|
|
502
|
+
}
|
|
503
|
+
if (stage === 1) break;
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
// Stage 2: case-insensitive, word-boundary
|
|
507
|
+
if (stage === 0) {
|
|
508
|
+
for (const tobj of terms) {
|
|
509
|
+
const termOrig = tobj.orig;
|
|
510
|
+
const alts = new Set([termOrig]);
|
|
511
|
+
for (const a of pluralizeTerm(termOrig)) alts.add(a);
|
|
512
|
+
for (const a of irregularFormsForTerm(termOrig)) alts.add(a);
|
|
513
|
+
for (const a of conjugationsForTerm(termOrig)) alts.add(a);
|
|
514
|
+
for (const alt of alts) {
|
|
515
|
+
const re2 = new RegExp(`\\b${escapeRegExp(alt)}\\b`, 'i');
|
|
516
|
+
if (re2.test(textOrig)) { stage = 2; termHit = termOrig; break; }
|
|
517
|
+
}
|
|
518
|
+
if (stage === 2) break;
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
// Stage 3: case-sensitive, substring (no word-boundary)
|
|
522
|
+
if (stage === 0) {
|
|
523
|
+
for (const tobj of terms) {
|
|
524
|
+
const termOrig = tobj.orig;
|
|
525
|
+
if (termOrig && textOrig.includes(termOrig)) { stage = 3; termHit = termOrig; break; }
|
|
526
|
+
}
|
|
527
|
+
}
|
|
528
|
+
// Stage 4: case-insensitive, substring on derived stripped forms (no iterative truncation),
|
|
529
|
+
// mutating only the last word for multi-word terms
|
|
530
|
+
if (stage === 0) {
|
|
531
|
+
/**
 * Build the lower-cased substring candidates tried in Stage 4 for one term.
 *
 * Only the last word of the term is altered; the head is kept as-is (and
 * lower-cased together with the rest when a candidate is stored). Candidates
 * shorter than 3 characters after trimming are discarded.
 *
 *  - The term's own last word contributes itself plus versions with a trailing
 *    y, e, ing, ed, es or (non-"ss") s removed.
 *  - Conjugations and irregular forms of the term contribute themselves plus
 *    versions with only a trailing y or e removed, and only when their head
 *    matches the head of the original term.
 *
 * @param {string} base - Term to derive candidates from.
 * @returns {string[]} Candidates in insertion order, without duplicates.
 */
const strippedForms = (base) => {
  const { head: baseHead, last: baseLast } = splitHeadLast(base);
  const lead = baseHead ? baseHead + ' ' : '';
  const collected = new Set();

  // Store a candidate when it is at least 3 characters after trim + lower-casing.
  const keep = (candidate) => {
    const cleaned = String(candidate || '').trim().toLowerCase();
    if (cleaned.length >= 3) collected.add(cleaned);
  };

  // Adds the word itself and its y/e-stripped versions.
  // Returns the lower-cased word, or '' when there was nothing to add.
  const addWithYEStripped = (word) => {
    const lower = String(word || '').toLowerCase();
    if (!lower) return '';
    keep(lead + lower);
    if (/y$/i.test(lower)) keep(lead + lower.slice(0, -1));
    if (/e$/i.test(lower)) keep(lead + lower.slice(0, -1));
    return lower;
  };

  // Same as above, then also strips the common inflection suffixes.
  const addAllStripped = (word) => {
    const lower = addWithYEStripped(word);
    if (!lower) return;
    const suffixRules = [
      [/ing$/i, 3],
      [/ed$/i, 2],
      [/es$/i, 2],
    ];
    for (const [pattern, cut] of suffixRules) {
      if (pattern.test(lower)) keep(lead + lower.slice(0, -cut));
    }
    if (/s$/i.test(lower) && !/ss$/i.test(lower)) keep(lead + lower.slice(0, -1));
  };

  // Inflected full-term variants only count when they kept the same head.
  const addInflected = (variants) => {
    for (const variant of variants) {
      const pieces = splitHeadLast(variant);
      if ((pieces.head || '') === (baseHead || '')) addWithYEStripped(pieces.last);
    }
  };

  addAllStripped(baseLast);
  addInflected(conjugationsForTerm(base));
  addInflected(irregularFormsForTerm(base));
  return [...collected];
};
|
|
575
|
+
outerStrip:
|
|
576
|
+
for (const tobj of terms) {
|
|
577
|
+
const termOrig = tobj.orig;
|
|
578
|
+
const forms = strippedForms(termOrig);
|
|
579
|
+
for (const f of forms) {
|
|
580
|
+
if (!f) continue;
|
|
581
|
+
if (textLower.includes(f)) { stage = 4; termHit = termOrig; truncated = false; break outerStrip; }
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
if (stage > 0) {
|
|
587
|
+
perArticleMatches.push({ art, stage, termHit, truncated });
|
|
588
|
+
}
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
if (!perArticleMatches.length) return null;
|
|
592
|
+
|
|
593
|
+
// Determine best stage among all matches
|
|
594
|
+
const bestStage = Math.min(...perArticleMatches.map(m => m.stage));
|
|
595
|
+
const bestMatches = perArticleMatches.filter(m => m.stage === bestStage);
|
|
596
|
+
// Among best matches, pick the one that appears earliest in prioritized list
|
|
597
|
+
const artIndex = new Map(prioritized.map((a, i) => [a, i]));
|
|
598
|
+
bestMatches.sort((a, b) => artIndex.get(a.art) - artIndex.get(b.art));
|
|
599
|
+
const chosenMatch = bestMatches[0];
|
|
600
|
+
|
|
601
|
+
// Disambiguation: list all matched articles
|
|
602
|
+
const matchesList = perArticleMatches.map(m => m.art);
|
|
603
|
+
const disamb = matchesList.length > 1 ? `(${matchesList.join(', ')})` : '';
|
|
604
|
+
|
|
605
|
+
const isVariant = (chosenMatch.stage >= 3) || chosenMatch.truncated;
|
|
606
|
+
let variantTerm = isVariant ? chosenMatch.termHit : '';
|
|
607
|
+
// If marked variant due to non-word-boundary/truncation, but ANY term from the chosen
|
|
608
|
+
// article matches on word-boundaries case-insensitively, then do NOT mark as variant.
|
|
609
|
+
if (variantTerm) {
|
|
610
|
+
const termObjs = termMap.get(chosenMatch.art) || [];
|
|
611
|
+
const hasWordBoundMatch = termObjs.some(tobj => {
|
|
612
|
+
const termOrig = tobj.orig;
|
|
613
|
+
if (!termOrig) return false;
|
|
614
|
+
const re = new RegExp(`\\b${escapeRegExp(termOrig)}\\b`, 'i');
|
|
615
|
+
return re.test(textOrig);
|
|
616
|
+
});
|
|
617
|
+
if (hasWordBoundMatch) {
|
|
618
|
+
variantTerm = '';
|
|
619
|
+
} else {
|
|
620
|
+
// Also suppress if a proper plural of any term matches with word boundaries
|
|
621
|
+
const hasPluralBoundMatch = termObjs.some(tobj => {
|
|
622
|
+
const termOrig = tobj.orig;
|
|
623
|
+
if (!termOrig) return false;
|
|
624
|
+
const plurals = pluralizeTerm(termOrig);
|
|
625
|
+
return plurals.some(p => new RegExp(`\\b${escapeRegExp(p)}\\b`, 'i').test(textOrig));
|
|
626
|
+
});
|
|
627
|
+
if (hasPluralBoundMatch) {
|
|
628
|
+
variantTerm = '';
|
|
629
|
+
} else {
|
|
630
|
+
// Finally, if the matched term inflects (-ing, -ed) OR has irregular forms that match, suppress variant
|
|
631
|
+
const base = chosenMatch.termHit || '';
|
|
632
|
+
const infl = new Set(ingEdFormsForTerm(base));
|
|
633
|
+
for (const f of irregularFormsForTerm(base)) infl.add(f);
|
|
634
|
+
const hasInflBoundMatch = Array.from(infl).some(p => new RegExp(`\\b${escapeRegExp(p)}\\b`, 'i').test(textOrig));
|
|
635
|
+
if (hasInflBoundMatch) variantTerm = '';
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
|
|
640
|
+
return { article: chosenMatch.art, disamb, variantTerm };
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
/**
 * Generate the Translation Word Links TSV for a single book.
 *
 * Pipeline:
 *  1. Build the initial TSV from the book's USFM and the Strong's pivot.
 *  2. Add GLQuote / GLOccurrence columns (tsv-quote-converters, ULT master).
 *  3. Copy GLQuote/GLOccurrence into OrigWords/Occurrence and convert them to
 *     original-language quotes.
 *  4. Reorder columns, move the old ID value into a Strongs column and assign a
 *     fresh random 4-character ID to each row.
 *  5. Pick a TW article per row with chooseArticleByGlQuote; rows without a
 *     match go to a separate "no match" TSV.
 *
 * A summary line (and up to 8 unmatched samples) is written with console.log.
 *
 * @param {string} bookCode - Book identifier resolved through findBookMeta.
 * @param {object} [options]
 * @param {boolean} [options.useCompromise] - When truthy, the `compromise`
 *   package is loaded and handed to the matcher for verb conjugation.
 * @returns {Promise<{matchedTsv: string, noMatchTsv: string}>} Both TSVs include a header row.
 * @throws {Error} When findBookMeta cannot resolve `bookCode`.
 */
export async function generateTwlByBook(bookCode, options = {}) {
  // Import Node-specific modules conditionally
  const { addGLQuoteCols, convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');

  const useCompromise = Boolean(options.useCompromise);
  let nlp = null;
  if (useCompromise) {
    const compromiseModule = await import('compromise');
    nlp = compromiseModule.default || compromiseModule;
  }

  const meta = findBookMeta(await readBooks(), bookCode);
  if (!meta) throw new Error(`Unknown book code: ${bookCode}`);
  const usfm = await fetchUsfm(meta.usfm, meta.testament);
  const twJson = await loadTwJsonLocal();
  const strongPivot = pivotByStrong(twJson);

  // Maps each known column name to its position in a header (-1 when absent).
  const locateColumns = (headerCells) =>
    Object.fromEntries(
      ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence']
        .map((name) => [name, headerCells.indexOf(name)])
    );

  // Fresh request payload per call so the converters never share an array.
  const ultRequest = (tsvContent) => ({
    bibleLinks: ["unfoldingWord/en_ult/master"],
    bookCode: meta.key,
    tsvContent,
    trySeparatorsAndOccurrences: true,
  });

  // 1) initial TSV
  const initialTsv = buildInitialTsv(usfm, strongPivot, meta.key);

  // 2) add GLQuote and GLOccurrence
  const { output: glTsv } = await addGLQuoteCols(ultRequest(initialTsv));

  // 3) Move GLQuote/GLOccurrence into OrigWords/Occurrence, then convert to OL quotes BEFORE matching
  const [glHeader, ...glLines] = glTsv.split(/\r?\n/);
  const glCols = locateColumns(glHeader.split('\t'));
  const swappedRows = glLines
    .filter((line) => line.length > 0)
    .map((line) => {
      const cells = line.split('\t');
      const quote = cells[glCols.GLQuote];
      const occurrence = cells[glCols.GLOccurrence];
      if (glCols.GLQuote >= 0) cells[glCols.OrigWords] = quote;
      if (glCols.GLOccurrence >= 0) cells[glCols.Occurrence] = occurrence;
      return cells.join('\t');
    });
  const { output: olTsv } = await convertGLQuotes2OLQuotes(
    ultRequest([glHeader, ...swappedRows].join('\n'))
  );

  // 4) Reorder columns and add Strongs + randomized 4-char IDs before matching
  const [olHeader, ...olLines] = olTsv.split(/\r?\n/);
  const olCols = locateColumns(olHeader.split('\t'));

  // New header order: Reference, ID, Tags, OrigWords, Occurrence, TWLink, Strongs, GLQuote, GLOccurrence
  const finalHeaderBase = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'Strongs', 'GLQuote', 'GLOccurrence'];

  const usedIds = new Set();
  const randomFrom = (pool) => pool[Math.floor(Math.random() * pool.length)];
  // IDs are one letter followed by three letters/digits, unique within this run.
  const nextId = () => {
    const letters = 'abcdefghijklmnopqrstuvwxyz';
    const alnum = 'abcdefghijklmnopqrstuvwxyz0123456789';
    let candidate;
    do {
      candidate = randomFrom(letters);
      for (let n = 0; n < 3; n += 1) candidate += randomFrom(alnum);
    } while (usedIds.has(candidate));
    usedIds.add(candidate);
    return candidate;
  };

  const preparedRows = [];
  olLines.forEach((line) => {
    if (line.trim() === '') return;
    const cells = line.split('\t');
    if (cells.length < 7) return;
    const pick = (name) => cells[olCols[name]];
    preparedRows.push([
      pick('Reference'),
      nextId(),
      pick('Tags'),
      pick('OrigWords'),
      pick('Occurrence'),
      pick('TWLink'),
      pick('ID'), // the incoming ID column carries the Strong's value
      pick('GLQuote'),
      pick('GLOccurrence'),
    ]);
  });

  // Indexes for prepared rows (same order as finalHeaderBase)
  const H = Object.fromEntries(finalHeaderBase.map((name, position) => [name, position]));

  const tagForArticle = (article) => {
    if (article.startsWith('kt/')) return 'keyterm';
    if (article.startsWith('names/')) return 'name';
    return '';
  };

  // 5) pick best TWLink based on GLQuote terms using Strongs column; include Variant of column
  const termMap = buildArticleTermMap(twJson);
  const outRows = [[...finalHeaderBase, 'Variant of', 'Disambiguation'].join('\t')];
  const noMatchRows = [[...finalHeaderBase, 'Disambiguation'].join('\t')];
  const noMatchSamples = [];
  let multiDisambRows = 0;

  for (const cols of preparedRows) {
    const strongId = cols[H.Strongs];
    const glq = cols[H.GLQuote] || '';
    const result = chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, { useCompromise, nlp });

    if (!result) {
      if (noMatchSamples.length < 8) {
        noMatchSamples.push(`${cols[H.Reference] || ''}\t${strongId}\t${glq}`);
      }
      // Record which articles were candidates so the row can be reviewed by hand.
      const tried = prioritizeArticles(glq, strongId, strongPivot) || [];
      const triedLabel = tried.length ? `(${tried.join(', ')})` : '';
      noMatchRows.push(`${cols.join('\t')}\t${triedLabel}`);
      continue;
    }

    const { article } = result;
    cols[H.TWLink] = `rc://*/tw/dict/bible/${article}`;
    // Update Tags based on selected article prefix
    cols[H.Tags] = tagForArticle(article);
    if (result.disamb) multiDisambRows += 1;
    outRows.push(`${cols.join('\t')}\t${result.variantTerm || ''}\t${result.disamb || ''}`);
  }

  const totalRows = preparedRows.length;
  const droppedRows = noMatchRows.length - 1; // minus the header row
  const keptRows = totalRows - droppedRows;
  const pct = totalRows ? ((keptRows / totalRows) * 100).toFixed(1) : '0.0';
  const label = bookCode.toUpperCase();
  console.log(`[TWL] ${label}: kept ${keptRows}/${totalRows} (${pct}%), dropped ${droppedRows}, disambiguated ${multiDisambRows}`);
  if (noMatchSamples.length) {
    console.log(`[TWL] ${label}: no-match samples (up to 8):`);
    noMatchSamples.forEach((sample) => console.log(` ${sample}`));
  }

  return {
    matchedTsv: outRows.join('\n'),
    noMatchTsv: noMatchRows.join('\n'),
  };
}
|