twl-generator 1.4.3 → 1.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +3 -3
- package/src/index.js +83 -9
- package/src/utils/twl-matcher.js +2 -30
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.4.3",
|
|
3
|
+
"version": "1.4.5",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -50,7 +50,7 @@
|
|
|
50
50
|
"csv-parse": "^5.5.6",
|
|
51
51
|
"csv-stringify": "^6.5.0",
|
|
52
52
|
"jszip": "^3.10.1",
|
|
53
|
-
"tsv-quote-converters": "^1.1.
|
|
53
|
+
"tsv-quote-converters": "^1.1.14"
|
|
54
54
|
},
|
|
55
55
|
"peerDependencies": {
|
|
56
56
|
"react": ">=16.8.0"
|
|
@@ -60,4 +60,4 @@
|
|
|
60
60
|
"optional": true
|
|
61
61
|
}
|
|
62
62
|
}
|
|
63
|
-
}
|
|
63
|
+
}
|
package/src/index.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { BibleBookData } from './common/books.js';
|
|
2
|
+
import { addGLQuoteCols, convertGLQuotes2OLQuotes } from 'tsv-quote-converters';
|
|
2
3
|
|
|
3
4
|
const isBrowser = typeof window !== 'undefined';
|
|
4
5
|
|
|
@@ -829,6 +830,7 @@ function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twMap, opts
|
|
|
829
830
|
export async function generateTwlByBook(bookCode, options = {}) {
|
|
830
831
|
// Extract dcsHost option with default
|
|
831
832
|
const dcsHost = options.dcsHost || 'https://git.door43.org';
|
|
833
|
+
const quiet = !!options.quiet;
|
|
832
834
|
|
|
833
835
|
// Load terms from en_tw zip file instead of local tw_strongs_list.json
|
|
834
836
|
const termToArticles = await loadTermsFromEnTw(dcsHost);
|
|
@@ -844,8 +846,7 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
844
846
|
if (!meta) throw new Error(`Unknown book code: ${bookCode}`);
|
|
845
847
|
const versesByChapter = await processUsfmForBook(meta.key, dcsHost);
|
|
846
848
|
|
|
847
|
-
|
|
848
|
-
const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Variant of', 'Disambiguation'];
|
|
849
|
+
const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'Variant of', 'Disambiguation'];
|
|
849
850
|
const outRows = [header.join('\t')];
|
|
850
851
|
|
|
851
852
|
// ID generator
|
|
@@ -874,8 +875,22 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
874
875
|
const isVowel = (ch) => /[aeiou]/i.test(ch);
|
|
875
876
|
const isConsonant = (ch) => /[a-z]/i.test(ch) && !isVowel(ch);
|
|
876
877
|
const endsWithCVC = (w) => w.length >= 3 && isConsonant(w[w.length - 3]) && isVowel(w[w.length - 2]) && isConsonant(w[w.length - 1]) && !/[wxy]/i.test(w[w.length - 1]);
|
|
877
|
-
const edForm = (w) =>
|
|
878
|
-
|
|
878
|
+
const edForm = (w) => {
|
|
879
|
+
if (/e$/i.test(w)) return w + 'd';
|
|
880
|
+
if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ied');
|
|
881
|
+
// Do not double the final consonant for words ending in "er" (e.g., gather -> gathered)
|
|
882
|
+
const lastCh = w[w.length - 1];
|
|
883
|
+
if (endsWithCVC(w) && !/(?:er|en)$/i.test(w)) return w + lastCh + 'ed';
|
|
884
|
+
return w + 'ed';
|
|
885
|
+
};
|
|
886
|
+
const ingForm = (w) => {
|
|
887
|
+
if (/ie$/i.test(w)) return w.replace(/ie$/i, 'ying');
|
|
888
|
+
if (/ee$/i.test(w)) return w + 'ing';
|
|
889
|
+
if (/e$/i.test(w)) return w.replace(/e$/i, 'ing');
|
|
890
|
+
const lastCh = w[w.length - 1];
|
|
891
|
+
if (endsWithCVC(w) && !/(?:er|en)$/i.test(w)) return w + lastCh + 'ing';
|
|
892
|
+
return w + 'ing';
|
|
893
|
+
};
|
|
879
894
|
|
|
880
895
|
const allowNoVariant = (base, match) => {
|
|
881
896
|
const b = String(base || '');
|
|
@@ -929,8 +944,6 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
929
944
|
glq,
|
|
930
945
|
String(occ),
|
|
931
946
|
twLink,
|
|
932
|
-
glq,
|
|
933
|
-
String(occ),
|
|
934
947
|
variantOf,
|
|
935
948
|
disamb,
|
|
936
949
|
].join('\t'));
|
|
@@ -941,13 +954,12 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
941
954
|
// Build TSV and convert GL OrigWords back to OL using tsv-quote-converters
|
|
942
955
|
let matchedTsv = outRows.join('\n');
|
|
943
956
|
try {
|
|
944
|
-
const { convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');
|
|
945
957
|
const conv = await convertGLQuotes2OLQuotes({
|
|
946
|
-
|
|
958
|
+
bibleLink: 'unfoldingWord/en_ult/master',
|
|
947
959
|
bookCode: String(meta.key || bookCode).toLowerCase(),
|
|
948
960
|
tsvContent: matchedTsv,
|
|
949
961
|
trySeparatorsAndOccurrences: true,
|
|
950
|
-
quiet
|
|
962
|
+
quiet,
|
|
951
963
|
});
|
|
952
964
|
if (conv && typeof conv.output === 'string' && conv.output.length) {
|
|
953
965
|
matchedTsv = conv.output;
|
|
@@ -955,6 +967,68 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
955
967
|
} catch (e) {
|
|
956
968
|
// If conversion fails (e.g., no network), fall back to unconverted TSV
|
|
957
969
|
}
|
|
970
|
+
|
|
971
|
+
// Now add the actual GLQuote/GLOccurrence by calling addGLQuoteCols
|
|
972
|
+
try {
|
|
973
|
+
const result = await addGLQuoteCols({
|
|
974
|
+
bibleLinks: ['unfoldingWord/en_ult/master'],
|
|
975
|
+
bookCode: String(meta.key || bookCode).toLowerCase(),
|
|
976
|
+
tsvContent: matchedTsv,
|
|
977
|
+
trySeparatorsAndOccurrences: true,
|
|
978
|
+
usePreviousGLQuotes: true,
|
|
979
|
+
quiet,
|
|
980
|
+
});
|
|
981
|
+
if (result && typeof result.output === 'string' && result.output.length) {
|
|
982
|
+
matchedTsv = result.output;
|
|
983
|
+
// Reorder columns: move cols[5] and cols[6] to after cols[7] for every line
|
|
984
|
+
try {
|
|
985
|
+
const lines = String(matchedTsv || '').split('\n');
|
|
986
|
+
for (let i = 0; i < lines.length; i++) {
|
|
987
|
+
const cols = lines[i].split('\t');
|
|
988
|
+
// require at least 8 columns so cols[7] exists
|
|
989
|
+
if (cols.length >= 8) {
|
|
990
|
+
const removed = cols.splice(5, 2); // remove cols[5] and cols[6]
|
|
991
|
+
// after removal, original cols[7] is at index 5, so insert after it at index 6
|
|
992
|
+
const insertIndex = Math.min(6, cols.length);
|
|
993
|
+
cols.splice(insertIndex, 0, ...removed);
|
|
994
|
+
lines[i] = cols.join('\t');
|
|
995
|
+
}
|
|
996
|
+
}
|
|
997
|
+
matchedTsv = lines.join('\n');
|
|
998
|
+
} catch (err) {
|
|
999
|
+
// leave matchedTsv unchanged on error
|
|
1000
|
+
}
|
|
1001
|
+
}
|
|
1002
|
+
} catch (e) {
|
|
1003
|
+
try {
|
|
1004
|
+
const lines = String(matchedTsv || '').split('\n');
|
|
1005
|
+
if (lines.length > 0) {
|
|
1006
|
+
lines[0] = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Variant of', 'Disambiguation'].join('\t');
|
|
1007
|
+
const out = [lines[0]];
|
|
1008
|
+
for (let i = 1; i < lines.length; i++) {
|
|
1009
|
+
const cols = lines[i].split('\t');
|
|
1010
|
+
const g = (idx) => (cols[idx] !== undefined ? cols[idx] : '');
|
|
1011
|
+
const newRow = [
|
|
1012
|
+
g(0), // Reference
|
|
1013
|
+
g(1), // ID
|
|
1014
|
+
g(2), // Tags
|
|
1015
|
+
g(3), // OrigWords
|
|
1016
|
+
g(4), // Occurrence
|
|
1017
|
+
g(5), // TWLink
|
|
1018
|
+
g(3), // GLQuote (copy of OrigWords)
|
|
1019
|
+
g(4), // GLOccurrence (copy of Occurrence)
|
|
1020
|
+
g(6), // Variant of
|
|
1021
|
+
g(7), // Disambiguation
|
|
1022
|
+
].join('\t');
|
|
1023
|
+
out.push(newRow);
|
|
1024
|
+
}
|
|
1025
|
+
matchedTsv = out.join('\n');
|
|
1026
|
+
}
|
|
1027
|
+
} catch (err) {
|
|
1028
|
+
// leave matchedTsv unchanged on any transformation error
|
|
1029
|
+
}
|
|
1030
|
+
}
|
|
1031
|
+
|
|
958
1032
|
const noMatchHeader = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Disambiguation'];
|
|
959
1033
|
const noMatchTsv = [noMatchHeader.join('\t')].join('\n');
|
|
960
1034
|
return { matchedTsv, noMatchTsv };
|
package/src/utils/twl-matcher.js
CHANGED
|
@@ -162,23 +162,9 @@ class PrefixTrie {
|
|
|
162
162
|
// Always extract from the original text to preserve case
|
|
163
163
|
let originalMatchedText = originalText.substring(startPos, currentPos);
|
|
164
164
|
|
|
165
|
-
// Extend match backwards to include dash-connected words
|
|
165
|
+
// Extend match backwards to include possessive forms (but not dash-connected words)
|
|
166
166
|
let extendedStartPos = startPos;
|
|
167
167
|
|
|
168
|
-
// Check backwards for dash preceded by word characters (no space between)
|
|
169
|
-
if (extendedStartPos > 0 && originalText[extendedStartPos - 1] === '-') {
|
|
170
|
-
let dashPos = extendedStartPos - 1;
|
|
171
|
-
dashPos--; // Move before the dash
|
|
172
|
-
// Check if there are word characters immediately before the dash
|
|
173
|
-
if (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
|
|
174
|
-
// Find the start of the word before the dash
|
|
175
|
-
while (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
|
|
176
|
-
dashPos--;
|
|
177
|
-
}
|
|
178
|
-
extendedStartPos = dashPos + 1;
|
|
179
|
-
}
|
|
180
|
-
}
|
|
181
|
-
|
|
182
168
|
// Check backwards for apostrophe (straight or curly) preceded by text
|
|
183
169
|
if (extendedStartPos > 0 && /['']/.test(originalText[extendedStartPos - 1])) {
|
|
184
170
|
let apostrophePos = extendedStartPos - 1;
|
|
@@ -193,23 +179,9 @@ class PrefixTrie {
|
|
|
193
179
|
}
|
|
194
180
|
}
|
|
195
181
|
|
|
196
|
-
// Extend match forwards to include dash-connected words
|
|
182
|
+
// Extend match forwards to include possessive forms (but not dash-connected words)
|
|
197
183
|
let extendedEndPos = currentPos;
|
|
198
184
|
|
|
199
|
-
// Check for dash followed by word characters (no space between)
|
|
200
|
-
if (extendedEndPos < originalText.length && originalText[extendedEndPos] === '-') {
|
|
201
|
-
let dashPos = extendedEndPos;
|
|
202
|
-
dashPos++; // Move past the dash
|
|
203
|
-
// Check if there are word characters immediately after the dash
|
|
204
|
-
if (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
|
|
205
|
-
// Find the end of the word after the dash
|
|
206
|
-
while (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
|
|
207
|
-
dashPos++;
|
|
208
|
-
}
|
|
209
|
-
extendedEndPos = dashPos;
|
|
210
|
-
}
|
|
211
|
-
}
|
|
212
|
-
|
|
213
185
|
// Check for apostrophe (straight or curly) followed by text
|
|
214
186
|
if (extendedEndPos < originalText.length && /['']/.test(originalText[extendedEndPos])) {
|
|
215
187
|
let apostrophePos = extendedEndPos;
|