twl-generator 1.4.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/utils/twl-matcher.js +85 -12
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.4.0",
|
|
3
|
+
"version": "1.4.1",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
package/src/utils/twl-matcher.js
CHANGED
|
@@ -160,16 +160,86 @@ class PrefixTrie {
|
|
|
160
160
|
if (node._terms) {
|
|
161
161
|
const matchLength = currentPos - startPos;
|
|
162
162
|
// Always extract from the original text to preserve case
|
|
163
|
-
|
|
163
|
+
let originalMatchedText = originalText.substring(startPos, currentPos);
|
|
164
|
+
|
|
165
|
+
// Extend match backwards to include dash-connected words and possessive forms
|
|
166
|
+
let extendedStartPos = startPos;
|
|
167
|
+
|
|
168
|
+
// Check backwards for dash preceded by word characters (no space between)
|
|
169
|
+
if (extendedStartPos > 0 && originalText[extendedStartPos - 1] === '-') {
|
|
170
|
+
let dashPos = extendedStartPos - 1;
|
|
171
|
+
dashPos--; // Move before the dash
|
|
172
|
+
// Check if there are word characters immediately before the dash
|
|
173
|
+
if (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
|
|
174
|
+
// Find the start of the word before the dash
|
|
175
|
+
while (dashPos >= 0 && /[\w]/.test(originalText[dashPos])) {
|
|
176
|
+
dashPos--;
|
|
177
|
+
}
|
|
178
|
+
extendedStartPos = dashPos + 1;
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
// Check backwards for apostrophe (straight or curly) preceded by text
|
|
183
|
+
if (extendedStartPos > 0 && /['']/.test(originalText[extendedStartPos - 1])) {
|
|
184
|
+
let apostrophePos = extendedStartPos - 1;
|
|
185
|
+
apostrophePos--; // Move before the apostrophe
|
|
186
|
+
// Check if there are word characters immediately before the apostrophe
|
|
187
|
+
if (apostrophePos >= 0 && /[\w]/.test(originalText[apostrophePos])) {
|
|
188
|
+
// Find the start of the text before the apostrophe
|
|
189
|
+
while (apostrophePos >= 0 && /[\w]/.test(originalText[apostrophePos])) {
|
|
190
|
+
apostrophePos--;
|
|
191
|
+
}
|
|
192
|
+
extendedStartPos = apostrophePos + 1;
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
// Extend match forwards to include dash-connected words and possessive forms
|
|
197
|
+
let extendedEndPos = currentPos;
|
|
198
|
+
|
|
199
|
+
// Check for dash followed by word characters (no space between)
|
|
200
|
+
if (extendedEndPos < originalText.length && originalText[extendedEndPos] === '-') {
|
|
201
|
+
let dashPos = extendedEndPos;
|
|
202
|
+
dashPos++; // Move past the dash
|
|
203
|
+
// Check if there are word characters immediately after the dash
|
|
204
|
+
if (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
|
|
205
|
+
// Find the end of the word after the dash
|
|
206
|
+
while (dashPos < originalText.length && /[\w]/.test(originalText[dashPos])) {
|
|
207
|
+
dashPos++;
|
|
208
|
+
}
|
|
209
|
+
extendedEndPos = dashPos;
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
// Check for apostrophe (straight or curly) followed by text
|
|
214
|
+
if (extendedEndPos < originalText.length && /['']/.test(originalText[extendedEndPos])) {
|
|
215
|
+
let apostrophePos = extendedEndPos;
|
|
216
|
+
apostrophePos++; // Move past the apostrophe
|
|
217
|
+
// Check if there are word characters immediately after the apostrophe
|
|
218
|
+
if (apostrophePos < originalText.length && /[\w]/.test(originalText[apostrophePos])) {
|
|
219
|
+
// Find the end of the text after the apostrophe
|
|
220
|
+
while (apostrophePos < originalText.length && /[\w]/.test(originalText[apostrophePos])) {
|
|
221
|
+
apostrophePos++;
|
|
222
|
+
}
|
|
223
|
+
extendedEndPos = apostrophePos;
|
|
224
|
+
} else {
|
|
225
|
+
// Include the apostrophe even if no text follows (for possessives ending in s)
|
|
226
|
+
extendedEndPos = apostrophePos;
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
// Update the matched text if we extended it
|
|
231
|
+
if (extendedStartPos < startPos || extendedEndPos > currentPos) {
|
|
232
|
+
originalMatchedText = originalText.substring(extendedStartPos, extendedEndPos);
|
|
233
|
+
}
|
|
164
234
|
|
|
165
235
|
// Check if this is a valid word boundary match (both start and end)
|
|
166
|
-
const isStartBoundary =
|
|
167
|
-
/[\s\p{P}]/.test(originalText[
|
|
168
|
-
!/[\w]/.test(originalText[
|
|
236
|
+
const isStartBoundary = extendedStartPos === 0 ||
|
|
237
|
+
/[\s\p{P}]/.test(originalText[extendedStartPos - 1]) ||
|
|
238
|
+
!/[\w]/.test(originalText[extendedStartPos - 1]);
|
|
169
239
|
|
|
170
|
-
const isEndBoundary =
|
|
171
|
-
/[\s\p{P}]/.test(originalText[
|
|
172
|
-
!/[\w]/.test(originalText[
|
|
240
|
+
const isEndBoundary = extendedEndPos >= originalText.length ||
|
|
241
|
+
/[\s\p{P}]/.test(originalText[extendedEndPos]) ||
|
|
242
|
+
!/[\w]/.test(originalText[extendedEndPos]);
|
|
173
243
|
|
|
174
244
|
const isWordBoundary = isStartBoundary && isEndBoundary;
|
|
175
245
|
|
|
@@ -178,8 +248,9 @@ class PrefixTrie {
|
|
|
178
248
|
matches.push({
|
|
179
249
|
term: termData.term,
|
|
180
250
|
articles: termData.articles,
|
|
181
|
-
matchedText: originalMatchedText, // Use the
|
|
182
|
-
length:
|
|
251
|
+
matchedText: originalMatchedText, // Use the extended matched text
|
|
252
|
+
length: originalMatchedText.length, // Use extended length
|
|
253
|
+
originalLength: matchLength, // Keep track of original match length for advancement
|
|
183
254
|
priority: termData.priority,
|
|
184
255
|
isExactCase: isExactCase
|
|
185
256
|
});
|
|
@@ -283,9 +354,11 @@ function findMatches(verseText, termTrie) {
|
|
|
283
354
|
priority: bestMatch.priority
|
|
284
355
|
});
|
|
285
356
|
|
|
286
|
-
// Move past the matched text
|
|
287
|
-
|
|
288
|
-
|
|
357
|
+
// Move past only the original matched text (not the extended part)
|
|
358
|
+
// This allows finding additional matches within the extended portion
|
|
359
|
+
const advanceBy = bestMatch.originalLength || bestMatch.length;
|
|
360
|
+
processedText += normalizedText.substring(currentPos, currentPos + advanceBy);
|
|
361
|
+
currentPos += advanceBy;
|
|
289
362
|
} else {
|
|
290
363
|
// No match found, move to next character/word boundary
|
|
291
364
|
const nextWordBoundary = normalizedText.substring(currentPos).search(/[\s\p{P}]/u);
|