word-aligner 1.0.3 → 1.1.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/js/aligner.js CHANGED
@@ -54,6 +54,134 @@ var hasAlignments = exports.hasAlignments = function hasAlignments(alignments) {
54
54
  return indexFirstAlignment >= 0;
55
55
  };
56
56
 
/**
 * Combines consecutive text objects in an array of verse objects recursively.
 * When several objects with `type === 'text'` follow one another, their `text`
 * is concatenated onto the first of the run and the later ones are dropped.
 * Any object that is kept has its `children` array (if it is an array)
 * processed the same way.
 *
 * Notes:
 * - The kept objects are mutated in place (`text` and `children` are
 *   reassigned); only the returned array itself is new.
 * - null/undefined entries are skipped at every nesting level instead of
 *   throwing, so a hole inside a `children` array cannot abort the whole pass.
 * - A text object without a `text` property contributes an empty string
 *   rather than the literal "undefined".
 *
 * @param {Array} objects - Array of verse objects to process
 * @return {Array} - Array with consecutive text objects combined
 */
var combineConsecutiveText = function combineConsecutiveText(objects) {
  var result = [];
  for (var i = 0; i < objects.length; i++) {
    var current = objects[i];

    if (current === null || current === undefined) {
      // a hole carries no content; dropping it also lets the text on either side merge
      continue;
    }

    var previous = result.length > 0 ? result[result.length - 1] : null;
    if (current.type === 'text' && previous && previous.type === 'text') {
      // combine with previous text object
      previous.text = (previous.text || '') + (current.text || '');
    } else {
      // recursively process children if they exist
      if (Array.isArray(current.children)) {
        current.children = combineConsecutiveText(current.children);
      }
      result.push(current);
    }
  }
  return result;
};
82
+
/**
 * Restores verse objects from a flattened state: drops null/undefined entries,
 * rebuilds the hierarchy via restoreHierarchy, merges consecutive text objects
 * via combineConsecutiveText, and finally strips leftover `parentIndex`
 * bookkeeping from the result via cleanChildReferences.
 *
 * Holes are filtered out BEFORE the hierarchy pass, because that pass reads
 * properties of every entry and would otherwise throw on the very entries this
 * function is meant to remove. As a consequence the array passed in is not
 * spliced; the verse objects it contains are still mutated in place.
 *
 * @param {Array} verseObjects - Array of verse objects to restore
 * @return {Array} - Cleaned and restored array of verse objects
 */
function restoreVerseObjects(verseObjects) {
  // remove null objects first so later passes only ever see real verse objects
  var presentObjects = verseObjects.filter(function (item) {
    return item !== null && item !== undefined;
  });
  // move flattened entries back under their parents (mutates presentObjects)
  restoreHierarchy(presentObjects);
  // combine consecutive text objects in nested verseObjects
  var cleanedVerseObjects = combineConsecutiveText(presentObjects);
  // wrap in a pseudo-parent so the top-level entries themselves get cleaned
  cleanChildReferences({ children: cleanedVerseObjects }, 'parentIndex');
  return cleanedVerseObjects;
}
100
+
/**
 * Recursively removes a bookkeeping property (default 'parentIndex') from all
 * descendants of a verse object. The verse object itself is not touched, only
 * the entries reachable through `children`.
 *
 * The property is deleted only when its value compares `>= 0`, i.e. when it
 * holds a real index; other values (for example -1 or undefined) are left
 * alone.
 *
 * A null/undefined verse object, a verse object without children, and
 * null/undefined entries inside a children array are all tolerated and
 * skipped.
 *
 * @param {Object} verseObject - The verse object whose children should be cleaned
 * @param {string} [key='parentIndex'] - The property key to remove from children
 */
function cleanChildReferences(verseObject) {
  var key = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'parentIndex';

  var children = verseObject && verseObject.children || [];
  for (var j = 0, cLen = children.length; j < cLen; j++) {
    var child = children[j];
    if (!child) {
      // holes have nothing to clean and nothing to descend into
      continue;
    }
    if (child[key] >= 0) {
      delete child[key];
    }
    if (child.children) {
      cleanChildReferences(child, key);
    }
  }
}
122
+
/**
 * Restores the hierarchical structure of flattened verse objects, in place.
 *
 * Pass 1: every entry whose `parentIndex` is `>= 0` looks up the entry whose
 *         `originalIndex` equals that value. If such a parent exists and has a
 *         `children` property, the entry is appended to the parent's children
 *         and queued for removal from the top level. Whether or not a parent
 *         was found, the entry's own `parentIndex`/`originalIndex` are deleted
 *         and `parentIndex` is stripped from its descendants.
 * Pass 2: queued entries are spliced out of the array (highest index first so
 *         earlier positions stay valid). Entries are removed, not nulled.
 * Pass 3: `originalIndex` is stripped from the remaining entries and their
 *         descendants.
 *
 * null/undefined entries are left where they are and skipped by every pass.
 *
 * @param {Array} unalignedOrdered - Array of flattened verse objects that may contain parentIndex properties
 */
function restoreHierarchy(unalignedOrdered) {
  var toRemove = [];
  var i;
  var oLen;

  var findByOriginalIndex = function findByOriginalIndex(wantedIndex) {
    return unalignedOrdered.find(function (obj) {
      return obj && obj.originalIndex === wantedIndex;
    });
  };

  for (i = 0, oLen = unalignedOrdered.length; i < oLen; i++) {
    var verseObject = unalignedOrdered[i];
    if (!verseObject || !(verseObject.parentIndex >= 0)) {
      // holes and entries without a real parent index stay at the top level
      continue;
    }
    var parent = findByOriginalIndex(verseObject.parentIndex);
    if (parent && parent.children) {
      parent.children.push(verseObject);
      toRemove.push(i);
    }
    delete verseObject.parentIndex;
    delete verseObject.originalIndex;
    cleanChildReferences(verseObject, 'parentIndex');
  }

  // remove moved entries from their original location; walk backwards so the
  // indices recorded above are still correct when each splice happens
  for (i = toRemove.length - 1; i >= 0; i--) {
    unalignedOrdered.splice(toRemove[i], 1);
  }

  // clean up originalIndex property
  for (i = 0, oLen = unalignedOrdered.length; i < oLen; i++) {
    var remaining = unalignedOrdered[i];
    if (!remaining) {
      continue;
    }
    if (remaining.originalIndex >= 0) {
      delete remaining.originalIndex;
    }
    cleanChildReferences(remaining, 'originalIndex');
  }
}
169
+
/**
 * Stamps every verse object in the array with its current position by writing
 * an `originalIndex` property onto it, so the position an object had before
 * later modifications or deletions can still be recovered.
 * Falsy entries (such as null or undefined) are skipped.
 *
 * @param {Array} unalignedOrdered - Array of verse objects whose positions need to be saved
 */
function savePosition(unalignedOrdered) {
  var total = unalignedOrdered.length;
  var position = 0;
  while (position < total) {
    var entry = unalignedOrdered[position];
    if (entry) {
      // remember where the object sat before anything gets deleted
      entry.originalIndex = position;
    }
    position += 1;
  }
}
184
+
57
185
  /**
58
186
  * @description pivots alignments into bottomWords/targetLanguage verseObjectArray sorted by verseText
59
187
  * @param {Array} alignments - array of aligned word objects {bottomWords, topWords}
@@ -94,11 +222,11 @@ var merge = exports.merge = function merge(alignments, wordBank, verseString) {
94
222
  var wbLen = wordBank.length;
95
223
  for (var i = 0; i < wbLen; i++) {
96
224
  var bottomWord = wordBank[i];
97
- var verseObject = VerseObjectUtils.wordVerseObjectFromBottomWord(bottomWord);
98
- var index = VerseObjectUtils.indexOfVerseObject(wordMap, verseObject);
225
+ var _verseObject4 = VerseObjectUtils.wordVerseObjectFromBottomWord(bottomWord);
226
+ var index = VerseObjectUtils.indexOfVerseObject(wordMap, _verseObject4);
99
227
  if (index > -1) {
100
228
  var location = wordMap[index];
101
- location.array[location.pos] = verseObject;
229
+ location.array[location.pos] = _verseObject4;
102
230
  } else if (hasAlignments(alignments)) {
103
231
  // if verse has some alignments
104
232
  throw { message: 'Word "' + bottomWord.word + '" is in wordBank, but missing from target language verse.', type: 'InvalidatedAlignments' };
@@ -109,8 +237,8 @@ var merge = exports.merge = function merge(alignments, wordBank, verseString) {
109
237
  }
110
238
  var indicesToDelete = [];
111
239
  // each alignment should result in one verseObject
112
- for (var _i = 0, aLen = alignments.length; _i < aLen; _i++) {
113
- var alignment = alignments[_i];
240
+ for (var _i2 = 0, aLen = alignments.length; _i2 < aLen; _i2++) {
241
+ var alignment = alignments[_i2];
114
242
  var topWords = alignment.topWords,
115
243
  bottomWords = alignment.bottomWords;
116
244
  // each bottomWord results in a nested verseObject of tag: w, type: word
@@ -119,12 +247,12 @@ var merge = exports.merge = function merge(alignments, wordBank, verseString) {
119
247
  var replacements = {};
120
248
  for (var j = 0, bwLen = bottomWords.length; j < bwLen; j++) {
121
249
  var _bottomWord = bottomWords[j];
122
- var _verseObject2 = VerseObjectUtils.wordVerseObjectFromBottomWord(_bottomWord);
123
- var _index = VerseObjectUtils.indexOfVerseObject(wordMap, _verseObject2);
250
+ var _verseObject5 = VerseObjectUtils.wordVerseObjectFromBottomWord(_bottomWord);
251
+ var _index = VerseObjectUtils.indexOfVerseObject(wordMap, _verseObject5);
124
252
  if (_index === -1) {
125
- throw { message: 'VerseObject not found in verseText while merging:' + (0, _stringify2.default)(_verseObject2), type: 'InvalidatedAlignments' };
253
+ throw { message: 'VerseObject not found in verseText while merging:' + (0, _stringify2.default)(_verseObject5), type: 'InvalidatedAlignments' };
126
254
  }
127
- replacements[_index] = _verseObject2;
255
+ replacements[_index] = _verseObject5;
128
256
  }
129
257
  // each topWord results in a nested verseObject of tag: k, type: milestone
130
258
  var milestones = topWords.map(function (topWord) {
@@ -156,12 +284,19 @@ var merge = exports.merge = function merge(alignments, wordBank, verseString) {
156
284
  var milestone = VerseObjectUtils.nestMilestones(milestones);
157
285
  // replace the original verseObject from the verse text with the aligned milestone verseObject
158
286
  var _location = wordMap[indexToReplace];
287
+ if (_location.parentIndex >= 0) {
288
+ milestone.parentIndex = _location.parentIndex; // preserve the parent index
289
+ }
159
290
  _location.array[_location.pos] = milestone;
160
291
  }
161
292
  }
293
+
294
+ savePosition(unalignedOrdered); // save original position of each verseObject to keep track even after deletions
295
+
162
296
  // deleteIndices that were queued due to consecutive bottomWords in alignments
163
297
  var verseObjects = ArrayUtils.deleteIndices(unalignedOrdered, indicesToDelete, wordMap);
164
- return verseObjects;
298
+ var restoredObjects = restoreVerseObjects(verseObjects);
299
+ return restoredObjects;
165
300
  };
166
301
 
167
302
  /**
@@ -413,13 +548,13 @@ var unmerge = exports.unmerge = function unmerge(verseObjects, alignedVerse) {
413
548
  }
414
549
  var len = verseObjects.length;
415
550
  for (var i = 0; i < len; i++) {
416
- var verseObject = verseObjects[i];
417
- addAlignment(baseMilestones, verseObject, alignments);
551
+ var _verseObject6 = verseObjects[i];
552
+ addAlignment(baseMilestones, _verseObject6, alignments);
418
553
  }
419
554
  var alignmentUnOrdered = [];
420
555
  len = alignments.length;
421
- for (var _i2 = 0; _i2 < len; _i2++) {
422
- var _alignment = alignments[_i2];
556
+ for (var _i3 = 0; _i3 < len; _i3++) {
557
+ var _alignment = alignments[_i3];
423
558
  if (_alignment.topWords.length > 0) {
424
559
  alignmentUnOrdered.push(_alignment);
425
560
  } else {
@@ -185,21 +185,29 @@ var getVerseObjectsText = function getVerseObjectsText(verseObjects) {
185
185
  };
186
186
 
187
187
  /**
188
- * make sure we pick up white space between tokens
189
- * @param {string} text - string to tokenize
190
- * @param {Number} lastPos - position of end of last token
191
- * @param {Number} pos - position to grab up to
192
- * @param {Array} newVerseObjects - nested verse objects
193
- * @param {Boolean} end - if true, then at end of line
194
- * @return {{lastPos: *, verseObject: *}} - new verse object and updated position
188
+ * Fills gaps (whitespace and text) between tokens in the verse object array.
189
+ * Ensures whitespace between tokens is preserved by creating text verse objects.
190
+ * If possible, appends to the previous text object if it exists at the same nesting level;
191
+ * otherwise creates a new text verse object.
192
+ *
193
+ * @param {string} text - The complete string being tokenized
194
+ * @param {Number} lastPos - Position of the end of the last processed token
195
+ * @param {Number} pos - Position to process up to (start of next token or end of string)
196
+ * @param {Array} newVerseObjects - Array of verse objects being populated
197
+ * @param {Boolean} [end=false] - If true, forces creation of text object even if gap is empty (for end of line)
198
+ * @param {Number} [parentIndex=-1] - Index of parent verse object if nested, -1 if at root level
199
+ * @return {Number} Updated position after processing the gap (lastPos + gap.length)
195
200
  */
196
201
  var fillGap = function fillGap(text, lastPos, pos, newVerseObjects) {
197
202
  var end = arguments.length > 4 && arguments[4] !== undefined ? arguments[4] : false;
203
+ var parentIndex = arguments.length > 5 && arguments[5] !== undefined ? arguments[5] : -1;
198
204
 
199
205
  var verseObject = null;
200
206
  var gap = text.substring(lastPos, pos);
201
207
  var lastVerseObject = newVerseObjects.length && newVerseObjects[newVerseObjects.length - 1];
202
- if (lastVerseObject && lastVerseObject.type === 'text') {
208
+ var lastParentIndex = typeof lastVerseObject.parentIndex === 'number' ? lastVerseObject.parentIndex : -1;
209
+ var canAppendToPreviousText = lastVerseObject && lastVerseObject.type === 'text' && lastParentIndex === parentIndex;
210
+ if (canAppendToPreviousText) {
203
211
  // append to previous text
204
212
  lastVerseObject.text += gap;
205
213
  } else if (end || gap) {
@@ -208,6 +216,11 @@ var fillGap = function fillGap(text, lastPos, pos, newVerseObjects) {
208
216
  type: 'text',
209
217
  text: gap
210
218
  };
219
+
220
+ if (parentIndex >= 0) {
221
+ verseObject.parentIndex = parentIndex;
222
+ }
223
+
211
224
  newVerseObjects.push(verseObject);
212
225
  }
213
226
  lastPos += gap.length;
@@ -215,15 +228,22 @@ var fillGap = function fillGap(text, lastPos, pos, newVerseObjects) {
215
228
  };
216
229
 
217
230
  /**
218
- * parse text into tokens
219
- * @param {string} text - string to tokenize
220
- * @param {Array} newVerseObjects - nested verse objects
221
- * @param {Array} wordMap - ordered map of word locations in verseObjects
222
- * @param {Number} nonWordVerseObjectCount - keeps count of entries that are not actually words
223
- * @param {String} verseText - text of the entire verse
224
- * @return {Number} new nonWordVerseObjectCount
231
+ * Parses text into tokens and creates word or text verse objects.
232
+ * Tokenizes the input text and identifies words (containing word/number characters)
233
+ * versus punctuation/text. For words, creates word objects with occurrence tracking.
234
+ * For non-word tokens, creates text objects. Preserves whitespace between tokens.
235
+ *
236
+ * @param {string} text - The string to tokenize
237
+ * @param {Array} newVerseObjects - Array to populate with newly created verse objects
238
+ * @param {Array} wordMap - Ordered map tracking word locations in verseObjects for occurrence counting
239
+ * @param {Number} nonWordVerseObjectCount - Counter for entries that are not words (text/punctuation)
240
+ * @param {String} verseText - Complete text of the entire verse for occurrence calculation
241
+ * @param {Number} [parentIndex=-1] - Index of parent verse object if this text is nested, -1 if at root level
242
+ * @return {Number} Updated nonWordVerseObjectCount after processing
225
243
  */
226
244
  var tokenizeText = function tokenizeText(text, newVerseObjects, wordMap, nonWordVerseObjectCount, verseText) {
245
+ var parentIndex = arguments.length > 5 && arguments[5] !== undefined ? arguments[5] : -1;
246
+
227
247
  if (text) {
228
248
  var tokens = tokenizer.tokenize({ text: text, includePunctuation: true });
229
249
  var tokenLength = tokens.length;
@@ -234,7 +254,7 @@ var tokenizeText = function tokenizeText(text, newVerseObjects, wordMap, nonWord
234
254
  var pos = text.indexOf(word, lastPos);
235
255
  if (pos > lastPos) {
236
256
  // make sure we are not dropping white space
237
- lastPos = fillGap(text, lastPos, pos, newVerseObjects);
257
+ lastPos = fillGap(text, lastPos, pos, newVerseObjects, false, parentIndex);
238
258
  }
239
259
  if (tokenizer.word.test(word) || tokenizer.number.test(word)) {
240
260
  // if the text has word or number characters, its a word object
@@ -249,7 +269,8 @@ var tokenizeText = function tokenizeText(text, newVerseObjects, wordMap, nonWord
249
269
  occurrence: occurrence,
250
270
  occurrences: occurrences
251
271
  };
252
- wordMap.push({ array: newVerseObjects, pos: newVerseObjects.length });
272
+ var _pos = newVerseObjects.length;
273
+ wordMap.push({ array: newVerseObjects, pos: _pos, parentIndex: parentIndex });
253
274
  } else {
254
275
  // the text does not have word characters
255
276
  nonWordVerseObjectCount++;
@@ -259,28 +280,46 @@ var tokenizeText = function tokenizeText(text, newVerseObjects, wordMap, nonWord
259
280
  };
260
281
  }
261
282
  lastPos += word.length;
283
+
284
+ if (parentIndex >= 0) {
285
+ verseObject.parentIndex = parentIndex;
286
+ }
287
+
262
288
  newVerseObjects.push(verseObject);
263
289
  }
264
290
  if (lastPos < text.length) {
265
- lastPos = fillGap(text, lastPos, text.length, newVerseObjects, true);
291
+ lastPos = fillGap(text, lastPos, text.length, newVerseObjects, true, parentIndex);
266
292
  }
267
293
  }
268
294
  return nonWordVerseObjectCount;
269
295
  };
270
296
 
271
297
  /**
272
- * step through verse objects extracting words
273
- * @param {Array} verseObjects - original array of verse objects with words split
274
- * @param {Array} newVerseObjects - new array of verse objects with words split
275
- * @param {Array} wordMap - ordered map of word locations in verseObjects
276
- * @param {String} verseText - text of the entire verse
277
- * @param {Number} nonWordVerseObjectCount - keeps count of entries that are not actually words
278
- * @return {Number} updated nonWordVerseObjectCount
298
+ * Recursively processes nested verse objects to extract and tokenize words.
299
+ * Traverses through verse objects, preserving non-text objects (like milestones) while
300
+ * extracting and tokenizing any text content. Handles nested children recursively.
301
+ * Maintains parent-child relationships through parentIndex tracking.
302
+ *
303
+ * @param {Array} verseObjects - Original array of verse objects to process (may contain nested structures)
304
+ * @param {Array} newVerseObjects - Output array to populate with processed verse objects with words split
305
+ * @param {Array} wordMap - Ordered map tracking word locations in verseObjects for occurrence counting
306
+ * @param {String} verseText - Complete text of the entire verse for occurrence calculation
307
+ * @param {Number} nonWordVerseObjectCount - Counter for entries that are not words (text/punctuation)
308
+ * @param {Number} [parentIndex=-1] - Index of parent verse object for nested elements, -1 if at root level
309
+ * @return {Number} Updated nonWordVerseObjectCount after processing all verse objects
279
310
  */
280
311
  var getWordsFromNestedVerseObjects = function getWordsFromNestedVerseObjects(verseObjects, newVerseObjects, wordMap, verseText, nonWordVerseObjectCount) {
312
+ var parentIndex = arguments.length > 5 && arguments[5] !== undefined ? arguments[5] : -1;
313
+
281
314
  var voLength = verseObjects.length;
282
315
  for (var i = 0; i < voLength; i++) {
283
316
  var verseObject = verseObjects[i];
317
+
318
+ if (parentIndex >= 0) {
319
+ // keep track of where the parent is
320
+ verseObject.parentIndex = parentIndex;
321
+ }
322
+
284
323
  var vsObjText = verseObject.text;
285
324
  if (verseObject.type !== 'text') {
286
325
  // preseserve non-text verseObject except for text part which will be split into words
@@ -292,16 +331,17 @@ var getWordsFromNestedVerseObjects = function getWordsFromNestedVerseObjects(ver
292
331
  verseObject.nextChar = ' '; // preserve space before text
293
332
  }
294
333
  newVerseObjects.push(verseObject);
334
+ var indexOfThisObject = newVerseObjects.length - 1;
295
335
  if (verseObject.children) {
296
336
  var newChildVerseObjects = [];
297
- nonWordVerseObjectCount = tokenizeText(vsObjText, newChildVerseObjects, wordMap, nonWordVerseObjectCount, verseText);
298
- nonWordVerseObjectCount = getWordsFromNestedVerseObjects(verseObject.children, newChildVerseObjects, wordMap, verseText, nonWordVerseObjectCount);
337
+ nonWordVerseObjectCount = tokenizeText(vsObjText, newChildVerseObjects, wordMap, nonWordVerseObjectCount, verseText, indexOfThisObject);
338
+ nonWordVerseObjectCount = getWordsFromNestedVerseObjects(verseObject.children, newChildVerseObjects, wordMap, verseText, nonWordVerseObjectCount, indexOfThisObject);
299
339
  verseObject.children = newChildVerseObjects;
300
340
  } else {
301
- nonWordVerseObjectCount = tokenizeText(vsObjText, newVerseObjects, wordMap, nonWordVerseObjectCount, verseText);
341
+ nonWordVerseObjectCount = tokenizeText(vsObjText, newVerseObjects, wordMap, nonWordVerseObjectCount, verseText, indexOfThisObject);
302
342
  }
303
343
  } else {
304
- nonWordVerseObjectCount = tokenizeText(vsObjText, newVerseObjects, wordMap, nonWordVerseObjectCount, verseText);
344
+ nonWordVerseObjectCount = tokenizeText(vsObjText, newVerseObjects, wordMap, nonWordVerseObjectCount, verseText, parentIndex);
305
345
  }
306
346
  }
307
347
  return nonWordVerseObjectCount;