codexparser 0.1.82 → 0.1.83

This diff shows the changes between publicly available package versions that have been released to one of the supported registries, as those versions appear in their respective public registries. It is provided for informational purposes only.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/CodexParser.js +60 -121
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codexparser",
3
- "version": "0.1.82",
3
+ "version": "0.1.83",
4
4
  "description": "This is a Javascript Bible parser and text scanner. It will search through texts and collate all scripture references into an array and parse them into objects, and it will parse passages into objects by book, chapter, verse, and testament. ",
5
5
  "main": "index.js",
6
6
  "scripts": {
@@ -85,6 +85,11 @@ class CodexParser {
85
85
  return singleChapterBook ? singleChapterBook[book][chapter] || [] : this.chapterVerses[book]?.[chapter] || []
86
86
  }
87
87
 
88
+ /**
89
+ * Scans text for scripture references and stores them in `this.found`.
90
+ * @param {string} text - The text to scan.
91
+ * @returns {CodexParser} The parser instance for method chaining.
92
+ */
88
93
  /**
89
94
  * Scans text for scripture references and stores them in `this.found`.
90
95
  * @param {string} text - The text to scan.
@@ -94,74 +99,53 @@ class CodexParser {
94
99
  const fullNames = [...this.bible.old, ...this.bible.new]
95
100
  const abbreviations = Object.keys(this.abbreviations)
96
101
  this.found = []
97
- // Normalize text for parsing but keep original for originalText
98
- let normalizedText = text
99
- .replace(/[“”]/g, "") // Remove curly quotes
100
- .replace(/\.(?=\d)/g, ":") // Replace periods before digits with colons (e.g., "Re13.8" -> "Re13:8")
101
- .replace(/\s+/g, " ") // Normalize multiple spaces to single
102
+ // Minimal normalization: fix periods before numbers, remove trailing periods
103
+ let normalizedText = text.replace(/\.(?=\d)/g, ":").replace(/(\b[A-Za-z]+)\.(?=\s|$)/g, "$1")
102
104
  const lowercaseBibleFullNames = fullNames.map((book) => book.toLowerCase())
103
105
  const lowercaseBibleAbbreviations = abbreviations.map((abbr) => abbr.toLowerCase())
104
106
  const lowerCaseText = normalizedText.toLowerCase()
105
107
  let i = 0
106
108
 
107
- const isValidChapterVerseChar = (char) => /[^A-Za-z]/.test(char) // Non-letter characters
109
+ const isValidChapterVerseChar = (char) => /[\d:,\-;\s]/.test(char)
108
110
  const isNextBibleBook = (startIndex) => {
109
111
  const textAfterCurrentPosition = lowerCaseText.substring(startIndex).trim()
110
- // Check if the text starts with a book name or abbreviation followed by a digit
111
112
  return (
112
- lowercaseBibleFullNames.some((book) => {
113
- if (textAfterCurrentPosition.startsWith(book)) {
114
- const nextIndex = startIndex + book.length
115
- const nextChar = lowerCaseText[nextIndex]
116
- return nextChar && /\d/.test(nextChar)
117
- }
118
- return false
119
- }) ||
120
- lowercaseBibleAbbreviations.some((abbr) => {
121
- if (textAfterCurrentPosition.startsWith(abbr)) {
122
- const nextIndex = startIndex + abbr.length
123
- const nextChar = lowerCaseText[nextIndex]
124
- return nextChar && (/\d/.test(nextChar) || /\./.test(nextChar))
125
- }
126
- return false
127
- })
113
+ lowercaseBibleFullNames.some((book) => textAfterCurrentPosition.startsWith(book)) ||
114
+ lowercaseBibleAbbreviations.some((abbr) => textAfterCurrentPosition.startsWith(abbr))
128
115
  )
129
116
  }
130
- const detectSuffix = (startIndex, inputText) => {
131
- const suffixMatch = inputText.substring(startIndex).match(/\b(LXX|MT)\b/i)
132
- return suffixMatch ? { version: suffixMatch[0].toUpperCase(), length: suffixMatch[0].length } : null
117
+ const detectSuffix = (startIndex) => {
118
+ const suffixMatch = normalizedText.substring(startIndex).match(/\b(LXX|MT)\b/i)
119
+ return suffixMatch ? { suffix: suffixMatch[0].toUpperCase(), length: suffixMatch[0].length } : null
133
120
  }
134
121
 
135
122
  while (i < lowerCaseText.length) {
136
123
  let foundBook = null
124
+ let startIndex = -1
137
125
  let matchedLength = 0
138
- let originalBookText = ""
139
- let startIndex = i
140
126
 
141
- // Check full book names
127
+ // Skip whitespace and special characters before checking for book
128
+ while (i < lowerCaseText.length && /[\s—-]/.test(lowerCaseText[i])) {
129
+ i++
130
+ }
131
+ if (i >= lowerCaseText.length) break
132
+
142
133
  for (let j = 0; j < lowercaseBibleFullNames.length; j++) {
143
134
  const book = lowercaseBibleFullNames[j]
144
- if (
145
- lowerCaseText.startsWith(book, i) &&
146
- (i + book.length >= lowerCaseText.length || /\d/.test(lowerCaseText[i + book.length]))
147
- ) {
135
+ if (lowerCaseText.startsWith(book, i) && book.length > matchedLength) {
148
136
  foundBook = fullNames[j]
137
+ startIndex = i
149
138
  matchedLength = book.length
150
- originalBookText = text.slice(i, i + book.length)
151
139
  }
152
140
  }
153
141
 
154
- // Check abbreviations
155
142
  if (!foundBook) {
156
143
  for (let k = 0; k < lowercaseBibleAbbreviations.length; k++) {
157
144
  const abbreviation = lowercaseBibleAbbreviations[k]
158
- const abbrPattern = abbreviation.replace(/\./g, "\\.?")
159
- const regex = new RegExp(`^${abbrPattern}(\\.?\\s*\\d)`, "i")
160
- const match = lowerCaseText.slice(i).match(regex)
161
- if (match) {
145
+ if (lowerCaseText.startsWith(abbreviation, i) && abbreviation.length > matchedLength) {
162
146
  foundBook = this.abbreviations[abbreviations[k]]
163
- matchedLength = match[0].length - match[1].length // Exclude chapter-verse part
164
- originalBookText = text.slice(i, i + matchedLength)
147
+ startIndex = i
148
+ matchedLength = abbreviation.length
165
149
  }
166
150
  }
167
151
  }
@@ -169,90 +153,52 @@ class CodexParser {
169
153
  if (foundBook) {
170
154
  i += matchedLength
171
155
  let chapterVerse = ""
172
- let originalChapterVerseText = ""
173
156
  const references = []
157
+ const startOfReference = startIndex
174
158
 
175
- // Capture chapter-verse until a letter (potential new book) or semicolon
176
159
  while (i < normalizedText.length && isValidChapterVerseChar(normalizedText[i])) {
177
- if (isNextBibleBook(i)) {
178
- break
179
- }
160
+ if (isNextBibleBook(i)) break
180
161
  if (normalizedText[i] === ";") {
181
- const formattedReference = chapterVerse.trim().replace(/[^a-zA-Z0-9:,\-]+$/g, "")
182
- if (formattedReference) {
183
- // Find the last digit in the reference
184
- const lastDigitMatch = formattedReference.match(/\d(?=[^0-9]*$)/)
185
- let endIndex = i - 1 // Default to position before semicolon
186
- if (lastDigitMatch) {
187
- const lastDigitIndex = formattedReference.lastIndexOf(lastDigitMatch[0])
188
- endIndex = startIndex + matchedLength + lastDigitIndex
189
- }
190
- references.push({
191
- reference: formattedReference,
192
- originalText: (originalBookText + originalChapterVerseText).trim(),
193
- startIndex,
194
- endIndex,
195
- })
196
- }
162
+ const formattedReference = chapterVerse.trim().replace(/[^a-zA-Z0-9]+$/, "")
163
+ if (formattedReference) references.push(formattedReference)
197
164
  chapterVerse = ""
198
- originalChapterVerseText = ""
199
- originalBookText = foundBook // Reuse book for semicolon-separated references
200
- startIndex = i + 1 // Start of next reference
201
165
  i++
202
166
  continue
203
167
  }
204
168
  chapterVerse += normalizedText[i]
205
- originalChapterVerseText += text[i]
206
169
  i++
207
170
  }
208
171
 
209
- // Add any remaining reference
210
172
  if (chapterVerse.trim().length > 0) {
211
- const formattedReference = chapterVerse.trim().replace(/[^a-zA-Z0-9:,\-]+$/g, "")
212
- if (formattedReference) {
213
- // Find the last digit in the reference
214
- const lastDigitMatch = formattedReference.match(/\d(?=[^0-9]*$)/)
215
- let endIndex = i - 1 // Default to last character
216
- if (lastDigitMatch) {
217
- const lastDigitIndex = formattedReference.lastIndexOf(lastDigitMatch[0])
218
- endIndex = startIndex + matchedLength + lastDigitIndex
219
- }
220
- references.push({
221
- reference: formattedReference,
222
- originalText: (originalBookText + originalChapterVerseText).trim(),
223
- startIndex,
224
- endIndex,
225
- })
226
- }
173
+ const formattedReference = chapterVerse.trim().replace(/[^a-zA-Z0-9]+$/, "")
174
+ if (formattedReference) references.push(formattedReference)
227
175
  }
228
176
 
229
- // Process each reference
230
- references.forEach((refObj) => {
231
- // Detect version suffix (LXX or MT)
232
- let version = null
233
- let originalText = refObj.originalText
234
- const suffix = detectSuffix(i, text)
235
- if (suffix) {
236
- version = suffix.version
237
- originalText += ` ${version}`
238
- i += suffix.length
239
- // Update endIndex if version suffix follows a digit
240
- if (refObj.endIndex === i - suffix.length - 1) {
241
- refObj.endIndex = i - 1
242
- }
243
- }
177
+ // Set endIndex to the current position
178
+ let endIndex = i
179
+ const suffixData = detectSuffix(i)
180
+ const suffix = suffixData ? suffixData.suffix : null
181
+ if (suffixData) {
182
+ endIndex += suffixData.length
183
+ i += suffixData.length
184
+ }
244
185
 
186
+ // Trim endIndex to exclude trailing whitespace or non-reference characters
187
+ while (endIndex > startOfReference && /[\s]/.test(normalizedText[endIndex - 1])) {
188
+ endIndex--
189
+ }
190
+
191
+ references.forEach((ref) => {
245
192
  let type
246
- let ref = refObj.reference.replace(/^\.\s*/, "") // Remove leading period and space
247
- if (this.config.booksOnly && !ref) {
248
- type = "book_only"
249
- } else if (ref.includes(":")) {
193
+ if (ref.includes(":")) {
250
194
  if (ref.includes("-")) {
251
- const [start, end] = ref.split("-").map((s) => s.trim())
252
- const startParts = start.split(":").map((s) => s.trim())
253
- const endParts = end.split(":").map((s) => s.trim())
195
+ const [start, end] = ref.split("-")
196
+ const startParts = start.split(":")
197
+ const endParts = end.split(":")
254
198
  type =
255
- startParts.length > 1 && endParts.length > 1 && startParts[0] !== endParts[0]
199
+ startParts.length > 1 &&
200
+ endParts.length > 1 &&
201
+ startParts[0].trim() !== endParts[0].trim()
256
202
  ? "multi_chapter_verse_range"
257
203
  : "chapter_verse_range"
258
204
  } else if (ref.includes(",")) {
@@ -262,32 +208,25 @@ class CodexParser {
262
208
  }
263
209
  } else if (ref.includes("-")) {
264
210
  type = "chapter_range"
265
- } else if (/\d/.test(ref)) {
266
- type = "single_chapter"
267
211
  } else {
268
- type = "book_only"
212
+ type = "single_chapter"
269
213
  }
270
214
 
271
- const referenceObj = {
215
+ this.found.push({
272
216
  book: foundBook,
273
217
  reference: ref,
274
- version,
218
+ startIndex: startOfReference + 1,
219
+ endIndex: endIndex + 1,
220
+ version: suffix || null,
275
221
  type,
276
- originalText,
277
- startIndex: refObj.startIndex,
278
- endIndex: refObj.endIndex,
279
- }
280
- this.found.push(referenceObj)
222
+ originalText: text.slice(startOfReference, endIndex), // Use original text
223
+ })
281
224
  })
282
-
283
- // Skip any trailing spaces after the reference
284
- while (i < lowerCaseText.length && /\s/.test(lowerCaseText[i])) {
285
- i++
286
- }
287
225
  } else {
288
226
  i++
289
227
  }
290
228
  }
229
+
291
230
  return this
292
231
  }
293
232