versiref 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
versiref/py.typed ADDED
File without changes
versiref/ref_parser.py ADDED
@@ -0,0 +1,578 @@
1
+ """Bible reference parsing for versiref.
2
+
3
+ This module provides the RefParser class for parsing Bible references from strings.
4
+
5
+ We don't call pp.ParserElement.enablePackrat() because it made parsing slower.
6
+ """
7
+
8
+ from typing import Callable, Generator, Optional
9
+
10
+ import pyparsing as pp
11
+ from pyparsing import common
12
+
13
+ from versiref.bible_ref import BibleRef, SimpleBibleRef, VerseRange
14
+ from versiref.ref_style import RefStyle
15
+ from versiref.versification import Versification
16
+
17
+
18
+ def _get_int(tokens: pp.ParseResults, name: str, default: int) -> int:
19
+ """Get an integer from the parsed tokens in a type-safe manner."""
20
+ if name in tokens:
21
+ return int(tokens[name])
22
+ else:
23
+ return default
24
+
25
+
26
+ def _get_str(tokens: pp.ParseResults, name: str, default: str) -> str:
27
+ """Get an integer from the parsed tokens in a type-safe manner."""
28
+ if name in tokens:
29
+ return str(tokens[name])
30
+ else:
31
+ return default
32
+
33
+
34
class RefParser:
    """Parser for Bible references.

    RefParser uses pyparsing to build a parser that recognizes Bible references
    according to a given style. It can parse a whole string into a
    SimpleBibleRef (one book) or a BibleRef (one or more books), scan a longer
    text for references, and substitute references found in a text.
    """

    def __init__(
        self, style: RefStyle, versification: Versification, strict: bool = False
    ):
        """Initialize a RefParser with a style and versification.

        Args:
            style: The RefStyle to use for parsing
            versification: The Versification to use for determining single-chapter books
            strict: If True, follow the style more closely.

        Non-strict parsers currently recognize as range separators hyphens, en
        dashes, and the style's range separator if it differs from these.

        """
        self.style = style
        self.versification = versification
        self.strict = strict

        # Build the parser
        self._build_parser()

    def _following_marker(self) -> pp.ParserElement:
        """Build the expression for a "following verse(s)" marker.

        The literals carry *results names* (not just debug names) so that the
        parse actions can detect them with ``"following_verse" in tokens``.
        The trailing negative lookahead acts as a word boundary so that the
        marker is not matched at the start of an ordinary word.
        """
        return (
            pp.Literal(self.style.following_verses).set_results_name(
                "following_verses"
            )
            | pp.Literal(self.style.following_verse).set_results_name(
                "following_verse"
            )
        ) + ~pp.Char(pp.identbodychars)  # word boundary

    def _build_parser(self) -> None:
        """Build the pyparsing parser based on the style and versification.

        Sets ``self.simple_ref_parser`` (one book) and ``self.bible_ref_parser``
        (one or more books).
        """
        # Define basic elements
        book = pp.one_of(list(self.style.recognized_names.keys()))
        # Parse name to book ID.
        book.set_parse_action(lambda t: self.style.recognized_names[t[0]])
        chapter = common.integer
        verse = common.integer
        subverse = pp.Word(pp.alphas.lower(), max=2).leave_whitespace() + pp.WordEnd(
            pp.alphas.lower()
        )
        optional_subverse = pp.Opt(subverse.copy()).set_parse_action(
            lambda t: t[0] if t else ""
        )
        subverse.set_parse_action(lambda t: t[0] if t else "")
        if self.strict:
            range_separator = pp.Suppress(self.style.range_separator)
        else:
            range_separators = ["-", "\N{EN DASH}"]
            if self.style.range_separator not in range_separators:
                range_separators.append(self.style.range_separator)
            range_separator = pp.Suppress(pp.one_of(range_separators))
        # Empty marker to record location
        location_marker = pp.Empty().set_parse_action(lambda s, loc, t: loc)
        # Use the separator configured on this parser's style instance, the
        # same way chapter_separator is read below.
        verse_range_delim = self.style.verse_range_separator.strip()

        # For now, we only parse ranges of a single verse.
        verse_range = (
            verse.copy().set_results_name("start_verse")
            + optional_subverse.set_results_name("start_subverse")
            + pp.Opt(
                self._following_marker()
                | (
                    range_separator
                    + (
                        # Either a full chapter:verse reference
                        (
                            pp.Opt(
                                chapter.copy().set_results_name("end_chapter")
                                + self.style.chapter_verse_separator
                            )
                            + verse.copy().set_results_name("end_verse")
                            + optional_subverse.copy().set_results_name("end_subverse")
                        )
                        # Or just a subverse of the same verse
                        | subverse.copy().set_results_name("end_subverse")
                    )
                )
            )
            + location_marker.copy().set_results_name("end_location")
        ).set_parse_action(self._make_verse_range)

        verse_ranges = pp.DelimitedList(
            verse_range, delim=pp.Suppress(verse_range_delim)
        ).set_results_name("verse_ranges")

        chapter_range = (
            chapter.copy().set_results_name("start_chapter")
            + pp.Suppress(self.style.chapter_verse_separator)
            + location_marker.copy().set_results_name("verse_ranges_location")
            + verse_ranges
        ).set_parse_action(self._make_chapter_range)

        chapter_ranges = pp.DelimitedList(
            chapter_range, delim=pp.Suppress(self.style.chapter_separator.strip())
        ).set_results_name("chapter_ranges")

        book_chapter_verse_ranges = (
            book.copy().set_results_name("book")
            + location_marker.copy().set_results_name("chapter_ranges_location")
            + chapter_ranges
            + location_marker.copy().set_results_name("end_location")
        ).set_parse_action(self._make_simple_ref)

        # The chapter can be omitted for single-chapter (sc) books
        sc_books = [
            name
            for name, book_id in self.style.recognized_names.items()
            if self.versification.is_single_chapter(book_id)
        ]
        sc_book = pp.one_of(sc_books).set_results_name("book")
        sc_book.set_parse_action(lambda t: self.style.recognized_names[t[0]])

        sc_verse_range = (
            verse.copy().set_results_name("start_verse")
            + optional_subverse.copy().set_results_name("start_subverse")
            + pp.Opt(
                # Same marker (including the word boundary) as verse_range.
                self._following_marker()
                | (
                    range_separator
                    + (
                        (
                            verse.copy().set_results_name("end_verse")
                            + optional_subverse.copy().set_results_name("end_subverse")
                        )
                        | subverse.copy().set_results_name("end_subverse")
                    )
                )
            )
            + location_marker.copy().set_results_name("end_location")
        ).set_parse_action(self._make_sc_verse_range)

        # Named "chapter_ranges" so that _make_simple_ref can treat both
        # grammars alike.
        sc_verse_ranges = pp.DelimitedList(
            sc_verse_range, delim=pp.Suppress(verse_range_delim)
        ).set_results_name("chapter_ranges")

        sc_book_verse_ranges = (
            sc_book
            + location_marker.copy().set_results_name("chapter_ranges_location")
            + sc_verse_ranges
            + location_marker.copy().set_results_name("end_location")
        ).set_parse_action(self._make_simple_ref)

        # Try the parser with longer matches first, lest Jude 1:5 parse as Jude 1.
        self.simple_ref_parser = book_chapter_verse_ranges | sc_book_verse_ranges

        # Now it's simple to build a parser for BibleRef. The separator is
        # stripped here just as it is for chapter_ranges above.
        self.bible_ref_parser = (
            pp.DelimitedList(
                self.simple_ref_parser, self.style.chapter_separator.strip()
            )
            + location_marker.copy().set_results_name("end_location")
        ).set_parse_action(self._make_bible_ref)

    def _detect_following(
        self, tokens: pp.ParseResults, start_subverse: str
    ) -> tuple[str, bool, bool]:
        """Work out whether a range ends with a "following verse(s)" marker.

        The marker may have been matched explicitly (it then appears as a named
        result) or it may have been consumed by the subverse expression, in
        which case the start subverse equals the marker text and there is no
        end verse.

        Args:
            tokens: The tokens of one verse range
            start_subverse: The start subverse as parsed

        Returns:
            (start_subverse, has_following_verse, has_following_verses), where
            start_subverse is emptied if it was really a marker.

        """
        no_end_verse = "end_verse" not in tokens
        # Handle following_verse(s) mis-parsed as subverse.
        if "following_verse" in tokens:
            has_following_verse = True
        elif start_subverse == self.style.following_verse and no_end_verse:
            start_subverse = ""
            has_following_verse = True
        else:
            has_following_verse = False
        if "following_verses" in tokens:
            has_following_verses = True
        elif start_subverse == self.style.following_verses and no_end_verse:
            start_subverse = ""
            has_following_verses = True
        else:
            has_following_verses = False
        return start_subverse, has_following_verse, has_following_verses

    def _make_verse_range(
        self, original_text: str, loc: int, tokens: pp.ParseResults
    ) -> VerseRange:
        """Create a VerseRange from parsed tokens.

        Chapter numbers that cannot be determined locally are set to -1.
        A "following verses" range gets an end verse of -1.
        This is a parse action for use with pyparsing.

        Returns:
            A VerseRange instance based on the parsed tokens

        """
        start_chapter = _get_int(tokens, "start_chapter", -1)
        start_verse = tokens.start_verse
        start_subverse, has_following_verse, has_following_verses = (
            self._detect_following(tokens, tokens.start_subverse)
        )
        # Now set end based on type of range.
        if has_following_verse or has_following_verses:
            end_chapter = start_chapter
            end_verse = start_verse + 1 if has_following_verse else -1
            end_subverse = ""
        else:
            end_chapter = _get_int(tokens, "end_chapter", start_chapter)
            end_verse = _get_int(tokens, "end_verse", start_verse)
            end_subverse = _get_str(tokens, "end_subverse", start_subverse)
        end_location = _get_int(tokens, "end_location", -1)
        range_original_text = original_text[loc:end_location].strip()
        return VerseRange(
            start_chapter=start_chapter,
            start_verse=start_verse,
            start_subverse=start_subverse,
            end_chapter=end_chapter,
            end_verse=end_verse,
            end_subverse=end_subverse,
            original_text=range_original_text,
        )

    @staticmethod
    def _make_chapter_range(
        original_text: str, loc: int, tokens: pp.ParseResults
    ) -> pp.ParseResults:
        """Set the chapter for the verse ranges.

        Here we supply chapter numbers that cannot be determined locally.
        This is a parse action for use with pyparsing.

        Returns:
            The verse ranges, with chapters filled in and the first range's
            original text extended to include the chapter number

        """
        this_chapter = tokens.start_chapter
        verse_ranges = tokens.verse_ranges
        for verse_range in verse_ranges:
            # Set the chapter for each verse range
            verse_range.start_chapter = this_chapter
            if verse_range.end_chapter < 0:
                verse_range.end_chapter = this_chapter
            else:
                # A range that crosses into another chapter changes the
                # chapter for the ranges that follow it.
                this_chapter = verse_range.end_chapter
        if verse_ranges:
            # Expand the original text for the first verse range to include the
            # chapter number. Why do we need to use find()? Because there could
            # be whitespace after verse_ranges_location.
            verse_ranges_location = _get_int(tokens, "verse_ranges_location", loc)
            first_text = verse_ranges[0].original_text
            range_0_start = original_text.find(first_text, verse_ranges_location)
            verse_ranges[0].original_text = original_text[
                loc : range_0_start + len(first_text)
            ]
        assert isinstance(verse_ranges, pp.ParseResults)
        return verse_ranges

    def _make_sc_verse_range(
        self, original_text: str, loc: int, tokens: pp.ParseResults
    ) -> VerseRange:
        """Create a VerseRange from parsed tokens.

        This is for a single-chapter book, so both chapters are always 1.
        This is a parse action for use with pyparsing.

        Returns:
            A VerseRange instance based on the parsed tokens

        """
        start_chapter = 1
        end_chapter = 1
        start_verse = tokens.start_verse
        start_subverse, has_following_verse, has_following_verses = (
            self._detect_following(tokens, tokens.start_subverse)
        )
        # Now set end based on type of range.
        if has_following_verse or has_following_verses:
            end_verse = start_verse + 1 if has_following_verse else -1
            end_subverse = ""
        else:
            end_verse = _get_int(tokens, "end_verse", start_verse)
            end_subverse = _get_str(tokens, "end_subverse", start_subverse)
        end_location = _get_int(tokens, "end_location", -1)
        range_original_text = original_text[loc:end_location].strip()
        return VerseRange(
            start_chapter=start_chapter,
            start_verse=start_verse,
            start_subverse=start_subverse,
            end_chapter=end_chapter,
            end_verse=end_verse,
            end_subverse=end_subverse,
            original_text=range_original_text,
        )

    @staticmethod
    def _make_simple_ref(
        original_text: str, loc: int, tokens: pp.ParseResults
    ) -> SimpleBibleRef:
        """Create a SimpleBibleRef from parsed tokens.

        This is a parse action for use with pyparsing.

        Returns:
            A SimpleBibleRef instance based on the parsed tokens

        """
        # Extract the book ID and verse ranges
        book_name = tokens.book
        if "chapter_ranges" in tokens:
            verse_ranges = tokens.chapter_ranges
        else:
            verse_ranges = tokens.verse_ranges
        if verse_ranges:
            # Expand the original text for the first verse range to include the
            # book name. Why do we need to use find()? Because there could
            # be whitespace after chapter_ranges_location.
            chapter_ranges_location = _get_int(tokens, "chapter_ranges_location", loc)
            first_text = verse_ranges[0].original_text
            range_0_start = original_text.find(first_text, chapter_ranges_location)
            verse_ranges[0].original_text = original_text[
                loc : range_0_start + len(first_text)
            ]
        end_location = _get_int(tokens, "end_location", -1)
        ref_original_text = original_text[loc:end_location].strip()

        # Create a SimpleBibleRef with the parsed data
        return SimpleBibleRef(
            book_id=book_name,
            ranges=verse_ranges.as_list(),
            original_text=ref_original_text,
        )

    def _make_bible_ref(
        self, original_text: str, loc: int, tokens: pp.ParseResults
    ) -> BibleRef:
        """Create a BibleRef from parsed tokens.

        This is a parse action for use with pyparsing.

        Returns:
            A BibleRef instance based on the parsed tokens

        """
        end_location = _get_int(tokens, "end_location", -1)
        # One token (end_location) is not a SimpleBibleRef
        simple_refs = [r for r in tokens if isinstance(r, SimpleBibleRef)]
        ref_original_text = original_text[loc:end_location].strip()

        # Create a BibleRef with the parsed data
        return BibleRef(
            versification=self.versification,
            simple_refs=simple_refs,
            original_text=ref_original_text,
        )

    def parse_simple(self, text: str, silent: bool = True) -> Optional[SimpleBibleRef]:
        """Parse a string to produce a SimpleBibleRef.

        This method attempts to parse the entire string as a reference to a single book of the Bible.

        Args:
            text: The string to parse
            silent: If True, return None on failure instead of raising a pyparsing.ParseException

        Returns:
            A SimpleBibleRef instance, or None if parsing fails

        Raises:
            pyparsing.ParseException: If parsing fails and silent is False

        """
        try:
            result = self.simple_ref_parser.parse_string(text, parse_all=True)
        except pp.ParseException:
            if silent:
                return None
            raise
        ref = result[0]
        assert isinstance(ref, SimpleBibleRef)
        return ref

    def parse(self, text: str, silent: bool = True) -> Optional[BibleRef]:
        """Parse a string to produce a BibleRef.

        This method attempts to parse the entire string as a reference to one or more books of the Bible.

        Args:
            text: The string to parse
            silent: If True, return None on failure instead of raising a pyparsing.ParseException

        Returns:
            A BibleRef instance, or None if parsing fails

        Raises:
            pyparsing.ParseException: If parsing fails and silent is False

        """
        try:
            result = self.bible_ref_parser.parse_string(text, parse_all=True)
        except pp.ParseException:
            if silent:
                return None
            raise
        ref = result[0]
        assert isinstance(ref, BibleRef)
        return ref

    def scan_string_simple(
        self, text: str, as_ranges: bool = False
    ) -> Generator[tuple["SimpleBibleRef", int, int], None, None]:
        """Scan a string for SimpleBibleRefs.

        This method scans the entire string for references to a single book of the Bible.

        Args:
            text: The string to scan
            as_ranges: If True, yield a SimpleBibleRef for each verse range

        Yields:
            A reference and the start and end of its location in text.
            (ref: SimpleBibleRef, start: int, end: int)

        """
        # The end reported by pyparsing is not used: the end of each match is
        # computed from the length of the reference's original text instead.
        for tokens, start, _end in self.simple_ref_parser.scan_string(text):
            ref = tokens[0]
            assert isinstance(ref, SimpleBibleRef)
            if as_ranges:
                next_start = start
                for range_ref in ref.range_refs():
                    # Use the original text to find the start and end.
                    assert range_ref.original_text is not None
                    range_start = text.find(range_ref.original_text, next_start)
                    assert range_start >= 0
                    next_start = range_start + len(range_ref.original_text)
                    yield (range_ref, range_start, next_start)
            else:
                assert ref.original_text is not None
                yield (ref, start, start + len(ref.original_text))

    def scan_string(
        self, text: str, as_ranges: bool = False
    ) -> Generator[tuple["BibleRef", int, int], None, None]:
        """Scan a string for BibleRefs.

        This method scans the entire string for references to one or more books of the Bible.

        Args:
            text: The string to scan
            as_ranges: If True, yield a BibleRef for each verse range

        Yields:
            A reference and the start and end of its location in text.
            (ref: BibleRef, start: int, end: int)

        """
        # The end reported by pyparsing is not used: the end of each match is
        # computed from the length of the reference's original text instead.
        for tokens, start, _end in self.bible_ref_parser.scan_string(text):
            ref = tokens[0]
            assert isinstance(ref, BibleRef)
            if as_ranges:
                next_start = start
                for range_ref in ref.range_refs():
                    # Use the original text to find the start and end.
                    assert range_ref.original_text is not None
                    range_start = text.find(range_ref.original_text, next_start)
                    assert range_start >= 0
                    next_start = range_start + len(range_ref.original_text)
                    yield (range_ref, range_start, next_start)
            else:
                assert ref.original_text is not None
                yield (ref, start, start + len(ref.original_text))

    def sub_refs_simple(
        self,
        text: str,
        callback: Callable[[SimpleBibleRef], Optional[str]],
        as_ranges: bool = False,
    ) -> str:
        """Substitute SimpleBibleRefs in a string.

        This method scans the entire string for references to a single book of the Bible and
        applies a callback function to each reference.

        Args:
            text: The string to scan
            callback: A function that takes a SimpleBibleRef and returns a string or None
                If None is returned, the reference is not replaced.
            as_ranges: If True, call the callback with a SimpleBibleRef for each verse range

        Returns:
            The modified string

        """
        result = []
        last_end = 0
        for ref, start, end in self.scan_string_simple(text, as_ranges):
            replacement = callback(ref)
            if replacement is not None:
                # Copy the unchanged text before the reference, then the
                # replacement; skipped references are copied with later text.
                result.append(text[last_end:start])
                result.append(replacement)
                last_end = end
        result.append(text[last_end:])
        return "".join(result)

    def sub_refs(
        self,
        text: str,
        callback: Callable[[BibleRef], Optional[str]],
        as_ranges: bool = False,
    ) -> str:
        """Substitute BibleRefs in a string.

        This method scans the entire string for references to one or more books of the Bible and
        applies a callback function to each reference.

        Args:
            text: The string to scan
            callback: A function that takes a BibleRef and returns a string or None
                If None is returned, the reference is not replaced.
            as_ranges: If True, call the callback with a BibleRef for each verse range

        Returns:
            The modified string

        """
        result = []
        last_end = 0
        for ref, start, end in self.scan_string(text, as_ranges):
            replacement = callback(ref)
            if replacement is not None:
                # Copy the unchanged text before the reference, then the
                # replacement; skipped references are copied with later text.
                result.append(text[last_end:start])
                result.append(replacement)
                last_end = end
        result.append(text[last_end:])
        return "".join(result)