wikitextparser 0.55.11__tar.gz → 0.55.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/CHANGELOG.rst +4 -0
  2. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/PKG-INFO +1 -1
  3. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/__init__.py +1 -1
  4. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_wikitext.py +81 -70
  5. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/.coveragerc +0 -0
  6. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/.github/workflows/tests.yml +0 -0
  7. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/.gitignore +0 -0
  8. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/.readthedocs.yaml +0 -0
  9. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/LICENSE.md +0 -0
  10. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/README.rst +0 -0
  11. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/CHANGELOG.rst +0 -0
  12. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/Makefile +0 -0
  13. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/README.rst +0 -0
  14. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/conf.py +0 -0
  15. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/index.rst +0 -0
  16. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/make.bat +0 -0
  17. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/pyproject.toml +0 -0
  18. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_argument.py +0 -0
  19. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_cell.py +0 -0
  20. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_comment_bold_italic.py +0 -0
  21. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_config.py +0 -0
  22. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_externallink.py +0 -0
  23. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_parameter.py +0 -0
  24. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_parser_function.py +0 -0
  25. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_section.py +0 -0
  26. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_spans.py +0 -0
  27. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_table.py +0 -0
  28. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_tag.py +0 -0
  29. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_template.py +0 -0
  30. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_wikilink.py +0 -0
  31. {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_wikilist.py +0 -0
@@ -1,3 +1,7 @@
1
+ v0.55.12
2
+ --------
3
+ * Performance improvements in extracting bold and italic nodes. (#133)
4
+
1
5
  v0.55.11
2
6
  --------
3
7
  * Performance improvements in ``__setitem__``/``__delitem__`` and ``pformat``/``plain_text`` methods. (#131)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wikitextparser
3
- Version: 0.55.11
3
+ Version: 0.55.12
4
4
  Summary: A simple parsing tool for MediaWiki's wikitext markup.
5
5
  Keywords: MediaWiki,wikitext,parser
6
6
  Author-email: 5j9 <5j9@users.noreply.github.com>
@@ -1,5 +1,5 @@
1
1
  # Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
2
- __version__ = '0.55.11'
2
+ __version__ = '0.55.12'
3
3
 
4
4
  from . import _wikitext
5
5
  from ._argument import Argument # noqa: F401
@@ -1,5 +1,4 @@
1
1
  from bisect import bisect_left, bisect_right, insort_right
2
- from copy import deepcopy
3
2
  from html import unescape
4
3
  from itertools import compress, islice
5
4
  from operator import attrgetter
@@ -114,10 +113,11 @@ TABLE_FINDITER = rc(
114
113
  DOTALL | MULTILINE | VERBOSE,
115
114
  ).finditer
116
115
 
117
- BOLD_ITALIC_FINDITER = rc( # bold-italic, bold, or italic tokens
118
- rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)|($)""",
116
+ substitute_apostrophes = rc( # bold-italic, bold, or italic tokens
117
+ rb"('\0*+){2,}+(?=[^']|$)",
119
118
  MULTILINE | VERBOSE,
120
- ).finditer
119
+ ).sub
120
+ find_lines = rc(rb'(.*?)$').finditer
121
121
 
122
122
  BOLD_FINDITER = rc(
123
123
  rb"""
@@ -621,15 +621,12 @@ class WikiText:
621
621
  self.string.
622
622
  """
623
623
  ss, se, _, _ = self._span_data
624
- if ss == 0 and se == len(self._lststr[0]):
625
- return deepcopy(self._type_to_spans)
626
624
  return {
627
625
  type_: [
628
626
  [s - ss, e - ss, m, ba[:] if ba is not None else None]
629
627
  for s, e, m, ba in spans[
630
- bisect_left(spans, [ss]) : bisect_left(spans, [se])
628
+ bisect_right(spans, [ss]) : bisect_right(spans, [se])
631
629
  ]
632
- if e <= se
633
630
  ]
634
631
  for type_, spans in self._type_to_spans.items()
635
632
  }
@@ -1012,66 +1009,82 @@ class WikiText:
1012
1009
  ]
1013
1010
 
1014
1011
  @property
1015
- def _balanced_quotes_shadow(self):
1016
- """Return bold and italic match objects according MW's algorithm.
1012
+ def _balanced_quotes_shadow(self) -> bytearray:
1013
+ """Return a byte array with non-markup-apostrophes removed.
1017
1014
 
1018
1015
  The comments at /includes/parser/Parser.php:doQuotes are helpful:
1019
1016
  https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
1020
1017
  https://phabricator.wikimedia.org/T15227#178834
1021
1018
  """
1022
- bold_matches = []
1019
+ bold_starts: List[int] = []
1023
1020
  odd_italics = False
1024
1021
  odd_bold_italics = False
1025
- shadow_copy = self._shadow[:]
1026
- append_match = bold_matches.append
1027
- for m in BOLD_ITALIC_FINDITER(shadow_copy):
1028
- if m[4] is not None: # newline or string end
1029
- if (
1030
- odd_italics is True
1031
- and (len(bold_matches) + odd_bold_italics) % 2
1032
- ):
1033
- # one of the bold marks needs to be interpreted as italic
1034
- first_multi_letter_word = first_space = None
1035
- for bold_match in bold_matches:
1036
- bold_start = bold_match.start()
1037
- if shadow_copy[bold_start - 1 : bold_start] == b' ':
1038
- if first_space is None:
1039
- first_space = bold_start
1040
- continue
1041
- if (
1042
- shadow_copy[bold_start - 2 : bold_start - 1]
1043
- == b' '
1044
- ):
1045
- shadow_copy[bold_start] = 95 # _
1046
- break # first_single_letter_word
1047
- if first_multi_letter_word is None:
1048
- first_multi_letter_word = bold_start
1049
- continue
1050
- else: # there was no first_single_letter_word
1051
- if first_multi_letter_word is not None:
1052
- shadow_copy[first_multi_letter_word] = 95 # _
1053
- elif first_space is not None:
1054
- shadow_copy[first_space] = 95 # _
1055
- bold_matches.clear()
1056
- odd_italics = False
1057
- continue
1058
- if m[2] is None: # italic
1022
+ append_bold_start = bold_starts.append
1023
+
1024
+ def process_line(line: bytes) -> bytes:
1025
+ nonlocal odd_italics, odd_bold_italics
1026
+ if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
1027
+ # one of the bold marks needs to be interpreted as italic
1028
+ first_multi_letter_word = first_space = None
1029
+ for s in bold_starts:
1030
+ if line[s - 1] == 32: # space
1031
+ if first_space is None:
1032
+ first_space = s
1033
+ continue
1034
+ if line[s - 2] == 32: # space
1035
+ line = line[:s] + b' ' + line[s + 1 :]
1036
+ break # first_single_letter_word
1037
+ if first_multi_letter_word is None:
1038
+ first_multi_letter_word = s
1039
+ continue
1040
+ else: # there was no first_single_letter_word
1041
+ if first_multi_letter_word is not None:
1042
+ line = (
1043
+ line[:first_multi_letter_word]
1044
+ + b'_'
1045
+ + line[first_multi_letter_word + 1 :]
1046
+ )
1047
+ elif first_space is not None:
1048
+ line = (
1049
+ line[:first_space] + b'_' + line[first_space + 1 :]
1050
+ )
1051
+ # reset state for the next line
1052
+ bold_starts.clear()
1053
+ odd_italics = False
1054
+ odd_bold_italics = False
1055
+ return line
1056
+
1057
+ def process_apostrophes(m) -> bytes:
1058
+ nonlocal odd_italics, odd_bold_italics
1059
+ starts = m.starts(1)
1060
+ n = len(starts)
1061
+ if n == 2: # italic
1059
1062
  odd_italics ^= True
1060
- continue
1061
- if m[3] is None: # bold
1062
- s, e = m.span(1)
1063
- if s != e: # four apostrophes, hide the first one
1064
- shadow_copy[s] = 95 # _
1065
- append_match(m)
1066
- continue
1067
- # bold-italic
1068
- s, e = m.span(1)
1069
- es = e - s
1070
- if es: # more than 5 apostrophes, hide the previous ones
1071
- shadow_copy[s:e] = b'_' * es
1072
- odd_bold_italics ^= True
1073
- odd_italics ^= True
1074
- return shadow_copy
1063
+ return m[0]
1064
+ if n == 3: # bold
1065
+ append_bold_start(starts[0])
1066
+ return m[0]
1067
+ if n == 5:
1068
+ odd_bold_italics ^= True
1069
+ odd_italics ^= True
1070
+ return m[0]
1071
+ if n == 4: # four apostrophes -> hide the first one
1072
+ s = starts[1]
1073
+ append_bold_start(s)
1074
+ return b'_' * (s - starts[0]) + m.string[s : m.end()]
1075
+ if n > 5: # more than 5 apostrophes -> hide the prior ones
1076
+ odd_bold_italics ^= True
1077
+ odd_italics ^= True
1078
+ s = starts[-5]
1079
+ return b'_' * (s - starts[0]) + m.string[s : m.end()]
1080
+ raise # execution should never reach here
1081
+
1082
+ return bytearray(b'\n').join(
1083
+ [
1084
+ process_line(substitute_apostrophes(process_apostrophes, line))
1085
+ for line in self._shadow.splitlines()
1086
+ ]
1087
+ )
1075
1088
 
1076
1089
  def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
1077
1090
  for prop in (
@@ -1123,8 +1136,8 @@ class WikiText:
1123
1136
  bold_spans = tts_setdefault('Bold', [])
1124
1137
  get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
1125
1138
  bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
1126
- for match in bold_matches:
1127
- ms, me = match.span()
1139
+ for m in bold_matches:
1140
+ ms, me = m.span()
1128
1141
  b, e = s + ms, s + me
1129
1142
  old_span = get_old_bold_span((b, e))
1130
1143
  if old_span is None:
@@ -1146,16 +1159,16 @@ class WikiText:
1146
1159
  # filter_cls is None or filter_cls is Italic
1147
1160
 
1148
1161
  # remove bold tokens before searching for italics
1149
- for match in bold_matches:
1150
- ms, me = match.span()
1151
- cs, ce = match.span(1) # content
1162
+ for m in bold_matches:
1163
+ ms, me = m.span()
1164
+ cs, ce = m.span(1) # content
1152
1165
  balanced_shadow[ms:cs] = b'_' * (cs - ms)
1153
1166
  balanced_shadow[ce:me] = b'_' * (me - ce)
1154
1167
 
1155
1168
  italic_spans = tts_setdefault('Italic', [])
1156
1169
  get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
1157
- for match in ITALIC_FINDITER(balanced_shadow, rs, re):
1158
- ms, me = match.span()
1170
+ for m in ITALIC_FINDITER(balanced_shadow, rs, re):
1171
+ ms, me = m.span()
1159
1172
  b, e = span = s + ms, s + me
1160
1173
  old_span = get_old_italic_span(span)
1161
1174
  if old_span is None:
@@ -1164,9 +1177,7 @@ class WikiText:
1164
1177
  else:
1165
1178
  span = old_span
1166
1179
  append(
1167
- Italic(
1168
- _lststr, type_to_spans, span, 'Bold', me != match.end(1)
1169
- )
1180
+ Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
1170
1181
  )
1171
1182
  if recursive and filter_cls is Italic:
1172
1183
  self._bolds_italics_recurse(result, filter_cls)