wikitextparser 0.55.11__tar.gz → 0.55.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/CHANGELOG.rst +8 -0
  2. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/PKG-INFO +1 -1
  3. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/__init__.py +1 -1
  4. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_parser_function.py +1 -4
  5. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_section.py +2 -2
  6. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_wikitext.py +76 -77
  7. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/.coveragerc +0 -0
  8. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/.github/workflows/tests.yml +0 -0
  9. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/.gitignore +0 -0
  10. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/.readthedocs.yaml +0 -0
  11. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/LICENSE.md +0 -0
  12. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/README.rst +0 -0
  13. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/CHANGELOG.rst +0 -0
  14. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/Makefile +0 -0
  15. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/README.rst +0 -0
  16. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/conf.py +0 -0
  17. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/index.rst +0 -0
  18. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/make.bat +0 -0
  19. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/pyproject.toml +0 -0
  20. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_argument.py +0 -0
  21. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_cell.py +0 -0
  22. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_comment_bold_italic.py +0 -0
  23. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_config.py +0 -0
  24. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_externallink.py +0 -0
  25. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_parameter.py +0 -0
  26. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_spans.py +0 -0
  27. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_table.py +0 -0
  28. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_tag.py +0 -0
  29. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_template.py +0 -0
  30. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_wikilink.py +0 -0
  31. {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_wikilist.py +0 -0
@@ -1,3 +1,11 @@
1
+ v0.55.13
2
+ --------
3
+ * Fixed a bug in ``Section.level`` resulting in malformed section titles when multiple levels are added (#135)
4
+
5
+ v0.55.12
6
+ --------
7
+ * Performance improvements in extracting bold and italic nodes. (#133)
8
+
1
9
  v0.55.11
2
10
  --------
3
11
  * Performance improvements in ``__setitem__``/``__delitem__`` and ``pformat``/``plain_text`` methods. (#131)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wikitextparser
3
- Version: 0.55.11
3
+ Version: 0.55.13
4
4
  Summary: A simple parsing tool for MediaWiki's wikitext markup.
5
5
  Keywords: MediaWiki,wikitext,parser
6
6
  Author-email: 5j9 <5j9@users.noreply.github.com>
@@ -1,5 +1,5 @@
1
1
  # Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
2
- __version__ = '0.55.11'
2
+ __version__ = '0.55.13'
3
3
 
4
4
  from . import _wikitext
5
5
  from ._argument import Argument # noqa: F401
@@ -60,10 +60,7 @@ class SubWikiTextWithArgs(SubWikiText):
60
60
  else:
61
61
  arg_span = old_span
62
62
  arg = Argument(lststr, type_to_spans, arg_span, type_, self)
63
- arg._shadow_cache = (
64
- lststr[0][s:e],
65
- shadow[arg_self_start:arg_self_end],
66
- )
63
+ arg._span_data[3] = shadow[arg_self_start:arg_self_end]
67
64
  arguments_append(arg)
68
65
  return arguments
69
66
 
@@ -42,9 +42,9 @@ class Section(SubWikiText):
42
42
  if level_diff == 0:
43
43
  return
44
44
  if level_diff < 0:
45
- new_equals = '=' * abs(level_diff)
45
+ new_equals = '=' * -level_diff
46
46
  self.insert(0, new_equals)
47
- self.insert(m.end(2) + 1, new_equals)
47
+ self.insert(m.end(2) - level_diff, new_equals)
48
48
  return
49
49
  del self[:level_diff]
50
50
  del self[m.end(2) : m.end(2) + level_diff]
@@ -1,5 +1,4 @@
1
1
  from bisect import bisect_left, bisect_right, insort_right
2
- from copy import deepcopy
3
2
  from html import unescape
4
3
  from itertools import compress, islice
5
4
  from operator import attrgetter
@@ -41,7 +40,6 @@ from ._spans import (
41
40
  END_TAG_PATTERN,
42
41
  EXTERNAL_LINK_URL_TAIL,
43
42
  INVALID_URL_CHARS,
44
- PARSABLE_TAG_EXTENSION_NAME,
45
43
  START_TAG_PATTERN,
46
44
  parse_to_spans,
47
45
  rc,
@@ -53,10 +51,6 @@ NAME_CAPTURING_HTML_START_TAG_FINDITER = rc(
53
51
  )
54
52
  ).finditer
55
53
 
56
- PARSABLE_TAG_EXTENSIONS_MATCH = rc(
57
- rb'<' + PARSABLE_TAG_EXTENSION_NAME + rb'\b', IGNORECASE
58
- ).match
59
-
60
54
  # External links
61
55
  BRACKET_EXTERNAL_LINK_SCHEMES = regex_pattern(
62
56
  _bare_external_link_schemes | {'//'}
@@ -114,10 +108,7 @@ TABLE_FINDITER = rc(
114
108
  DOTALL | MULTILINE | VERBOSE,
115
109
  ).finditer
116
110
 
117
- BOLD_ITALIC_FINDITER = rc( # bold-italic, bold, or italic tokens
118
- rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)|($)""",
119
- MULTILINE | VERBOSE,
120
- ).finditer
111
+ substitute_apostrophes = rc(rb"('\0*+){2,}+(?=[^']|$)", MULTILINE).sub
121
112
 
122
113
  BOLD_FINDITER = rc(
123
114
  rb"""
@@ -213,7 +204,7 @@ class WikiText:
213
204
  # The following class attribute acts as a default value.
214
205
  _type = 'WikiText'
215
206
 
216
- __slots__ = '_type_to_spans', '_lststr', '_span_data', '_shadow_cache'
207
+ __slots__ = '_type_to_spans', '_lststr', '_span_data'
217
208
 
218
209
  def __init__(
219
210
  self,
@@ -241,7 +232,6 @@ class WikiText:
241
232
  if _type not in SPAN_PARSER_TYPES:
242
233
  type_to_spans = self._type_to_spans = parse_to_spans(byte_array)
243
234
  type_to_spans[_type] = [span]
244
- self._shadow_cache = string, byte_array
245
235
  else:
246
236
  # In SPAN_PARSER_TYPES, we can't pass the original byte_array to
247
237
  # parser to generate the shadow because it will replace the whole
@@ -259,7 +249,6 @@ class WikiText:
259
249
  byte_array[0] = 3
260
250
  byte_array[-1] = 32
261
251
  type_to_spans = parse_to_spans(byte_array)
262
- self._shadow_cache = string, byte_array
263
252
  type_to_spans[_type].insert(0, span)
264
253
  self._type_to_spans = type_to_spans
265
254
  if type(self) is Parameter:
@@ -621,15 +610,12 @@ class WikiText:
621
610
  self.string.
622
611
  """
623
612
  ss, se, _, _ = self._span_data
624
- if ss == 0 and se == len(self._lststr[0]):
625
- return deepcopy(self._type_to_spans)
626
613
  return {
627
614
  type_: [
628
615
  [s - ss, e - ss, m, ba[:] if ba is not None else None]
629
616
  for s, e, m, ba in spans[
630
- bisect_left(spans, [ss]) : bisect_left(spans, [se])
617
+ bisect_right(spans, [ss]) : bisect_right(spans, [se])
631
618
  ]
632
- if e <= se
633
619
  ]
634
620
  for type_, spans in self._type_to_spans.items()
635
621
  }
@@ -1012,66 +998,81 @@ class WikiText:
1012
998
  ]
1013
999
 
1014
1000
  @property
1015
- def _balanced_quotes_shadow(self):
1016
- """Return bold and italic match objects according MW's algorithm.
1001
+ def _balanced_quotes_shadow(self) -> bytearray:
1002
+ """Return a byte array with non-markup-apostrophes removed.
1017
1003
 
1018
1004
  The comments at /includes/parser/Parser.php:doQuotes are helpful:
1019
1005
  https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
1020
1006
  https://phabricator.wikimedia.org/T15227#178834
1021
1007
  """
1022
- bold_matches = []
1008
+ bold_starts: List[int] = []
1023
1009
  odd_italics = False
1024
1010
  odd_bold_italics = False
1025
- shadow_copy = self._shadow[:]
1026
- append_match = bold_matches.append
1027
- for m in BOLD_ITALIC_FINDITER(shadow_copy):
1028
- if m[4] is not None: # newline or string end
1029
- if (
1030
- odd_italics is True
1031
- and (len(bold_matches) + odd_bold_italics) % 2
1032
- ):
1033
- # one of the bold marks needs to be interpreted as italic
1034
- first_multi_letter_word = first_space = None
1035
- for bold_match in bold_matches:
1036
- bold_start = bold_match.start()
1037
- if shadow_copy[bold_start - 1 : bold_start] == b' ':
1038
- if first_space is None:
1039
- first_space = bold_start
1040
- continue
1041
- if (
1042
- shadow_copy[bold_start - 2 : bold_start - 1]
1043
- == b' '
1044
- ):
1045
- shadow_copy[bold_start] = 95 # _
1046
- break # first_single_letter_word
1047
- if first_multi_letter_word is None:
1048
- first_multi_letter_word = bold_start
1049
- continue
1050
- else: # there was no first_single_letter_word
1051
- if first_multi_letter_word is not None:
1052
- shadow_copy[first_multi_letter_word] = 95 # _
1053
- elif first_space is not None:
1054
- shadow_copy[first_space] = 95 # _
1055
- bold_matches.clear()
1056
- odd_italics = False
1057
- continue
1058
- if m[2] is None: # italic
1011
+ append_bold_start = bold_starts.append
1012
+
1013
+ def process_line(line: bytes) -> bytes:
1014
+ nonlocal odd_italics, odd_bold_italics
1015
+ if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
1016
+ # one of the bold marks needs to be interpreted as italic
1017
+ first_multi_letter_word = first_space = None
1018
+ for s in bold_starts:
1019
+ if line[s - 1] == 32: # space
1020
+ if first_space is None:
1021
+ first_space = s
1022
+ continue
1023
+ if line[s - 2] == 32: # space
1024
+ line = line[:s] + b' ' + line[s + 1 :]
1025
+ break # first_single_letter_word
1026
+ if first_multi_letter_word is None:
1027
+ first_multi_letter_word = s
1028
+ continue
1029
+ else: # there was no first_single_letter_word
1030
+ if first_multi_letter_word is not None:
1031
+ line = (
1032
+ line[:first_multi_letter_word]
1033
+ + b'_'
1034
+ + line[first_multi_letter_word + 1 :]
1035
+ )
1036
+ elif first_space is not None:
1037
+ line = (
1038
+ line[:first_space] + b'_' + line[first_space + 1 :]
1039
+ )
1040
+ # reset state for the next line
1041
+ bold_starts.clear()
1042
+ odd_italics = False
1043
+ odd_bold_italics = False
1044
+ return line
1045
+
1046
+ def process_apostrophes(m) -> bytes:
1047
+ nonlocal odd_italics, odd_bold_italics
1048
+ starts = m.starts(1)
1049
+ n = len(starts)
1050
+ if n == 2: # italic
1059
1051
  odd_italics ^= True
1060
- continue
1061
- if m[3] is None: # bold
1062
- s, e = m.span(1)
1063
- if s != e: # four apostrophes, hide the first one
1064
- shadow_copy[s] = 95 # _
1065
- append_match(m)
1066
- continue
1067
- # bold-italic
1068
- s, e = m.span(1)
1069
- es = e - s
1070
- if es: # more than 5 apostrophes, hide the previous ones
1071
- shadow_copy[s:e] = b'_' * es
1052
+ return m[0]
1053
+ if n == 3: # bold
1054
+ append_bold_start(starts[0])
1055
+ return m[0]
1056
+ if n == 5:
1057
+ odd_bold_italics ^= True
1058
+ odd_italics ^= True
1059
+ return m[0]
1060
+ if n == 4: # four apostrophes -> hide the first one
1061
+ s = starts[1]
1062
+ append_bold_start(s)
1063
+ return b'_' * (s - starts[0]) + m.string[s : m.end()]
1064
+ # more than 5 apostrophes -> hide the prior ones
1072
1065
  odd_bold_italics ^= True
1073
1066
  odd_italics ^= True
1074
- return shadow_copy
1067
+ s = starts[-5]
1068
+ return b'_' * (s - starts[0]) + m.string[s : m.end()]
1069
+
1070
+ return bytearray(b'\n').join(
1071
+ [
1072
+ process_line(substitute_apostrophes(process_apostrophes, line))
1073
+ for line in self._shadow.splitlines()
1074
+ ]
1075
+ )
1075
1076
 
1076
1077
  def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
1077
1078
  for prop in (
@@ -1123,8 +1124,8 @@ class WikiText:
1123
1124
  bold_spans = tts_setdefault('Bold', [])
1124
1125
  get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
1125
1126
  bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
1126
- for match in bold_matches:
1127
- ms, me = match.span()
1127
+ for m in bold_matches:
1128
+ ms, me = m.span()
1128
1129
  b, e = s + ms, s + me
1129
1130
  old_span = get_old_bold_span((b, e))
1130
1131
  if old_span is None:
@@ -1146,16 +1147,16 @@ class WikiText:
1146
1147
  # filter_cls is None or filter_cls is Italic
1147
1148
 
1148
1149
  # remove bold tokens before searching for italics
1149
- for match in bold_matches:
1150
- ms, me = match.span()
1151
- cs, ce = match.span(1) # content
1150
+ for m in bold_matches:
1151
+ ms, me = m.span()
1152
+ cs, ce = m.span(1) # content
1152
1153
  balanced_shadow[ms:cs] = b'_' * (cs - ms)
1153
1154
  balanced_shadow[ce:me] = b'_' * (me - ce)
1154
1155
 
1155
1156
  italic_spans = tts_setdefault('Italic', [])
1156
1157
  get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
1157
- for match in ITALIC_FINDITER(balanced_shadow, rs, re):
1158
- ms, me = match.span()
1158
+ for m in ITALIC_FINDITER(balanced_shadow, rs, re):
1159
+ ms, me = m.span()
1159
1160
  b, e = span = s + ms, s + me
1160
1161
  old_span = get_old_italic_span(span)
1161
1162
  if old_span is None:
@@ -1164,9 +1165,7 @@ class WikiText:
1164
1165
  else:
1165
1166
  span = old_span
1166
1167
  append(
1167
- Italic(
1168
- _lststr, type_to_spans, span, 'Bold', me != match.end(1)
1169
- )
1168
+ Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
1170
1169
  )
1171
1170
  if recursive and filter_cls is Italic:
1172
1171
  self._bolds_italics_recurse(result, filter_cls)