wikitextparser 0.55.10__tar.gz → 0.55.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/CHANGELOG.rst +8 -0
  2. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/PKG-INFO +1 -1
  3. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/pyproject.toml +3 -3
  4. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/__init__.py +1 -1
  5. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_argument.py +0 -1
  6. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_config.py +0 -1
  7. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_parser_function.py +0 -1
  8. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_spans.py +1 -0
  9. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_table.py +3 -3
  10. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_tag.py +0 -1
  11. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_template.py +0 -1
  12. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikilink.py +0 -1
  13. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikilist.py +0 -1
  14. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikitext.py +87 -74
  15. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/.coveragerc +0 -0
  16. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/.github/workflows/tests.yml +0 -0
  17. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/.gitignore +0 -0
  18. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/.readthedocs.yaml +0 -0
  19. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/LICENSE.md +0 -0
  20. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/README.rst +0 -0
  21. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/CHANGELOG.rst +0 -0
  22. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/Makefile +0 -0
  23. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/README.rst +0 -0
  24. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/conf.py +0 -0
  25. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/index.rst +0 -0
  26. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/make.bat +0 -0
  27. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_cell.py +0 -0
  28. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_comment_bold_italic.py +0 -0
  29. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_externallink.py +0 -0
  30. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_parameter.py +0 -0
  31. {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_section.py +0 -0
@@ -1,3 +1,11 @@
1
+ v0.55.12
2
+ --------
3
+ * Performance improvements in extracting bold and italic nodes. (#133)
4
+
5
+ v0.55.11
6
+ --------
7
+ * Performance improvements in ``__setitem__``/``__delitem__`` and ``pformat``/``plain_text`` methods. (#131)
8
+
1
9
  v0.55.10
2
10
  --------
3
11
  * Fixed a bug in ``plain_text`` causing ``IndexError`` when using a custom function to replace ``templates``/``parser_functions``.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wikitextparser
3
- Version: 0.55.10
3
+ Version: 0.55.12
4
4
  Summary: A simple parsing tool for MediaWiki's wikitext markup.
5
5
  Keywords: MediaWiki,wikitext,parser
6
6
  Author-email: 5j9 <5j9@users.noreply.github.com>
@@ -48,12 +48,12 @@ exclude = ['tests/', 'doc/', 'dev/']
48
48
  [tool.ruff]
49
49
  line-length = 79
50
50
  format.quote-style = 'single'
51
- isort.combine-as-imports = true
52
- extend-select = [
51
+ lint.isort.combine-as-imports = true
52
+ lint.extend-select = [
53
53
  'I', # isort
54
54
  'UP', # pyupgrade
55
55
  ]
56
- ignore = [
56
+ lint.ignore = [
57
57
  'UP027', # list comprehensions are faster than generator expressions
58
58
  'E721', # Do not compare types, use `isinstance()`
59
59
  ]
@@ -1,5 +1,5 @@
1
1
  # Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
2
- __version__ = '0.55.10'
2
+ __version__ = '0.55.12'
3
3
 
4
4
  from . import _wikitext
5
5
  from ._argument import Argument # noqa: F401
@@ -13,7 +13,6 @@ ARG_SHADOW_FULLMATCH = rc(
13
13
 
14
14
 
15
15
  class Argument(SubWikiText):
16
-
17
16
  """Create a new Argument Object.
18
17
 
19
18
  Note that in MediaWiki documentation `arguments` are (also) called
@@ -1,6 +1,5 @@
1
1
  """Utilities to override default configurations."""
2
2
 
3
-
4
3
  from collections import defaultdict as _defaultdict
5
4
  from typing import Iterable as _Iterable
6
5
 
@@ -11,7 +11,6 @@ PF_NAME_ARGS_FULLMATCH = rc(
11
11
 
12
12
 
13
13
  class SubWikiTextWithArgs(SubWikiText):
14
-
15
14
  """Define common attributes for `Template` and `ParserFunction`."""
16
15
 
17
16
  __slots__ = ()
@@ -1,4 +1,5 @@
1
1
  """Define the functions required for parsing wikitext into spans."""
2
+
2
3
  from functools import partial
3
4
  from typing import Callable, Dict, Optional
4
5
 
@@ -296,9 +296,9 @@ class Table(SubWikiTextWithAttrs):
296
296
  m = CAPTION_MATCH(shadow)
297
297
  if m:
298
298
  s = m.end('attrs')
299
- self[
300
- s if s != -1 else m.end('preattrs') : m.end('caption')
301
- ] = newcaption
299
+ self[s if s != -1 else m.end('preattrs') : m.end('caption')] = (
300
+ newcaption
301
+ )
302
302
  return
303
303
  # There is no caption. Create one.
304
304
  h, s, t = shadow.partition(b'\n')
@@ -45,7 +45,6 @@ TAG_FULLMATCH = rc(
45
45
 
46
46
 
47
47
  class SubWikiTextWithAttrs(SubWikiText):
48
-
49
48
  """Define a class for SubWikiText objects that have attributes.
50
49
 
51
50
  Any class that is going to inherit from SubWikiTextWithAttrs should provide
@@ -18,7 +18,6 @@ T = TypeVar('T')
18
18
 
19
19
 
20
20
  class Template(SubWikiTextWithArgs):
21
-
22
21
  """Convert strings to Template objects.
23
22
 
24
23
  The string should start with {{ and end with }}.
@@ -1,6 +1,5 @@
1
1
  """Define the WikiLink class."""
2
2
 
3
-
4
3
  from typing import List, Optional, Tuple
5
4
 
6
5
  from regex import DOTALL
@@ -33,7 +33,6 @@ LIST_PATTERN_FORMAT = ( # noqa
33
33
 
34
34
 
35
35
  class WikiList(SubWikiText):
36
-
37
36
  """Class to represent ordered, unordered, and definition lists."""
38
37
 
39
38
  __slots__ = 'pattern', '_match_cache'
@@ -1,5 +1,4 @@
1
1
  from bisect import bisect_left, bisect_right, insort_right
2
- from copy import deepcopy
3
2
  from html import unescape
4
3
  from itertools import compress, islice
5
4
  from operator import attrgetter
@@ -114,10 +113,11 @@ TABLE_FINDITER = rc(
114
113
  DOTALL | MULTILINE | VERBOSE,
115
114
  ).finditer
116
115
 
117
- BOLD_ITALIC_FINDITER = rc( # bold-italic, bold, or italic tokens
118
- rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)|($)""",
116
+ substitute_apostrophes = rc( # bold-italic, bold, or italic tokens
117
+ rb"('\0*+){2,}+(?=[^']|$)",
119
118
  MULTILINE | VERBOSE,
120
- ).finditer
119
+ ).sub
120
+ find_lines = rc(rb'(.*?)$').finditer
121
121
 
122
122
  BOLD_FINDITER = rc(
123
123
  rb"""
@@ -488,6 +488,7 @@ class WikiText:
488
488
  # Note: The following algorithm won't work correctly if spans
489
489
  # are not sorted.
490
490
  # Note: No span should be removed from _type_to_spans.
491
+ rmlength = rmstop - rmstart
491
492
  for spans in self._type_to_spans.values():
492
493
  i = len(spans) - 1
493
494
  while i >= 0:
@@ -495,7 +496,6 @@ class WikiText:
495
496
  s, e, _, b = span = spans[i]
496
497
  if rmstop <= s:
497
498
  # rmstart <= rmstop <= s <= e
498
- rmlength = rmstop - rmstart
499
499
  # todo
500
500
  span[:] = s - rmlength, e - rmlength, None, None
501
501
  i -= 1
@@ -508,7 +508,7 @@ class WikiText:
508
508
  if rmstop < e:
509
509
  # rmstart < s <= rmstop < e
510
510
  # todo: update byte_array instead
511
- span[:] = rmstart, e + rmstart - rmstop, None, None
511
+ span[:] = rmstart, e - rmlength, None, None
512
512
  i -= 1
513
513
  if i < 0:
514
514
  break
@@ -531,7 +531,7 @@ class WikiText:
531
531
  s, e, _, _ = span = spans[i]
532
532
  continue
533
533
  # s <= rmstart <= rmstop <= e
534
- span[1] -= rmstop - rmstart
534
+ span[1] -= rmlength
535
535
  span[2] = None
536
536
  # todo: update bytearray instead
537
537
  span[3] = None
@@ -621,13 +621,12 @@ class WikiText:
621
621
  self.string.
622
622
  """
623
623
  ss, se, _, _ = self._span_data
624
- if ss == 0 and se == len(self._lststr[0]):
625
- return deepcopy(self._type_to_spans)
626
624
  return {
627
625
  type_: [
628
626
  [s - ss, e - ss, m, ba[:] if ba is not None else None]
629
- for s, e, m, ba in spans[bisect_left(spans, [ss]) :]
630
- if e <= se
627
+ for s, e, m, ba in spans[
628
+ bisect_right(spans, [ss]) : bisect_right(spans, [se])
629
+ ]
631
630
  ]
632
631
  for type_, spans in self._type_to_spans.items()
633
632
  }
@@ -1010,66 +1009,82 @@ class WikiText:
1010
1009
  ]
1011
1010
 
1012
1011
  @property
1013
- def _balanced_quotes_shadow(self):
1014
- """Return bold and italic match objects according MW's algorithm.
1012
+ def _balanced_quotes_shadow(self) -> bytearray:
1013
+ """Return a byte array with non-markup-apostrophes removed.
1015
1014
 
1016
1015
  The comments at /includes/parser/Parser.php:doQuotes are helpful:
1017
1016
  https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
1018
1017
  https://phabricator.wikimedia.org/T15227#178834
1019
1018
  """
1020
- bold_matches = []
1019
+ bold_starts: List[int] = []
1021
1020
  odd_italics = False
1022
1021
  odd_bold_italics = False
1023
- shadow_copy = self._shadow[:]
1024
- append_match = bold_matches.append
1025
- for match in BOLD_ITALIC_FINDITER(shadow_copy):
1026
- if match[4] is not None: # newline or string end
1027
- if (
1028
- odd_italics is True
1029
- and (len(bold_matches) + odd_bold_italics) % 2
1030
- ):
1031
- # one of the bold marks needs to be interpreted as italic
1032
- first_multi_letter_word = first_space = None
1033
- for bold_match in bold_matches:
1034
- bold_start = bold_match.start()
1035
- if shadow_copy[bold_start - 1 : bold_start] == b' ':
1036
- if first_space is None:
1037
- first_space = bold_start
1038
- continue
1039
- if (
1040
- shadow_copy[bold_start - 2 : bold_start - 1]
1041
- == b' '
1042
- ):
1043
- shadow_copy[bold_start] = 95 # _
1044
- break # first_single_letter_word
1045
- if first_multi_letter_word is None:
1046
- first_multi_letter_word = bold_start
1047
- continue
1048
- else: # there was no first_single_letter_word
1049
- if first_multi_letter_word is not None:
1050
- shadow_copy[first_multi_letter_word] = 95 # _
1051
- elif first_space is not None:
1052
- shadow_copy[first_space] = 95 # _
1053
- bold_matches.clear()
1054
- odd_italics = False
1055
- continue
1056
- if match[2] is None: # italic
1022
+ append_bold_start = bold_starts.append
1023
+
1024
+ def process_line(line: bytes) -> bytes:
1025
+ nonlocal odd_italics, odd_bold_italics
1026
+ if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
1027
+ # one of the bold marks needs to be interpreted as italic
1028
+ first_multi_letter_word = first_space = None
1029
+ for s in bold_starts:
1030
+ if line[s - 1] == 32: # space
1031
+ if first_space is None:
1032
+ first_space = s
1033
+ continue
1034
+ if line[s - 2] == 32: # space
1035
+ line = line[:s] + b' ' + line[s + 1 :]
1036
+ break # first_single_letter_word
1037
+ if first_multi_letter_word is None:
1038
+ first_multi_letter_word = s
1039
+ continue
1040
+ else: # there was no first_single_letter_word
1041
+ if first_multi_letter_word is not None:
1042
+ line = (
1043
+ line[:first_multi_letter_word]
1044
+ + b'_'
1045
+ + line[first_multi_letter_word + 1 :]
1046
+ )
1047
+ elif first_space is not None:
1048
+ line = (
1049
+ line[:first_space] + b'_' + line[first_space + 1 :]
1050
+ )
1051
+ # reset state for the next line
1052
+ bold_starts.clear()
1053
+ odd_italics = False
1054
+ odd_bold_italics = False
1055
+ return line
1056
+
1057
+ def process_apostrophes(m) -> bytes:
1058
+ nonlocal odd_italics, odd_bold_italics
1059
+ starts = m.starts(1)
1060
+ n = len(starts)
1061
+ if n == 2: # italic
1057
1062
  odd_italics ^= True
1058
- continue
1059
- if match[3] is None: # bold
1060
- s, e = match.span(1)
1061
- if s != e: # four apostrophes, hide the first one
1062
- shadow_copy[s] = 95 # _
1063
- append_match(match)
1064
- continue
1065
- # bold-italic
1066
- s, e = match.span(1)
1067
- es = e - s
1068
- if es: # more than 5 apostrophes, hide the previous ones
1069
- shadow_copy[s:e] = b'_' * es
1070
- odd_bold_italics ^= True
1071
- odd_italics ^= True
1072
- return shadow_copy
1063
+ return m[0]
1064
+ if n == 3: # bold
1065
+ append_bold_start(starts[0])
1066
+ return m[0]
1067
+ if n == 5:
1068
+ odd_bold_italics ^= True
1069
+ odd_italics ^= True
1070
+ return m[0]
1071
+ if n == 4: # four apostrophes -> hide the first one
1072
+ s = starts[1]
1073
+ append_bold_start(s)
1074
+ return b'_' * (s - starts[0]) + m.string[s : m.end()]
1075
+ if n > 5: # more than 5 apostrophes -> hide the prior ones
1076
+ odd_bold_italics ^= True
1077
+ odd_italics ^= True
1078
+ s = starts[-5]
1079
+ return b'_' * (s - starts[0]) + m.string[s : m.end()]
1080
+ raise # execution should never reach here
1081
+
1082
+ return bytearray(b'\n').join(
1083
+ [
1084
+ process_line(substitute_apostrophes(process_apostrophes, line))
1085
+ for line in self._shadow.splitlines()
1086
+ ]
1087
+ )
1073
1088
 
1074
1089
  def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
1075
1090
  for prop in (
@@ -1121,8 +1136,8 @@ class WikiText:
1121
1136
  bold_spans = tts_setdefault('Bold', [])
1122
1137
  get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
1123
1138
  bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
1124
- for match in bold_matches:
1125
- ms, me = match.span()
1139
+ for m in bold_matches:
1140
+ ms, me = m.span()
1126
1141
  b, e = s + ms, s + me
1127
1142
  old_span = get_old_bold_span((b, e))
1128
1143
  if old_span is None:
@@ -1144,16 +1159,16 @@ class WikiText:
1144
1159
  # filter_cls is None or filter_cls is Italic
1145
1160
 
1146
1161
  # remove bold tokens before searching for italics
1147
- for match in bold_matches:
1148
- ms, me = match.span()
1149
- cs, ce = match.span(1) # content
1162
+ for m in bold_matches:
1163
+ ms, me = m.span()
1164
+ cs, ce = m.span(1) # content
1150
1165
  balanced_shadow[ms:cs] = b'_' * (cs - ms)
1151
1166
  balanced_shadow[ce:me] = b'_' * (me - ce)
1152
1167
 
1153
1168
  italic_spans = tts_setdefault('Italic', [])
1154
1169
  get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
1155
- for match in ITALIC_FINDITER(balanced_shadow, rs, re):
1156
- ms, me = match.span()
1170
+ for m in ITALIC_FINDITER(balanced_shadow, rs, re):
1171
+ ms, me = m.span()
1157
1172
  b, e = span = s + ms, s + me
1158
1173
  old_span = get_old_italic_span(span)
1159
1174
  if old_span is None:
@@ -1162,9 +1177,7 @@ class WikiText:
1162
1177
  else:
1163
1178
  span = old_span
1164
1179
  append(
1165
- Italic(
1166
- _lststr, type_to_spans, span, 'Bold', me != match.end(1)
1167
- )
1180
+ Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
1168
1181
  )
1169
1182
  if recursive and filter_cls is Italic:
1170
1183
  self._bolds_italics_recurse(result, filter_cls)
@@ -1338,7 +1351,7 @@ class WikiText:
1338
1351
 
1339
1352
  if level is not None:
1340
1353
  section_spans = compress(
1341
- section_spans, [l == level for l in levels]
1354
+ section_spans, [lvl == level for lvl in levels]
1342
1355
  )
1343
1356
 
1344
1357
  return self._section_spans_to_sections(section_spans, shadow)