PyPI - wikitextparser - Versions diffs - 0.55.11__tar.gz → 0.55.13__tar.gz - Mend

wikitextparser-0.55.11.tar.gz → wikitextparser-0.55.13.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

{wikitextparser-0.55.11 → wikitextparser-0.55.13}/CHANGELOG.rst RENAMED Viewed

@@ -1,3 +1,11 @@
+v0.55.13
+--------
+*  Fixed a bug in ``Section.level`` resulting in malformed section titles when multiple levels are added (#135)
+v0.55.12
+--------
+* Performance improvements in extracting bold and italic nodes. (#133)
 v0.55.11
 --------
 * Performance improvements in ``__setitem__``/``__delitem__`` and ``pformat``/``plain_text`` methods. (#131)

{wikitextparser-0.55.11 → wikitextparser-0.55.13}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wikitextparser
-Version: 0.55.11
+Version: 0.55.13
 Summary: A simple parsing tool for MediaWiki's wikitext markup.
 Keywords: MediaWiki,wikitext,parser
 Author-email: 5j9 <5j9@users.noreply.github.com>

{wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 # Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
-__version__ = '0.55.11'
+__version__ = '0.55.13'
 from . import _wikitext
 from ._argument import Argument  # noqa: F401

{wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_parser_function.py RENAMED Viewed

@@ -60,10 +60,7 @@ class SubWikiTextWithArgs(SubWikiText):
             else:
                 arg_span = old_span
             arg = Argument(lststr, type_to_spans, arg_span, type_, self)
-            arg._shadow_cache = (
-                lststr[0][s:e],
-                shadow[arg_self_start:arg_self_end],
-            )
+            arg._span_data[3] = shadow[arg_self_start:arg_self_end]
             arguments_append(arg)
         return arguments

{wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_section.py RENAMED Viewed

@@ -42,9 +42,9 @@ class Section(SubWikiText):
         if level_diff == 0:
             return
         if level_diff < 0:
-            new_equals = '=' * abs(level_diff)
+            new_equals = '=' * -level_diff
             self.insert(0, new_equals)
-            self.insert(m.end(2) + 1, new_equals)
+            self.insert(m.end(2) - level_diff, new_equals)
             return
         del self[:level_diff]
         del self[m.end(2) : m.end(2) + level_diff]

{wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_wikitext.py RENAMED Viewed

@@ -1,5 +1,4 @@
 from bisect import bisect_left, bisect_right, insort_right
-from copy import deepcopy
 from html import unescape
 from itertools import compress, islice
 from operator import attrgetter
@@ -41,7 +40,6 @@ from ._spans import (
     END_TAG_PATTERN,
     EXTERNAL_LINK_URL_TAIL,
     INVALID_URL_CHARS,
-    PARSABLE_TAG_EXTENSION_NAME,
     START_TAG_PATTERN,
     parse_to_spans,
     rc,
@@ -53,10 +51,6 @@ NAME_CAPTURING_HTML_START_TAG_FINDITER = rc(
     )
 ).finditer
-PARSABLE_TAG_EXTENSIONS_MATCH = rc(
-    rb'<' + PARSABLE_TAG_EXTENSION_NAME + rb'\b', IGNORECASE
-).match
 # External links
 BRACKET_EXTERNAL_LINK_SCHEMES = regex_pattern(
     _bare_external_link_schemes | {'//'}
@@ -114,10 +108,7 @@ TABLE_FINDITER = rc(
     DOTALL | MULTILINE | VERBOSE,
 ).finditer
-BOLD_ITALIC_FINDITER = rc(  # bold-italic, bold, or italic tokens
-    rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)|($)""",
-    MULTILINE | VERBOSE,
-).finditer
+substitute_apostrophes = rc(rb"('\0*+){2,}+(?=[^']|$)", MULTILINE).sub
 BOLD_FINDITER = rc(
     rb"""
@@ -213,7 +204,7 @@ class WikiText:
     # The following class attribute acts as a default value.
     _type = 'WikiText'
-    __slots__ = '_type_to_spans', '_lststr', '_span_data', '_shadow_cache'
+    __slots__ = '_type_to_spans', '_lststr', '_span_data'
     def __init__(
         self,
@@ -241,7 +232,6 @@ class WikiText:
         if _type not in SPAN_PARSER_TYPES:
             type_to_spans = self._type_to_spans = parse_to_spans(byte_array)
             type_to_spans[_type] = [span]
-            self._shadow_cache = string, byte_array
         else:
             # In SPAN_PARSER_TYPES, we can't pass the original byte_array to
             # parser to generate the shadow because it will replace the whole
@@ -259,7 +249,6 @@ class WikiText:
                 byte_array[0] = 3
                 byte_array[-1] = 32
             type_to_spans = parse_to_spans(byte_array)
-            self._shadow_cache = string, byte_array
             type_to_spans[_type].insert(0, span)
             self._type_to_spans = type_to_spans
             if type(self) is Parameter:
@@ -621,15 +610,12 @@ class WikiText:
         self.string.
         """
         ss, se, _, _ = self._span_data
-        if ss == 0 and se == len(self._lststr[0]):
-            return deepcopy(self._type_to_spans)
         return {
             type_: [
                 [s - ss, e - ss, m, ba[:] if ba is not None else None]
                 for s, e, m, ba in spans[
-                    bisect_left(spans, [ss]) : bisect_left(spans, [se])
+                    bisect_right(spans, [ss]) : bisect_right(spans, [se])
                 ]
-                if e <= se
             ]
             for type_, spans in self._type_to_spans.items()
         }
@@ -1012,66 +998,81 @@ class WikiText:
         ]
     @property
-    def _balanced_quotes_shadow(self):
-        """Return bold and italic match objects according MW's algorithm.
+    def _balanced_quotes_shadow(self) -> bytearray:
+        """Return a byte array with non-markup-apostrophes removed.
         The comments at /includes/parser/Parser.php:doQuotes are helpful:
         https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
         https://phabricator.wikimedia.org/T15227#178834
         """
-        bold_matches = []
+        bold_starts: List[int] = []
         odd_italics = False
         odd_bold_italics = False
-        shadow_copy = self._shadow[:]
-        append_match = bold_matches.append
-        for m in BOLD_ITALIC_FINDITER(shadow_copy):
-            if m[4] is not None:  # newline or string end
-                if (
-                    odd_italics is True
-                    and (len(bold_matches) + odd_bold_italics) % 2
-                ):
-                    # one of the bold marks needs to be interpreted as italic
-                    first_multi_letter_word = first_space = None
-                    for bold_match in bold_matches:
-                        bold_start = bold_match.start()
-                        if shadow_copy[bold_start - 1 : bold_start] == b' ':
-                            if first_space is None:
-                                first_space = bold_start
-                            continue
-                        if (
-                            shadow_copy[bold_start - 2 : bold_start - 1]
-                            == b' '
-                        ):
-                            shadow_copy[bold_start] = 95  # _
-                            break  # first_single_letter_word
-                        if first_multi_letter_word is None:
-                            first_multi_letter_word = bold_start
-                            continue
-                    else:  # there was no first_single_letter_word
-                        if first_multi_letter_word is not None:
-                            shadow_copy[first_multi_letter_word] = 95  # _
-                        elif first_space is not None:
-                            shadow_copy[first_space] = 95  # _
-                bold_matches.clear()
-                odd_italics = False
-                continue
-            if m[2] is None:  # italic
+        append_bold_start = bold_starts.append
+        def process_line(line: bytes) -> bytes:
+            nonlocal odd_italics, odd_bold_italics
+            if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
+                # one of the bold marks needs to be interpreted as italic
+                first_multi_letter_word = first_space = None
+                for s in bold_starts:
+                    if line[s - 1] == 32:  # space
+                        if first_space is None:
+                            first_space = s
+                        continue
+                    if line[s - 2] == 32:  # space
+                        line = line[:s] + b' ' + line[s + 1 :]
+                        break  # first_single_letter_word
+                    if first_multi_letter_word is None:
+                        first_multi_letter_word = s
+                        continue
+                else:  # there was no first_single_letter_word
+                    if first_multi_letter_word is not None:
+                        line = (
+                            line[:first_multi_letter_word]
+                            + b'_'
+                            + line[first_multi_letter_word + 1 :]
+                        )
+                    elif first_space is not None:
+                        line = (
+                            line[:first_space] + b'_' + line[first_space + 1 :]
+                        )
+            # reset state for the next line
+            bold_starts.clear()
+            odd_italics = False
+            odd_bold_italics = False
+            return line
+        def process_apostrophes(m) -> bytes:
+            nonlocal odd_italics, odd_bold_italics
+            starts = m.starts(1)
+            n = len(starts)
+            if n == 2:  # italic
                 odd_italics ^= True
-                continue
-            if m[3] is None:  # bold
-                s, e = m.span(1)
-                if s != e:  # four apostrophes, hide the first one
-                    shadow_copy[s] = 95  # _
-                append_match(m)
-                continue
-            # bold-italic
-            s, e = m.span(1)
-            es = e - s
-            if es:  # more than 5 apostrophes, hide the previous ones
-                shadow_copy[s:e] = b'_' * es
+                return m[0]
+            if n == 3:  # bold
+                append_bold_start(starts[0])
+                return m[0]
+            if n == 5:
+                odd_bold_italics ^= True
+                odd_italics ^= True
+                return m[0]
+            if n == 4:  # four apostrophes -> hide the first one
+                s = starts[1]
+                append_bold_start(s)
+                return b'_' * (s - starts[0]) + m.string[s : m.end()]
+            # more than 5 apostrophes -> hide the prior ones
             odd_bold_italics ^= True
             odd_italics ^= True
-        return shadow_copy
+            s = starts[-5]
+            return b'_' * (s - starts[0]) + m.string[s : m.end()]
+        return bytearray(b'\n').join(
+            [
+                process_line(substitute_apostrophes(process_apostrophes, line))
+                for line in self._shadow.splitlines()
+            ]
+        )
     def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
         for prop in (
@@ -1123,8 +1124,8 @@ class WikiText:
             bold_spans = tts_setdefault('Bold', [])
             get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
             bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
-            for match in bold_matches:
-                ms, me = match.span()
+            for m in bold_matches:
+                ms, me = m.span()
                 b, e = s + ms, s + me
                 old_span = get_old_bold_span((b, e))
                 if old_span is None:
@@ -1146,16 +1147,16 @@ class WikiText:
         # filter_cls is None or filter_cls is Italic
         # remove bold tokens before searching for italics
-        for match in bold_matches:
-            ms, me = match.span()
-            cs, ce = match.span(1)  # content
+        for m in bold_matches:
+            ms, me = m.span()
+            cs, ce = m.span(1)  # content
             balanced_shadow[ms:cs] = b'_' * (cs - ms)
             balanced_shadow[ce:me] = b'_' * (me - ce)
         italic_spans = tts_setdefault('Italic', [])
         get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
-        for match in ITALIC_FINDITER(balanced_shadow, rs, re):
-            ms, me = match.span()
+        for m in ITALIC_FINDITER(balanced_shadow, rs, re):
+            ms, me = m.span()
             b, e = span = s + ms, s + me
             old_span = get_old_italic_span(span)
             if old_span is None:
@@ -1164,9 +1165,7 @@ class WikiText:
             else:
                 span = old_span
             append(
-                Italic(
-                    _lststr, type_to_spans, span, 'Bold', me != match.end(1)
-                )
+                Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
             )
         if recursive and filter_cls is Italic:
             self._bolds_italics_recurse(result, filter_cls)