PyPI - wikitextparser - Versions diffs - 0.55.11__tar.gz → 0.55.12__tar.gz - Mend

wikitextparser 0.55.11.tar.gz → 0.55.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

{wikitextparser-0.55.11 → wikitextparser-0.55.12}/CHANGELOG.rst RENAMED Viewed

@@ -1,3 +1,7 @@
+v0.55.12
+--------
+* Performance improvements in extracting bold and italic nodes. (#133)
 v0.55.11
 --------
 * Performance improvements in ``__setitem__``/``__delitem__`` and ``pformat``/``plain_text`` methods. (#131)

{wikitextparser-0.55.11 → wikitextparser-0.55.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wikitextparser
-Version: 0.55.11
+Version: 0.55.12
 Summary: A simple parsing tool for MediaWiki's wikitext markup.
 Keywords: MediaWiki,wikitext,parser
 Author-email: 5j9 <5j9@users.noreply.github.com>

{wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 # Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
-__version__ = '0.55.11'
+__version__ = '0.55.12'
 from . import _wikitext
 from ._argument import Argument  # noqa: F401

{wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_wikitext.py RENAMED Viewed

@@ -1,5 +1,4 @@
 from bisect import bisect_left, bisect_right, insort_right
-from copy import deepcopy
 from html import unescape
 from itertools import compress, islice
 from operator import attrgetter
@@ -114,10 +113,11 @@ TABLE_FINDITER = rc(
     DOTALL | MULTILINE | VERBOSE,
 ).finditer
-BOLD_ITALIC_FINDITER = rc(  # bold-italic, bold, or italic tokens
-    rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)|($)""",
+substitute_apostrophes = rc(  # bold-italic, bold, or italic tokens
+    rb"('\0*+){2,}+(?=[^']|$)",
     MULTILINE | VERBOSE,
-).finditer
+).sub
+find_lines = rc(rb'(.*?)$').finditer
 BOLD_FINDITER = rc(
     rb"""
@@ -621,15 +621,12 @@ class WikiText:
         self.string.
         """
         ss, se, _, _ = self._span_data
-        if ss == 0 and se == len(self._lststr[0]):
-            return deepcopy(self._type_to_spans)
         return {
             type_: [
                 [s - ss, e - ss, m, ba[:] if ba is not None else None]
                 for s, e, m, ba in spans[
-                    bisect_left(spans, [ss]) : bisect_left(spans, [se])
+                    bisect_right(spans, [ss]) : bisect_right(spans, [se])
                 ]
-                if e <= se
             ]
             for type_, spans in self._type_to_spans.items()
         }
@@ -1012,66 +1009,82 @@ class WikiText:
         ]
     @property
-    def _balanced_quotes_shadow(self):
-        """Return bold and italic match objects according MW's algorithm.
+    def _balanced_quotes_shadow(self) -> bytearray:
+        """Return a byte array with non-markup-apostrophes removed.
         The comments at /includes/parser/Parser.php:doQuotes are helpful:
         https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
         https://phabricator.wikimedia.org/T15227#178834
         """
-        bold_matches = []
+        bold_starts: List[int] = []
         odd_italics = False
         odd_bold_italics = False
-        shadow_copy = self._shadow[:]
-        append_match = bold_matches.append
-        for m in BOLD_ITALIC_FINDITER(shadow_copy):
-            if m[4] is not None:  # newline or string end
-                if (
-                    odd_italics is True
-                    and (len(bold_matches) + odd_bold_italics) % 2
-                ):
-                    # one of the bold marks needs to be interpreted as italic
-                    first_multi_letter_word = first_space = None
-                    for bold_match in bold_matches:
-                        bold_start = bold_match.start()
-                        if shadow_copy[bold_start - 1 : bold_start] == b' ':
-                            if first_space is None:
-                                first_space = bold_start
-                            continue
-                        if (
-                            shadow_copy[bold_start - 2 : bold_start - 1]
-                            == b' '
-                        ):
-                            shadow_copy[bold_start] = 95  # _
-                            break  # first_single_letter_word
-                        if first_multi_letter_word is None:
-                            first_multi_letter_word = bold_start
-                            continue
-                    else:  # there was no first_single_letter_word
-                        if first_multi_letter_word is not None:
-                            shadow_copy[first_multi_letter_word] = 95  # _
-                        elif first_space is not None:
-                            shadow_copy[first_space] = 95  # _
-                bold_matches.clear()
-                odd_italics = False
-                continue
-            if m[2] is None:  # italic
+        append_bold_start = bold_starts.append
+        def process_line(line: bytes) -> bytes:
+            nonlocal odd_italics, odd_bold_italics
+            if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
+                # one of the bold marks needs to be interpreted as italic
+                first_multi_letter_word = first_space = None
+                for s in bold_starts:
+                    if line[s - 1] == 32:  # space
+                        if first_space is None:
+                            first_space = s
+                        continue
+                    if line[s - 2] == 32:  # space
+                        line = line[:s] + b' ' + line[s + 1 :]
+                        break  # first_single_letter_word
+                    if first_multi_letter_word is None:
+                        first_multi_letter_word = s
+                        continue
+                else:  # there was no first_single_letter_word
+                    if first_multi_letter_word is not None:
+                        line = (
+                            line[:first_multi_letter_word]
+                            + b'_'
+                            + line[first_multi_letter_word + 1 :]
+                        )
+                    elif first_space is not None:
+                        line = (
+                            line[:first_space] + b'_' + line[first_space + 1 :]
+                        )
+            # reset state for the next line
+            bold_starts.clear()
+            odd_italics = False
+            odd_bold_italics = False
+            return line
+        def process_apostrophes(m) -> bytes:
+            nonlocal odd_italics, odd_bold_italics
+            starts = m.starts(1)
+            n = len(starts)
+            if n == 2:  # italic
                 odd_italics ^= True
-                continue
-            if m[3] is None:  # bold
-                s, e = m.span(1)
-                if s != e:  # four apostrophes, hide the first one
-                    shadow_copy[s] = 95  # _
-                append_match(m)
-                continue
-            # bold-italic
-            s, e = m.span(1)
-            es = e - s
-            if es:  # more than 5 apostrophes, hide the previous ones
-                shadow_copy[s:e] = b'_' * es
-            odd_bold_italics ^= True
-            odd_italics ^= True
-        return shadow_copy
+                return m[0]
+            if n == 3:  # bold
+                append_bold_start(starts[0])
+                return m[0]
+            if n == 5:
+                odd_bold_italics ^= True
+                odd_italics ^= True
+                return m[0]
+            if n == 4:  # four apostrophes -> hide the first one
+                s = starts[1]
+                append_bold_start(s)
+                return b'_' * (s - starts[0]) + m.string[s : m.end()]
+            if n > 5:  # more than 5 apostrophes -> hide the prior ones
+                odd_bold_italics ^= True
+                odd_italics ^= True
+                s = starts[-5]
+                return b'_' * (s - starts[0]) + m.string[s : m.end()]
+            raise  # execution should never reach here
+        return bytearray(b'\n').join(
+            [
+                process_line(substitute_apostrophes(process_apostrophes, line))
+                for line in self._shadow.splitlines()
+            ]
+        )
     def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
         for prop in (
@@ -1123,8 +1136,8 @@ class WikiText:
             bold_spans = tts_setdefault('Bold', [])
             get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
             bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
-            for match in bold_matches:
-                ms, me = match.span()
+            for m in bold_matches:
+                ms, me = m.span()
                 b, e = s + ms, s + me
                 old_span = get_old_bold_span((b, e))
                 if old_span is None:
@@ -1146,16 +1159,16 @@ class WikiText:
         # filter_cls is None or filter_cls is Italic
         # remove bold tokens before searching for italics
-        for match in bold_matches:
-            ms, me = match.span()
-            cs, ce = match.span(1)  # content
+        for m in bold_matches:
+            ms, me = m.span()
+            cs, ce = m.span(1)  # content
             balanced_shadow[ms:cs] = b'_' * (cs - ms)
             balanced_shadow[ce:me] = b'_' * (me - ce)
         italic_spans = tts_setdefault('Italic', [])
         get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
-        for match in ITALIC_FINDITER(balanced_shadow, rs, re):
-            ms, me = match.span()
+        for m in ITALIC_FINDITER(balanced_shadow, rs, re):
+            ms, me = m.span()
             b, e = span = s + ms, s + me
             old_span = get_old_italic_span(span)
             if old_span is None:
@@ -1164,9 +1177,7 @@ class WikiText:
             else:
                 span = old_span
             append(
-                Italic(
-                    _lststr, type_to_spans, span, 'Bold', me != match.end(1)
-                )
+                Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
             )
         if recursive and filter_cls is Italic:
             self._bolds_italics_recurse(result, filter_cls)