PyPI - wikitextparser - Versions diffs - 0.55.10__tar.gz → 0.55.12__tar.gz - Mend

wikitextparser 0.55.10.tar.gz → 0.55.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/CHANGELOG.rst RENAMED Viewed

@@ -1,3 +1,11 @@
+v0.55.12
+--------
+* Performance improvements in extracting bold and italic nodes. (#133)
+v0.55.11
+--------
+* Performance improvements in ``__setitem__``/``__delitem__`` and ``pformat``/``plain_text`` methods. (#131)
 v0.55.10
 --------
 * Fixed a bug in ``plain_text`` causing ``IndexError`` when using a custom function to replace ``templates``/``parser_functions``.

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wikitextparser
-Version: 0.55.10
+Version: 0.55.12
 Summary: A simple parsing tool for MediaWiki's wikitext markup.
 Keywords: MediaWiki,wikitext,parser
 Author-email: 5j9 <5j9@users.noreply.github.com>

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/pyproject.toml RENAMED Viewed

@@ -48,12 +48,12 @@ exclude = ['tests/', 'doc/', 'dev/']
 [tool.ruff]
 line-length = 79
 format.quote-style = 'single'
-isort.combine-as-imports = true
-extend-select = [
+lint.isort.combine-as-imports = true
+lint.extend-select = [
     'I',  # isort
     'UP',  # pyupgrade
 ]
-ignore = [
+lint.ignore = [
     'UP027',  # list comprehensions are faster than generator expressions
     'E721',  # Do not compare types, use `isinstance()`
 ]

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/__init__.py RENAMED Viewed

@@ -1,5 +1,5 @@
 # Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
-__version__ = '0.55.10'
+__version__ = '0.55.12'
 from . import _wikitext
 from ._argument import Argument  # noqa: F401

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_argument.py RENAMED Viewed

@@ -13,7 +13,6 @@ ARG_SHADOW_FULLMATCH = rc(
 class Argument(SubWikiText):
     """Create a new Argument Object.
     Note that in MediaWiki documentation `arguments` are (also) called

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_config.py RENAMED Viewed

@@ -1,6 +1,5 @@
 """Utilities to override default configurations."""
 from collections import defaultdict as _defaultdict
 from typing import Iterable as _Iterable

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_parser_function.py RENAMED Viewed

@@ -11,7 +11,6 @@ PF_NAME_ARGS_FULLMATCH = rc(
 class SubWikiTextWithArgs(SubWikiText):
     """Define common attributes for `Template` and `ParserFunction`."""
     __slots__ = ()

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_spans.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """Define the functions required for parsing wikitext into spans."""
 from functools import partial
 from typing import Callable, Dict, Optional

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_table.py RENAMED Viewed

@@ -296,9 +296,9 @@ class Table(SubWikiTextWithAttrs):
         m = CAPTION_MATCH(shadow)
         if m:
             s = m.end('attrs')
-            self[
-                s if s != -1 else m.end('preattrs') : m.end('caption')
-            ] = newcaption
+            self[s if s != -1 else m.end('preattrs') : m.end('caption')] = (
+                newcaption
+            )
             return
         # There is no caption. Create one.
         h, s, t = shadow.partition(b'\n')

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_tag.py RENAMED Viewed

@@ -45,7 +45,6 @@ TAG_FULLMATCH = rc(
 class SubWikiTextWithAttrs(SubWikiText):
     """Define a class for SubWikiText objects that have attributes.
     Any class that is going to inherit from SubWikiTextWithAttrs should provide

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_template.py RENAMED Viewed

@@ -18,7 +18,6 @@ T = TypeVar('T')
 class Template(SubWikiTextWithArgs):
     """Convert strings to Template objects.
     The string should start with {{ and end with }}.

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikilink.py RENAMED Viewed

@@ -1,6 +1,5 @@
 """Define the WikiLink class."""
 from typing import List, Optional, Tuple
 from regex import DOTALL

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikilist.py RENAMED Viewed

@@ -33,7 +33,6 @@ LIST_PATTERN_FORMAT = (  # noqa
 class WikiList(SubWikiText):
     """Class to represent ordered, unordered, and definition lists."""
     __slots__ = 'pattern', '_match_cache'

{wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikitext.py RENAMED Viewed

@@ -1,5 +1,4 @@
 from bisect import bisect_left, bisect_right, insort_right
-from copy import deepcopy
 from html import unescape
 from itertools import compress, islice
 from operator import attrgetter
@@ -114,10 +113,11 @@ TABLE_FINDITER = rc(
     DOTALL | MULTILINE | VERBOSE,
 ).finditer
-BOLD_ITALIC_FINDITER = rc(  # bold-italic, bold, or italic tokens
-    rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)|($)""",
+substitute_apostrophes = rc(  # bold-italic, bold, or italic tokens
+    rb"('\0*+){2,}+(?=[^']|$)",
     MULTILINE | VERBOSE,
-).finditer
+).sub
+find_lines = rc(rb'(.*?)$').finditer
 BOLD_FINDITER = rc(
     rb"""
@@ -488,6 +488,7 @@ class WikiText:
         # Note: The following algorithm won't work correctly if spans
         # are not sorted.
         # Note: No span should be removed from _type_to_spans.
+        rmlength = rmstop - rmstart
         for spans in self._type_to_spans.values():
             i = len(spans) - 1
             while i >= 0:
@@ -495,7 +496,6 @@ class WikiText:
                 s, e, _, b = span = spans[i]
                 if rmstop <= s:
                     # rmstart <= rmstop <= s <= e
-                    rmlength = rmstop - rmstart
                     # todo
                     span[:] = s - rmlength, e - rmlength, None, None
                     i -= 1
@@ -508,7 +508,7 @@ class WikiText:
                     if rmstop < e:
                         # rmstart < s <= rmstop < e
                         # todo: update byte_array instead
-                        span[:] = rmstart, e + rmstart - rmstop, None, None
+                        span[:] = rmstart, e - rmlength, None, None
                         i -= 1
                         if i < 0:
                             break
@@ -531,7 +531,7 @@ class WikiText:
                     s, e, _, _ = span = spans[i]
                     continue
                 # s <= rmstart <= rmstop <= e
-                span[1] -= rmstop - rmstart
+                span[1] -= rmlength
                 span[2] = None
                 # todo: update bytearray instead
                 span[3] = None
@@ -621,13 +621,12 @@ class WikiText:
         self.string.
         """
         ss, se, _, _ = self._span_data
-        if ss == 0 and se == len(self._lststr[0]):
-            return deepcopy(self._type_to_spans)
         return {
             type_: [
                 [s - ss, e - ss, m, ba[:] if ba is not None else None]
-                for s, e, m, ba in spans[bisect_left(spans, [ss]) :]
-                if e <= se
+                for s, e, m, ba in spans[
+                    bisect_right(spans, [ss]) : bisect_right(spans, [se])
+                ]
             ]
             for type_, spans in self._type_to_spans.items()
         }
@@ -1010,66 +1009,82 @@ class WikiText:
         ]
     @property
-    def _balanced_quotes_shadow(self):
-        """Return bold and italic match objects according MW's algorithm.
+    def _balanced_quotes_shadow(self) -> bytearray:
+        """Return a byte array with non-markup-apostrophes removed.
         The comments at /includes/parser/Parser.php:doQuotes are helpful:
         https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
         https://phabricator.wikimedia.org/T15227#178834
         """
-        bold_matches = []
+        bold_starts: List[int] = []
         odd_italics = False
         odd_bold_italics = False
-        shadow_copy = self._shadow[:]
-        append_match = bold_matches.append
-        for match in BOLD_ITALIC_FINDITER(shadow_copy):
-            if match[4] is not None:  # newline or string end
-                if (
-                    odd_italics is True
-                    and (len(bold_matches) + odd_bold_italics) % 2
-                ):
-                    # one of the bold marks needs to be interpreted as italic
-                    first_multi_letter_word = first_space = None
-                    for bold_match in bold_matches:
-                        bold_start = bold_match.start()
-                        if shadow_copy[bold_start - 1 : bold_start] == b' ':
-                            if first_space is None:
-                                first_space = bold_start
-                            continue
-                        if (
-                            shadow_copy[bold_start - 2 : bold_start - 1]
-                            == b' '
-                        ):
-                            shadow_copy[bold_start] = 95  # _
-                            break  # first_single_letter_word
-                        if first_multi_letter_word is None:
-                            first_multi_letter_word = bold_start
-                            continue
-                    else:  # there was no first_single_letter_word
-                        if first_multi_letter_word is not None:
-                            shadow_copy[first_multi_letter_word] = 95  # _
-                        elif first_space is not None:
-                            shadow_copy[first_space] = 95  # _
-                bold_matches.clear()
-                odd_italics = False
-                continue
-            if match[2] is None:  # italic
+        append_bold_start = bold_starts.append
+        def process_line(line: bytes) -> bytes:
+            nonlocal odd_italics, odd_bold_italics
+            if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
+                # one of the bold marks needs to be interpreted as italic
+                first_multi_letter_word = first_space = None
+                for s in bold_starts:
+                    if line[s - 1] == 32:  # space
+                        if first_space is None:
+                            first_space = s
+                        continue
+                    if line[s - 2] == 32:  # space
+                        line = line[:s] + b' ' + line[s + 1 :]
+                        break  # first_single_letter_word
+                    if first_multi_letter_word is None:
+                        first_multi_letter_word = s
+                        continue
+                else:  # there was no first_single_letter_word
+                    if first_multi_letter_word is not None:
+                        line = (
+                            line[:first_multi_letter_word]
+                            + b'_'
+                            + line[first_multi_letter_word + 1 :]
+                        )
+                    elif first_space is not None:
+                        line = (
+                            line[:first_space] + b'_' + line[first_space + 1 :]
+                        )
+            # reset state for the next line
+            bold_starts.clear()
+            odd_italics = False
+            odd_bold_italics = False
+            return line
+        def process_apostrophes(m) -> bytes:
+            nonlocal odd_italics, odd_bold_italics
+            starts = m.starts(1)
+            n = len(starts)
+            if n == 2:  # italic
                 odd_italics ^= True
-                continue
-            if match[3] is None:  # bold
-                s, e = match.span(1)
-                if s != e:  # four apostrophes, hide the first one
-                    shadow_copy[s] = 95  # _
-                append_match(match)
-                continue
-            # bold-italic
-            s, e = match.span(1)
-            es = e - s
-            if es:  # more than 5 apostrophes, hide the previous ones
-                shadow_copy[s:e] = b'_' * es
-            odd_bold_italics ^= True
-            odd_italics ^= True
-        return shadow_copy
+                return m[0]
+            if n == 3:  # bold
+                append_bold_start(starts[0])
+                return m[0]
+            if n == 5:
+                odd_bold_italics ^= True
+                odd_italics ^= True
+                return m[0]
+            if n == 4:  # four apostrophes -> hide the first one
+                s = starts[1]
+                append_bold_start(s)
+                return b'_' * (s - starts[0]) + m.string[s : m.end()]
+            if n > 5:  # more than 5 apostrophes -> hide the prior ones
+                odd_bold_italics ^= True
+                odd_italics ^= True
+                s = starts[-5]
+                return b'_' * (s - starts[0]) + m.string[s : m.end()]
+            raise  # execution should never reach here
+        return bytearray(b'\n').join(
+            [
+                process_line(substitute_apostrophes(process_apostrophes, line))
+                for line in self._shadow.splitlines()
+            ]
+        )
     def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
         for prop in (
@@ -1121,8 +1136,8 @@ class WikiText:
             bold_spans = tts_setdefault('Bold', [])
             get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
             bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
-            for match in bold_matches:
-                ms, me = match.span()
+            for m in bold_matches:
+                ms, me = m.span()
                 b, e = s + ms, s + me
                 old_span = get_old_bold_span((b, e))
                 if old_span is None:
@@ -1144,16 +1159,16 @@ class WikiText:
         # filter_cls is None or filter_cls is Italic
         # remove bold tokens before searching for italics
-        for match in bold_matches:
-            ms, me = match.span()
-            cs, ce = match.span(1)  # content
+        for m in bold_matches:
+            ms, me = m.span()
+            cs, ce = m.span(1)  # content
             balanced_shadow[ms:cs] = b'_' * (cs - ms)
             balanced_shadow[ce:me] = b'_' * (me - ce)
         italic_spans = tts_setdefault('Italic', [])
         get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
-        for match in ITALIC_FINDITER(balanced_shadow, rs, re):
-            ms, me = match.span()
+        for m in ITALIC_FINDITER(balanced_shadow, rs, re):
+            ms, me = m.span()
             b, e = span = s + ms, s + me
             old_span = get_old_italic_span(span)
             if old_span is None:
@@ -1162,9 +1177,7 @@ class WikiText:
             else:
                 span = old_span
             append(
-                Italic(
-                    _lststr, type_to_spans, span, 'Bold', me != match.end(1)
-                )
+                Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
             )
         if recursive and filter_cls is Italic:
             self._bolds_italics_recurse(result, filter_cls)
@@ -1338,7 +1351,7 @@ class WikiText:
         if level is not None:
             section_spans = compress(
-                section_spans, [l == level for l in levels]
+                section_spans, [lvl == level for lvl in levels]
             )
         return self._section_spans_to_sections(section_spans, shadow)