PyPI - wikitextparser - Versions diffs - 0.54.1.dev0__tar.gz → 0.55.8__tar.gz - Mend

wikitextparser 0.54.1.dev0__tar.gz → 0.55.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/CHANGELOG.rst RENAMED Viewed

@@ -1,5 +1,35 @@
-Unreleased
-----------
+v0.55.8
+-------
+- Fixed: Equal signs in extension tag attributes are no longer confused with name-value separator in arguments. (#128)
+v0.55.7
+-------
+- Fixed a bug in ``plain_text``. (#126)
+- Fixed another bug in parsing tables that end without a ``|}`` mark. (#125)
+v0.55.6
+-------
+- Fixed bug in parsing tables that end without a ``|}`` mark. (#124)
+v0.55.5
+-------
+- Fixed: regression in ``plain_text`` not being  able to handle wikilinks only containing fragment/anchor, not title.
+v0.55.4
+-------
+- ``plain_text`` method now uses a more accurate image-detection algorithm.
+v0.55.3
+-------
+- Fixed and improved handling of tables and images in ``plain_text`` (#122)
+v0.55.0
+-------
+- Added:  ``top_levels_only`` argument to ``get_sections``.
+- Deprecated: Calling ``get_sections`` with positional arguments is now deprecated.
+v0.54.1
+-------
 - Fixed some bugs in ``plain_text`` method. (#119, #120)
 - Fixed bug in ``get_tags``. (#121)

wikitextparser-0.55.8/MANIFEST.in ADDED Viewed

@@ -0,0 +1,3 @@
+prune tests
+prune docs
+prune dev

{wikitextparser-0.54.1.dev0/wikitextparser.egg-info → wikitextparser-0.55.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: wikitextparser
-Version: 0.54.1.dev0
+Version: 0.55.8
 Summary: A simple parsing tool for MediaWiki's wikitext markup.
 Author-email: 5j9 <5j9@users.noreply.github.com>
 License: GNU General Public License v3 (GPLv3)
@@ -15,9 +15,7 @@ License-File: LICENSE.md
 Requires-Dist: regex>=2022.9.11
 Requires-Dist: wcwidth
 Provides-Extra: dev
-Requires-Dist: path; extra == "dev"
 Requires-Dist: coverage; extra == "dev"
-Requires-Dist: twine; extra == "dev"
 Provides-Extra: tests
 Requires-Dist: pytest; extra == "tests"

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/pyproject.toml RENAMED Viewed

@@ -38,9 +38,7 @@ Homepage = "https://github.com/5j9/wikitextparser"
 [project.optional-dependencies]
 dev = [
-    "path",
     "coverage",
-    "twine",
 ]
 tests = [
     "pytest",
@@ -56,7 +54,17 @@ namespaces = false
 [tool.setuptools.dynamic.version]
 attr = "wikitextparser.__version__"
-[tool.isort]
-profile = "black"
-line_length = 79
-combine_as_imports = true
+[tool.ruff]
+line-length = 79
+format.quote-style = 'single'
+isort.combine-as-imports = true
+extend-select = [
+    'I',  # isort
+    'UP',  # pyupgrade
+]
+ignore = [
+    'UP027',  # list comprehensions are faster than generator expressions
+]
+[tool.pytest.ini_options]
+addopts = '--quiet --tb=short'

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/__init__.py RENAMED Viewed

@@ -1,8 +1,8 @@
 # Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
-__version__ = '0.54.1.dev0'
+__version__ = '0.55.8'
 from . import _wikitext
-from ._argument import Argument
+from ._argument import Argument  # noqa: F401
 from ._comment_bold_italic import Bold, Comment, Italic
 from ._externallink import ExternalLink
 from ._parameter import Parameter

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_argument.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, List, MutableSequence, Optional, Union
+from typing import Dict, List, MutableSequence, Optional, Union
 from regex import DOTALL, MULTILINE

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_cell.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, List, Match, MutableSequence, Union
+from typing import Dict, List, Match, MutableSequence, Union
 from regex import DOTALL, VERBOSE

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_comment_bold_italic.py RENAMED Viewed

@@ -1,18 +1,18 @@
-from typing import Dict, List, MutableSequence, Optional, Union
+from typing import Dict, List, MutableSequence, Optional, Tuple, Union
 from regex import DOTALL, MULTILINE
 from ._wikitext import SubWikiText, rc
 COMMENT_PATTERN = r'<!--[\s\S]*?(?>-->|\Z)'
-COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ")*+"
-COMMENT_COMMA = "(?>" + COMMENT_PATTERN + ")*+'"
+COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ')*+'
+COMMENT_COMMA = '(?>' + COMMENT_PATTERN + ")*+'"
 BOLD_FULLMATCH = rc(
-    COMMA_COMMENT * 2 + "'(.*?)(?>'" + COMMENT_COMMA * 2 + "|$)",
+    COMMA_COMMENT * 2 + "'(.*?)(?>'" + COMMENT_COMMA * 2 + '|$)',
     MULTILINE | DOTALL,
 ).fullmatch
 ITALIC_FULLMATCH = rc(
-    COMMA_COMMENT + "'(.*?)(?>'" + COMMENT_COMMA + "|$)", DOTALL
+    COMMA_COMMENT + "'(.*?)(?>'" + COMMENT_COMMA + '|$)', DOTALL
 ).fullmatch
 ITALIC_NOEND_FULLMATCH = rc(COMMA_COMMENT + "'(.*)", DOTALL).fullmatch
@@ -49,7 +49,7 @@ class BoldItalic(SubWikiText):
         self[b:e] = s
     @property
-    def _relative_contents_end(self) -> tuple:
+    def _content_span(self) -> Tuple[int, int]:
         # noinspection PyUnresolvedReferences
         return self._match.span(1)

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_config.py RENAMED Viewed

@@ -126,22 +126,97 @@ _tag_extensions = _parsable_tag_extensions | _unparsable_tag_extensions
 # https://phabricator.wikimedia.org/source/mediawiki/browse/master/includes/DefaultSettings.php
 # See also: https://www.mediawiki.org/wiki/Help:Links#External_links
 _bare_external_link_schemes = {
-    'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://',
-    'https://', 'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:',
-    'nntp://', 'redis://', 'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://',
-    'svn://', 'tel:', 'telnet://', 'urn:', 'worldwind://', 'xmpp:',  # '//'
+    'bitcoin:',
+    'ftp://',
+    'ftps://',
+    'geo:',
+    'git://',
+    'gopher://',
+    'http://',
+    'https://',
+    'irc://',
+    'ircs://',
+    'magnet:',
+    'mailto:',
+    'mms://',
+    'news:',
+    'nntp://',
+    'redis://',
+    'sftp://',
+    'sip:',
+    'sips:',
+    'sms:',
+    'ssh://',
+    'svn://',
+    'tel:',
+    'telnet://',
+    'urn:',
+    'worldwind://',
+    'xmpp:',  # '//'
 }
 # generated using dev/html_tag_names.py
 _valid_html_tag_names = {
-    's', 'ins', 'code', 'b', 'ol', 'i', 'h5', 'th', 'dt', 'td',
-    'wbr', 'div', 'big', 'p', 'small', 'h4', 'tt', 'span', 'font',
-    'ruby', 'h3', 'dfn', 'rb', 'li', 'h1', 'cite', 'dl', 'rtc', 'em',
-    'q', 'h2', 'samp', 'strike', 'time', 'blockquote', 'bdi', 'del',
-    'br', 'rp', 'hr', 'abbr', 'sub', 'u', 'kbd', 'table', 'rt', 'dd',
-    'var', 'ul', 'tr', 'center', 'data', 'strong', 'mark',
-    'h6', 'bdo', 'caption', 'sup'}
-_HTML_TAG_NAME = regex_pattern(_valid_html_tag_names) + br'\b'
+    's',
+    'ins',
+    'code',
+    'b',
+    'ol',
+    'i',
+    'h5',
+    'th',
+    'dt',
+    'td',
+    'wbr',
+    'div',
+    'big',
+    'p',
+    'small',
+    'h4',
+    'tt',
+    'span',
+    'font',
+    'ruby',
+    'h3',
+    'dfn',
+    'rb',
+    'li',
+    'h1',
+    'cite',
+    'dl',
+    'rtc',
+    'em',
+    'q',
+    'h2',
+    'samp',
+    'strike',
+    'time',
+    'blockquote',
+    'bdi',
+    'del',
+    'br',
+    'rp',
+    'hr',
+    'abbr',
+    'sub',
+    'u',
+    'kbd',
+    'table',
+    'rt',
+    'dd',
+    'var',
+    'ul',
+    'tr',
+    'center',
+    'data',
+    'strong',
+    'mark',
+    'h6',
+    'bdo',
+    'caption',
+    'sup',
+}
+_HTML_TAG_NAME = regex_pattern(_valid_html_tag_names) + rb'\b'
 _parser_functions = {
     'ARTICLEPAGENAME',
@@ -264,3 +339,43 @@ _parser_functions = {
     'ucfirst',
     'urlencode',
 }
+# https://github.com/wikimedia/mediawiki/blob/de18cff244e8fab2e1ab2470c3b444e76b305e12/includes/libs/mime/MimeAnalyzer.php#L425
+KNOWN_FILE_EXTENSIONS = {
+    'bmp',
+    'djvu',
+    'gif',
+    'iff',
+    'jb2',
+    'jp2',
+    'jpc',
+    'jpeg',
+    'jpg',
+    'jpx',
+    'mid',
+    'mka',
+    'mkv',
+    'mp3',
+    'oga',
+    'ogg',
+    'ogv',
+    'ogx',
+    'opus',
+    'pdf',
+    'png',
+    'psd',
+    'spx',
+    'stl',
+    'svg',
+    'swc',
+    'swf',
+    'tif',
+    'tiff',
+    'wbmp',
+    'webm',
+    'webp',
+    'wmf',
+    'xbm',
+    'xcf',
+}

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_externallink.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional
 from ._wikitext import BRACKET_EXTERNAL_LINK_URL, IGNORECASE, SubWikiText, rc

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_parameter.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Tuple
 from ._wikitext import WS, SubWikiText
@@ -97,12 +97,12 @@ class Parameter(SubWikiText):
                 len('{{{' + name + '|') : len(
                     '{{{' + name + '|' + innermost_default
                 )
-            ] = ('{{{' + new_default_name + '|' + innermost_default + '}}}')
+            ] = '{{{' + new_default_name + '|' + innermost_default + '}}}'
     @property
     def parameters(self) -> List['Parameter']:
         return super().parameters[1:]
     @property
-    def _relative_contents_end(self) -> tuple:
+    def _content_span(self) -> Tuple[int, int]:
         return 3, -3

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_parser_function.py RENAMED Viewed

@@ -1,5 +1,5 @@
 from bisect import insort
-from typing import Iterable, List, Union
+from typing import Iterable, List, Tuple, Union
 from ._argument import Argument
 from ._wikilist import WikiList
@@ -19,6 +19,10 @@ class SubWikiTextWithArgs(SubWikiText):
     _name_args_matcher = NotImplemented
     _first_arg_sep = 0
+    @property
+    def _content_span(self) -> Tuple[int, int]:
+        return 2, -2
     @property
     def nesting_level(self) -> int:
         """Return the nesting level of self.
@@ -95,10 +99,6 @@ class SubWikiTextWithArgs(SubWikiText):
     def name(self, newname: str) -> None:
         self[2 : 2 + len(self.name)] = newname
-    @property
-    def _relative_contents_end(self) -> tuple:
-        return 2, -2
 class ParserFunction(SubWikiTextWithArgs):
     __slots__ = ()

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_section.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional
 from ._wikitext import SubWikiText, rc
@@ -70,7 +70,7 @@ class Section(SubWikiText):
         if m is None:
             raise RuntimeError(
                 "Can't set title for a lead section. "
-                "Try adding it to contents."
+                'Try adding it to contents.'
             )
         self[m.start(2) : m.end(2)] = value

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_spans.py RENAMED Viewed

@@ -1,4 +1,4 @@
-"""Define the functions required for parsing wikitext into spans."""
+"""Define the functions required for parsing wikitext into spans."""
 from functools import partial
 from typing import Callable, Dict, Optional
@@ -27,7 +27,9 @@ PF_TL_FINDITER = rc(
     rb'[\s\0]*+'
     rb'(?>'
     rb'\#[^{}\s:|]++'  # parser function
-    rb'|' + regex_pattern(_parser_functions)[3:] +  # )
+    rb'|'
+    + regex_pattern(_parser_functions)[3:]  # )
+    +
     # should not have any arguments or the arg should start with a :
     rb'(?:'
     rb':(?>[^{}]*+|}(?!})|{(?!{))*+'
@@ -92,9 +94,9 @@ WIKILINK_PARAM_FINDITER = rc(
     REVERSE,
 ).finditer
-MARKUP = b''.maketrans(b"|[]'{}", b'_\2\3___')
-BRACES_PIPE_NEWLINE = b''.maketrans(b"|{}\n", b'____')
-BRACKETS = b''.maketrans(b"[]", b'__')
+MARKUP = b''.maketrans(b"=|[]'{}", b'\1_\2\3___')
+BRACES_PIPE_NEWLINE = b''.maketrans(b'|{}\n', b'____')
+BRACKETS = b''.maketrans(b'[]', b'__')
 PARSABLE_TAG_EXTENSION_NAME = regex_pattern(_parsable_tag_extensions)
 UNPARSABLE_TAG_EXTENSION_NAME = regex_pattern(_unparsable_tag_extensions)
@@ -145,7 +147,7 @@ SPACE_CHARS = rb' \t\n\u000C\r\0'  # \s - \v
 CONTROL_CHARS = rb'\x00-\x1f\x7f-\x9f'
 # https://www.w3.org/TR/html5/syntax.html#syntax-attributes
 ATTR_NAME = rb'(?<attr_name>[^' + SPACE_CHARS + CONTROL_CHARS + rb'"\'>/=]++)'
-EQ_WS = rb'=[' + SPACE_CHARS + rb']*+'
+EQ_WS = rb'[=\1][' + SPACE_CHARS + rb']*+'
 UNQUOTED_ATTR_VAL = rb'(?<attr_value>[^' + SPACE_CHARS + rb'"\'=<>`]++)'
 QUOTED_ATTR_VAL = rb'(?<quote>[\'"])(?<attr_value>.*?)(?P=quote)'
 # May include character references, but for now, ignore the fact that they
@@ -169,7 +171,12 @@ ATTR_VAL = (
 # Ignore ambiguous ampersand for the sake of simplicity.
 ATTRS_PATTERN = (
     rb'(?<attr>'
-    rb'[' + SPACE_CHARS + rb']*+(?>' + ATTR_NAME + ATTR_VAL + rb')'
+    rb'['
+    + SPACE_CHARS
+    + rb']*+(?>'
+    + ATTR_NAME
+    + ATTR_VAL
+    + rb')'
     # See https://stackoverflow.com/a/3558200/2705757 for how HTML5
     # treats self-closing marks.
     + rb'|[^>]++'
@@ -272,39 +279,42 @@ def extract_tag_extensions(
             cms_append([s, e, None, byte_array[s:e]])
             byte_array[s:e] = b'\0' * (e - s)
             continue
         s, e = span('u')  # unparsable
         if s != -1:
             s -= 1  # <
             ets_append([s, e, match, byte_array[s:e]])
             byte_array[s:e] = (e - s) * b'_'
             continue
         s, e = span('p')  # parsable
-        if s != -1:
-            s -= 1  # <
-            ets_append([s, e, match, byte_array[s:e]])
-            cs, ce = span('c')  # content
-            extract_tag_extensions(
-                byte_array,
-                ets_append,
-                cms_append,
-                cs,
-                ce,
-                pms_append,
-                pfs_append,
-                tls_append,
-                wls_append,
-            )
-            _parse_sub_spans(
-                byte_array,
-                s,
-                e,
-                pms_append,
-                pfs_append,
-                tls_append,
-                wls_append,
-            )
-            byte_array[cs:ce] = b'_' * (ce - cs)
-            continue
+        s -= 1  # <
+        ets_append([s, e, match, byte_array[s:e]])
+        cs, ce = span('c')  # content
+        extract_tag_extensions(
+            byte_array,
+            ets_append,
+            cms_append,
+            cs,
+            ce,
+            pms_append,
+            pfs_append,
+            tls_append,
+            wls_append,
+        )
+        _parse_sub_spans(
+            byte_array,
+            s,
+            e,
+            pms_append,
+            pfs_append,
+            tls_append,
+            wls_append,
+        )
+        # Parsable extension tags are not nested but they create separate
+        # environment for bolds, italics, and tables.
+        # Also equal signs are not name-value separators in arguments.
+        byte_array[s:e] = byte_array[s:e].translate(MARKUP)
 def _parse_sub_spans(
@@ -316,9 +326,10 @@ def _parse_sub_spans(
     tls_append: Callable,
     wls_append: Callable,
 ) -> None:
-    start_and_end_tags = *HTML_START_TAG_FINDITER(
-        byte_array, start, end
-    ), *HTML_END_TAG_FINDITER(byte_array, start, end)
+    start_and_end_tags = (
+        *HTML_START_TAG_FINDITER(byte_array, start, end),
+        *HTML_END_TAG_FINDITER(byte_array, start, end),
+    )
     for match in start_and_end_tags:
         ms, me = match.span()
         byte_array[ms:me] = byte_array[ms:me].translate(BRACKETS)

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_table.py RENAMED Viewed

@@ -1,4 +1,4 @@
-from bisect import insort_right
+from bisect import insort_right
 from collections.abc import Mapping
 from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
@@ -48,7 +48,7 @@ HEAD_DIGITS = rc(rb'\s*+\d+').match
 # Captions are optional and only one should be placed between table-start
 # and the first row. Others captions are not part of the table and will
 # be ignored.
-FIRST_NON_CAPTION_LINE = rc(br'\n[\t \0]*+(\|(?!\+)|!)').search
+FIRST_NON_CAPTION_LINE = rc(rb'\n[\t \0]*+(\|(?!\+)|!)|\Z').search
 def head_int(value):
@@ -399,7 +399,7 @@ def _apply_attr_spans(
     # if not table_data:
     #     return table_data
     # 11
-    downward_growing_cells = []  # type: List[Tuple[Optional[T], int, int]]
+    downward_growing_cells: List[Tuple[Optional[T], int, int]] = []
     # 13, 18
     # Algorithm for processing rows
     for attrs_row, row in zip(table_attrs, table_data):

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_tag.py RENAMED Viewed

@@ -1,4 +1,4 @@
-"""Define the Tag class and tag-related regular expressions.
+"""Define the Tag class and tag-related regular expressions.
 Unlike MediaWiki which has very strict HTML rules, regexes
 defined in this module don't follow those restrictions and allow finding
@@ -8,7 +8,7 @@ For more info see:
 * https://www.mediawiki.org/wiki/HTML_restriction
 """
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 from regex import DOTALL, VERBOSE
@@ -28,18 +28,18 @@ from ._wikitext import SubWikiText, rc
 # ).finditer
 # Note that the following regex won't check for nested tags
 TAG_FULLMATCH = rc(
-    rb'''
-    <(?<name>[A-Za-z0-9]++)'''
+    rb"""
+    <(?<name>[A-Za-z0-9]++)"""
     + ATTRS_PATTERN
-    + rb'''
-    ['''
+    + rb"""
+    ["""
     + SPACE_CHARS
-    + rb''']*+
+    + rb"""]*+
     (?>
-        >(?<contents>.*)'''
+        >(?<contents>.*)"""
     + END_TAG_PATTERN.replace(rb'{name}', rb'(?<end_name>[A-Za-z0-9]++)')
-    + rb'''|>  # only start; no end tag; could be self-closing
-    )''',
+    + rb"""|>  # only start; no end tag; could be self-closing
+    )""",
     DOTALL | VERBOSE,
 ).fullmatch
@@ -215,5 +215,6 @@ class Tag(SubWikiTextWithAttrs):
         return super().get_tags(name)[1:]
     @property
-    def _relative_contents_end(self) -> tuple:
-        return self._match.span('contents')
+    def _content_span(self) -> Tuple[int, int]:
+        s = self.string
+        return s.find('>') + 1, s.rfind('<')

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_template.py RENAMED Viewed

@@ -29,6 +29,10 @@ class Template(SubWikiTextWithArgs):
     _name_args_matcher = TL_NAME_ARGS_FULLMATCH
     _first_arg_sep = 124
+    @property
+    def _content_span(self) -> Tuple[int, int]:
+        return 2, -2
     def normal_name(
         self,
         rm_namespaces=('Template',),

{wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_wikilink.py RENAMED Viewed

@@ -1,7 +1,7 @@
-"""Define the WikiLink class."""
+"""Define the WikiLink class."""
-from typing import List, Optional
+from typing import List, Optional, Tuple
 from regex import DOTALL
@@ -22,6 +22,13 @@ FULLMATCH = rc(
 class WikiLink(SubWikiText):
     __slots__ = '_cached_match'
+    @property
+    def _content_span(self) -> Tuple[int, int]:
+        s = self.string
+        f = s.find
+        rf = s.rfind
+        return f('[', f('[') + 1) + 1, rf(']', None, rf(']'))
     @property
     def _match(self):
         shadow = self._shadow
@@ -141,7 +148,3 @@ class WikiLink(SubWikiText):
     @property
     def wikilinks(self) -> List['WikiLink']:
         return super().wikilinks[1:]
-    @property
-    def _relative_contents_end(self) -> tuple:
-        return self._match.span(4)

wikitextparser 0.54.1.dev0__tar.gz → 0.55.8__tar.gz

wikitextparser 0.54.1.dev0__tar.gz → 0.55.8__tar.gz