wikitextparser 0.54.1.dev0__tar.gz → 0.55.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/CHANGELOG.rst +32 -2
  2. wikitextparser-0.55.8/MANIFEST.in +3 -0
  3. {wikitextparser-0.54.1.dev0/wikitextparser.egg-info → wikitextparser-0.55.8}/PKG-INFO +1 -3
  4. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/pyproject.toml +14 -6
  5. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/__init__.py +2 -2
  6. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_argument.py +1 -1
  7. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_cell.py +1 -1
  8. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_comment_bold_italic.py +6 -6
  9. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_config.py +127 -12
  10. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_externallink.py +1 -1
  11. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_parameter.py +3 -3
  12. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_parser_function.py +5 -5
  13. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_section.py +2 -2
  14. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_spans.py +47 -36
  15. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_table.py +3 -3
  16. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_tag.py +13 -12
  17. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_template.py +4 -0
  18. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_wikilink.py +9 -6
  19. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_wikitext.py +90 -33
  20. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8/wikitextparser.egg-info}/PKG-INFO +1 -3
  21. wikitextparser-0.55.8/wikitextparser.egg-info/SOURCES.txt +27 -0
  22. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser.egg-info/requires.txt +0 -2
  23. wikitextparser-0.54.1.dev0/dev/coverage.py +0 -10
  24. wikitextparser-0.54.1.dev0/dev/html_tag_names.py +0 -76
  25. wikitextparser-0.54.1.dev0/dev/profiles/pp_profile.py +0 -19
  26. wikitextparser-0.54.1.dev0/dev/profiles/table_profile.py +0 -19
  27. wikitextparser-0.54.1.dev0/dev/profiles/vs_mwpfh.py +0 -53
  28. wikitextparser-0.54.1.dev0/docs/conf.py +0 -183
  29. wikitextparser-0.54.1.dev0/tests/__main__.py +0 -3
  30. wikitextparser-0.54.1.dev0/tests/test_argument.py +0 -131
  31. wikitextparser-0.54.1.dev0/tests/test_cell.py +0 -102
  32. wikitextparser-0.54.1.dev0/tests/test_comment_bold_italic.py +0 -54
  33. wikitextparser-0.54.1.dev0/tests/test_config.py +0 -63
  34. wikitextparser-0.54.1.dev0/tests/test_externallink.py +0 -88
  35. wikitextparser-0.54.1.dev0/tests/test_parameter.py +0 -104
  36. wikitextparser-0.54.1.dev0/tests/test_parser_function.py +0 -74
  37. wikitextparser-0.54.1.dev0/tests/test_section.py +0 -106
  38. wikitextparser-0.54.1.dev0/tests/test_spans.py +0 -628
  39. wikitextparser-0.54.1.dev0/tests/test_table.py +0 -664
  40. wikitextparser-0.54.1.dev0/tests/test_tag.py +0 -197
  41. wikitextparser-0.54.1.dev0/tests/test_template.py +0 -336
  42. wikitextparser-0.54.1.dev0/tests/test_wikilink.py +0 -208
  43. wikitextparser-0.54.1.dev0/tests/test_wikilist.py +0 -163
  44. wikitextparser-0.54.1.dev0/tests/wikitext/test_external_links.py +0 -226
  45. wikitextparser-0.54.1.dev0/tests/wikitext/test_get_bolds_and_italics.py +0 -138
  46. wikitextparser-0.54.1.dev0/tests/wikitext/test_get_lists.py +0 -100
  47. wikitextparser-0.54.1.dev0/tests/wikitext/test_get_tags.py +0 -92
  48. wikitextparser-0.54.1.dev0/tests/wikitext/test_pformat.py +0 -406
  49. wikitextparser-0.54.1.dev0/tests/wikitext/test_plain_text.py +0 -186
  50. wikitextparser-0.54.1.dev0/tests/wikitext/test_sections.py +0 -107
  51. wikitextparser-0.54.1.dev0/tests/wikitext/test_tables.py +0 -181
  52. wikitextparser-0.54.1.dev0/tests/wikitext/test_wikitext.py +0 -377
  53. wikitextparser-0.54.1.dev0/wikitextparser.egg-info/SOURCES.txt +0 -56
  54. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/LICENSE.md +0 -0
  55. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/README.rst +0 -0
  56. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/setup.cfg +0 -0
  57. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_wikilist.py +0 -0
  58. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser.egg-info/dependency_links.txt +0 -0
  59. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser.egg-info/top_level.txt +0 -0
  60. {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser.egg-info/zip-safe +0 -0
@@ -1,5 +1,35 @@
1
- Unreleased
2
- ----------
1
+ v0.55.8
2
+ -------
3
+ - Fixed: Equal signs in extension tag attributes are no longer confused with name-value separator in arguments. (#128)
4
+
5
+ v0.55.7
6
+ -------
7
+ - Fixed a bug in ``plain_text``. (#126)
8
+ - Fixed another bug in parsing tables that end without a ``|}`` mark. (#125)
9
+
10
+ v0.55.6
11
+ -------
12
+ - Fixed bug in parsing tables that end without a ``|}`` mark. (#124)
13
+
14
+ v0.55.5
15
+ -------
16
+ - Fixed: regression in ``plain_text`` not being able to handle wikilinks only containing fragment/anchor, not title.
17
+
18
+ v0.55.4
19
+ -------
20
+ - ``plain_text`` method now uses a more accurate image-detection algorithm.
21
+
22
+ v0.55.3
23
+ -------
24
+ - Fixed and improved handling of tables and images in ``plain_text`` (#122)
25
+
26
+ v0.55.0
27
+ -------
28
+ - Added: ``top_levels_only`` argument to ``get_sections``.
29
+ - Deprecated: Calling ``get_sections`` with positional arguments is now deprecated.
30
+
31
+ v0.54.1
32
+ -------
3
33
  - Fixed some bugs in ``plain_text`` method. (#119, #120)
4
34
  - Fixed bug in ``get_tags``. (#121)
5
35
 
@@ -0,0 +1,3 @@
1
+ prune tests
2
+ prune docs
3
+ prune dev
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: wikitextparser
3
- Version: 0.54.1.dev0
3
+ Version: 0.55.8
4
4
  Summary: A simple parsing tool for MediaWiki's wikitext markup.
5
5
  Author-email: 5j9 <5j9@users.noreply.github.com>
6
6
  License: GNU General Public License v3 (GPLv3)
@@ -15,9 +15,7 @@ License-File: LICENSE.md
15
15
  Requires-Dist: regex>=2022.9.11
16
16
  Requires-Dist: wcwidth
17
17
  Provides-Extra: dev
18
- Requires-Dist: path; extra == "dev"
19
18
  Requires-Dist: coverage; extra == "dev"
20
- Requires-Dist: twine; extra == "dev"
21
19
  Provides-Extra: tests
22
20
  Requires-Dist: pytest; extra == "tests"
23
21
 
@@ -38,9 +38,7 @@ Homepage = "https://github.com/5j9/wikitextparser"
38
38
 
39
39
  [project.optional-dependencies]
40
40
  dev = [
41
- "path",
42
41
  "coverage",
43
- "twine",
44
42
  ]
45
43
  tests = [
46
44
  "pytest",
@@ -56,7 +54,17 @@ namespaces = false
56
54
  [tool.setuptools.dynamic.version]
57
55
  attr = "wikitextparser.__version__"
58
56
 
59
- [tool.isort]
60
- profile = "black"
61
- line_length = 79
62
- combine_as_imports = true
57
+ [tool.ruff]
58
+ line-length = 79
59
+ format.quote-style = 'single'
60
+ isort.combine-as-imports = true
61
+ extend-select = [
62
+ 'I', # isort
63
+ 'UP', # pyupgrade
64
+ ]
65
+ ignore = [
66
+ 'UP027', # list comprehensions are faster than generator expressions
67
+ ]
68
+
69
+ [tool.pytest.ini_options]
70
+ addopts = '--quiet --tb=short'
@@ -1,8 +1,8 @@
1
1
  # Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
2
- __version__ = '0.54.1.dev0'
2
+ __version__ = '0.55.8'
3
3
 
4
4
  from . import _wikitext
5
- from ._argument import Argument
5
+ from ._argument import Argument # noqa: F401
6
6
  from ._comment_bold_italic import Bold, Comment, Italic
7
7
  from ._externallink import ExternalLink
8
8
  from ._parameter import Parameter
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, MutableSequence, Optional, Union
1
+ from typing import Dict, List, MutableSequence, Optional, Union
2
2
 
3
3
  from regex import DOTALL, MULTILINE
4
4
 
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, Match, MutableSequence, Union
1
+ from typing import Dict, List, Match, MutableSequence, Union
2
2
 
3
3
  from regex import DOTALL, VERBOSE
4
4
 
@@ -1,18 +1,18 @@
1
- from typing import Dict, List, MutableSequence, Optional, Union
1
+ from typing import Dict, List, MutableSequence, Optional, Tuple, Union
2
2
 
3
3
  from regex import DOTALL, MULTILINE
4
4
 
5
5
  from ._wikitext import SubWikiText, rc
6
6
 
7
7
  COMMENT_PATTERN = r'<!--[\s\S]*?(?>-->|\Z)'
8
- COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ")*+"
9
- COMMENT_COMMA = "(?>" + COMMENT_PATTERN + ")*+'"
8
+ COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ')*+'
9
+ COMMENT_COMMA = '(?>' + COMMENT_PATTERN + ")*+'"
10
10
  BOLD_FULLMATCH = rc(
11
- COMMA_COMMENT * 2 + "'(.*?)(?>'" + COMMENT_COMMA * 2 + "|$)",
11
+ COMMA_COMMENT * 2 + "'(.*?)(?>'" + COMMENT_COMMA * 2 + '|$)',
12
12
  MULTILINE | DOTALL,
13
13
  ).fullmatch
14
14
  ITALIC_FULLMATCH = rc(
15
- COMMA_COMMENT + "'(.*?)(?>'" + COMMENT_COMMA + "|$)", DOTALL
15
+ COMMA_COMMENT + "'(.*?)(?>'" + COMMENT_COMMA + '|$)', DOTALL
16
16
  ).fullmatch
17
17
  ITALIC_NOEND_FULLMATCH = rc(COMMA_COMMENT + "'(.*)", DOTALL).fullmatch
18
18
 
@@ -49,7 +49,7 @@ class BoldItalic(SubWikiText):
49
49
  self[b:e] = s
50
50
 
51
51
  @property
52
- def _relative_contents_end(self) -> tuple:
52
+ def _content_span(self) -> Tuple[int, int]:
53
53
  # noinspection PyUnresolvedReferences
54
54
  return self._match.span(1)
55
55
 
@@ -126,22 +126,97 @@ _tag_extensions = _parsable_tag_extensions | _unparsable_tag_extensions
126
126
  # https://phabricator.wikimedia.org/source/mediawiki/browse/master/includes/DefaultSettings.php
127
127
  # See also: https://www.mediawiki.org/wiki/Help:Links#External_links
128
128
  _bare_external_link_schemes = {
129
- 'bitcoin:', 'ftp://', 'ftps://', 'geo:', 'git://', 'gopher://', 'http://',
130
- 'https://', 'irc://', 'ircs://', 'magnet:', 'mailto:', 'mms://', 'news:',
131
- 'nntp://', 'redis://', 'sftp://', 'sip:', 'sips:', 'sms:', 'ssh://',
132
- 'svn://', 'tel:', 'telnet://', 'urn:', 'worldwind://', 'xmpp:', # '//'
129
+ 'bitcoin:',
130
+ 'ftp://',
131
+ 'ftps://',
132
+ 'geo:',
133
+ 'git://',
134
+ 'gopher://',
135
+ 'http://',
136
+ 'https://',
137
+ 'irc://',
138
+ 'ircs://',
139
+ 'magnet:',
140
+ 'mailto:',
141
+ 'mms://',
142
+ 'news:',
143
+ 'nntp://',
144
+ 'redis://',
145
+ 'sftp://',
146
+ 'sip:',
147
+ 'sips:',
148
+ 'sms:',
149
+ 'ssh://',
150
+ 'svn://',
151
+ 'tel:',
152
+ 'telnet://',
153
+ 'urn:',
154
+ 'worldwind://',
155
+ 'xmpp:', # '//'
133
156
  }
134
157
 
135
158
  # generated using dev/html_tag_names.py
136
159
  _valid_html_tag_names = {
137
- 's', 'ins', 'code', 'b', 'ol', 'i', 'h5', 'th', 'dt', 'td',
138
- 'wbr', 'div', 'big', 'p', 'small', 'h4', 'tt', 'span', 'font',
139
- 'ruby', 'h3', 'dfn', 'rb', 'li', 'h1', 'cite', 'dl', 'rtc', 'em',
140
- 'q', 'h2', 'samp', 'strike', 'time', 'blockquote', 'bdi', 'del',
141
- 'br', 'rp', 'hr', 'abbr', 'sub', 'u', 'kbd', 'table', 'rt', 'dd',
142
- 'var', 'ul', 'tr', 'center', 'data', 'strong', 'mark',
143
- 'h6', 'bdo', 'caption', 'sup'}
144
- _HTML_TAG_NAME = regex_pattern(_valid_html_tag_names) + br'\b'
160
+ 's',
161
+ 'ins',
162
+ 'code',
163
+ 'b',
164
+ 'ol',
165
+ 'i',
166
+ 'h5',
167
+ 'th',
168
+ 'dt',
169
+ 'td',
170
+ 'wbr',
171
+ 'div',
172
+ 'big',
173
+ 'p',
174
+ 'small',
175
+ 'h4',
176
+ 'tt',
177
+ 'span',
178
+ 'font',
179
+ 'ruby',
180
+ 'h3',
181
+ 'dfn',
182
+ 'rb',
183
+ 'li',
184
+ 'h1',
185
+ 'cite',
186
+ 'dl',
187
+ 'rtc',
188
+ 'em',
189
+ 'q',
190
+ 'h2',
191
+ 'samp',
192
+ 'strike',
193
+ 'time',
194
+ 'blockquote',
195
+ 'bdi',
196
+ 'del',
197
+ 'br',
198
+ 'rp',
199
+ 'hr',
200
+ 'abbr',
201
+ 'sub',
202
+ 'u',
203
+ 'kbd',
204
+ 'table',
205
+ 'rt',
206
+ 'dd',
207
+ 'var',
208
+ 'ul',
209
+ 'tr',
210
+ 'center',
211
+ 'data',
212
+ 'strong',
213
+ 'mark',
214
+ 'h6',
215
+ 'bdo',
216
+ 'caption',
217
+ 'sup',
218
+ }
219
+ _HTML_TAG_NAME = regex_pattern(_valid_html_tag_names) + rb'\b'
145
220
 
146
221
  _parser_functions = {
147
222
  'ARTICLEPAGENAME',
@@ -264,3 +339,43 @@ _parser_functions = {
264
339
  'ucfirst',
265
340
  'urlencode',
266
341
  }
342
+
343
+
344
+ # https://github.com/wikimedia/mediawiki/blob/de18cff244e8fab2e1ab2470c3b444e76b305e12/includes/libs/mime/MimeAnalyzer.php#L425
345
+ KNOWN_FILE_EXTENSIONS = {
346
+ 'bmp',
347
+ 'djvu',
348
+ 'gif',
349
+ 'iff',
350
+ 'jb2',
351
+ 'jp2',
352
+ 'jpc',
353
+ 'jpeg',
354
+ 'jpg',
355
+ 'jpx',
356
+ 'mid',
357
+ 'mka',
358
+ 'mkv',
359
+ 'mp3',
360
+ 'oga',
361
+ 'ogg',
362
+ 'ogv',
363
+ 'ogx',
364
+ 'opus',
365
+ 'pdf',
366
+ 'png',
367
+ 'psd',
368
+ 'spx',
369
+ 'stl',
370
+ 'svg',
371
+ 'swc',
372
+ 'swf',
373
+ 'tif',
374
+ 'tiff',
375
+ 'wbmp',
376
+ 'webm',
377
+ 'webp',
378
+ 'wmf',
379
+ 'xbm',
380
+ 'xcf',
381
+ }
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import List, Optional
2
2
 
3
3
  from ._wikitext import BRACKET_EXTERNAL_LINK_URL, IGNORECASE, SubWikiText, rc
4
4
 
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import List, Optional, Tuple
2
2
 
3
3
  from ._wikitext import WS, SubWikiText
4
4
 
@@ -97,12 +97,12 @@ class Parameter(SubWikiText):
97
97
  len('{{{' + name + '|') : len(
98
98
  '{{{' + name + '|' + innermost_default
99
99
  )
100
- ] = ('{{{' + new_default_name + '|' + innermost_default + '}}}')
100
+ ] = '{{{' + new_default_name + '|' + innermost_default + '}}}'
101
101
 
102
102
  @property
103
103
  def parameters(self) -> List['Parameter']:
104
104
  return super().parameters[1:]
105
105
 
106
106
  @property
107
- def _relative_contents_end(self) -> tuple:
107
+ def _content_span(self) -> Tuple[int, int]:
108
108
  return 3, -3
@@ -1,5 +1,5 @@
1
1
  from bisect import insort
2
- from typing import Iterable, List, Union
2
+ from typing import Iterable, List, Tuple, Union
3
3
 
4
4
  from ._argument import Argument
5
5
  from ._wikilist import WikiList
@@ -19,6 +19,10 @@ class SubWikiTextWithArgs(SubWikiText):
19
19
  _name_args_matcher = NotImplemented
20
20
  _first_arg_sep = 0
21
21
 
22
+ @property
23
+ def _content_span(self) -> Tuple[int, int]:
24
+ return 2, -2
25
+
22
26
  @property
23
27
  def nesting_level(self) -> int:
24
28
  """Return the nesting level of self.
@@ -95,10 +99,6 @@ class SubWikiTextWithArgs(SubWikiText):
95
99
  def name(self, newname: str) -> None:
96
100
  self[2 : 2 + len(self.name)] = newname
97
101
 
98
- @property
99
- def _relative_contents_end(self) -> tuple:
100
- return 2, -2
101
-
102
102
 
103
103
  class ParserFunction(SubWikiTextWithArgs):
104
104
  __slots__ = ()
@@ -1,4 +1,4 @@
1
- from typing import Optional
1
+ from typing import Optional
2
2
 
3
3
  from ._wikitext import SubWikiText, rc
4
4
 
@@ -70,7 +70,7 @@ class Section(SubWikiText):
70
70
  if m is None:
71
71
  raise RuntimeError(
72
72
  "Can't set title for a lead section. "
73
- "Try adding it to contents."
73
+ 'Try adding it to contents.'
74
74
  )
75
75
  self[m.start(2) : m.end(2)] = value
76
76
 
@@ -1,4 +1,4 @@
1
- """Define the functions required for parsing wikitext into spans."""
1
+ """Define the functions required for parsing wikitext into spans."""
2
2
  from functools import partial
3
3
  from typing import Callable, Dict, Optional
4
4
 
@@ -27,7 +27,9 @@ PF_TL_FINDITER = rc(
27
27
  rb'[\s\0]*+'
28
28
  rb'(?>'
29
29
  rb'\#[^{}\s:|]++' # parser function
30
- rb'|' + regex_pattern(_parser_functions)[3:] + # )
30
+ rb'|'
31
+ + regex_pattern(_parser_functions)[3:] # )
32
+ +
31
33
  # should not have any arguments or the arg should start with a :
32
34
  rb'(?:'
33
35
  rb':(?>[^{}]*+|}(?!})|{(?!{))*+'
@@ -92,9 +94,9 @@ WIKILINK_PARAM_FINDITER = rc(
92
94
  REVERSE,
93
95
  ).finditer
94
96
 
95
- MARKUP = b''.maketrans(b"|[]'{}", b'_\2\3___')
96
- BRACES_PIPE_NEWLINE = b''.maketrans(b"|{}\n", b'____')
97
- BRACKETS = b''.maketrans(b"[]", b'__')
97
+ MARKUP = b''.maketrans(b"=|[]'{}", b'\1_\2\3___')
98
+ BRACES_PIPE_NEWLINE = b''.maketrans(b'|{}\n', b'____')
99
+ BRACKETS = b''.maketrans(b'[]', b'__')
98
100
 
99
101
  PARSABLE_TAG_EXTENSION_NAME = regex_pattern(_parsable_tag_extensions)
100
102
  UNPARSABLE_TAG_EXTENSION_NAME = regex_pattern(_unparsable_tag_extensions)
@@ -145,7 +147,7 @@ SPACE_CHARS = rb' \t\n\u000C\r\0' # \s - \v
145
147
  CONTROL_CHARS = rb'\x00-\x1f\x7f-\x9f'
146
148
  # https://www.w3.org/TR/html5/syntax.html#syntax-attributes
147
149
  ATTR_NAME = rb'(?<attr_name>[^' + SPACE_CHARS + CONTROL_CHARS + rb'"\'>/=]++)'
148
- EQ_WS = rb'=[' + SPACE_CHARS + rb']*+'
150
+ EQ_WS = rb'[=\1][' + SPACE_CHARS + rb']*+'
149
151
  UNQUOTED_ATTR_VAL = rb'(?<attr_value>[^' + SPACE_CHARS + rb'"\'=<>`]++)'
150
152
  QUOTED_ATTR_VAL = rb'(?<quote>[\'"])(?<attr_value>.*?)(?P=quote)'
151
153
  # May include character references, but for now, ignore the fact that they
@@ -169,7 +171,12 @@ ATTR_VAL = (
169
171
  # Ignore ambiguous ampersand for the sake of simplicity.
170
172
  ATTRS_PATTERN = (
171
173
  rb'(?<attr>'
172
- rb'[' + SPACE_CHARS + rb']*+(?>' + ATTR_NAME + ATTR_VAL + rb')'
174
+ rb'['
175
+ + SPACE_CHARS
176
+ + rb']*+(?>'
177
+ + ATTR_NAME
178
+ + ATTR_VAL
179
+ + rb')'
173
180
  # See https://stackoverflow.com/a/3558200/2705757 for how HTML5
174
181
  # treats self-closing marks.
175
182
  + rb'|[^>]++'
@@ -272,39 +279,42 @@ def extract_tag_extensions(
272
279
  cms_append([s, e, None, byte_array[s:e]])
273
280
  byte_array[s:e] = b'\0' * (e - s)
274
281
  continue
282
+
275
283
  s, e = span('u') # unparsable
276
284
  if s != -1:
277
285
  s -= 1 # <
278
286
  ets_append([s, e, match, byte_array[s:e]])
279
287
  byte_array[s:e] = (e - s) * b'_'
280
288
  continue
289
+
281
290
  s, e = span('p') # parsable
282
- if s != -1:
283
- s -= 1 # <
284
- ets_append([s, e, match, byte_array[s:e]])
285
- cs, ce = span('c') # content
286
- extract_tag_extensions(
287
- byte_array,
288
- ets_append,
289
- cms_append,
290
- cs,
291
- ce,
292
- pms_append,
293
- pfs_append,
294
- tls_append,
295
- wls_append,
296
- )
297
- _parse_sub_spans(
298
- byte_array,
299
- s,
300
- e,
301
- pms_append,
302
- pfs_append,
303
- tls_append,
304
- wls_append,
305
- )
306
- byte_array[cs:ce] = b'_' * (ce - cs)
307
- continue
291
+ s -= 1 # <
292
+ ets_append([s, e, match, byte_array[s:e]])
293
+ cs, ce = span('c') # content
294
+ extract_tag_extensions(
295
+ byte_array,
296
+ ets_append,
297
+ cms_append,
298
+ cs,
299
+ ce,
300
+ pms_append,
301
+ pfs_append,
302
+ tls_append,
303
+ wls_append,
304
+ )
305
+ _parse_sub_spans(
306
+ byte_array,
307
+ s,
308
+ e,
309
+ pms_append,
310
+ pfs_append,
311
+ tls_append,
312
+ wls_append,
313
+ )
314
+ # Parsable extension tags are not nested but they create separate
315
+ # environment for bolds, italics, and tables.
316
+ # Also equal signs are not name-value separators in arguments.
317
+ byte_array[s:e] = byte_array[s:e].translate(MARKUP)
308
318
 
309
319
 
310
320
  def _parse_sub_spans(
@@ -316,9 +326,10 @@ def _parse_sub_spans(
316
326
  tls_append: Callable,
317
327
  wls_append: Callable,
318
328
  ) -> None:
319
- start_and_end_tags = *HTML_START_TAG_FINDITER(
320
- byte_array, start, end
321
- ), *HTML_END_TAG_FINDITER(byte_array, start, end)
329
+ start_and_end_tags = (
330
+ *HTML_START_TAG_FINDITER(byte_array, start, end),
331
+ *HTML_END_TAG_FINDITER(byte_array, start, end),
332
+ )
322
333
  for match in start_and_end_tags:
323
334
  ms, me = match.span()
324
335
  byte_array[ms:me] = byte_array[ms:me].translate(BRACKETS)
@@ -1,4 +1,4 @@
1
- from bisect import insort_right
1
+ from bisect import insort_right
2
2
  from collections.abc import Mapping
3
3
  from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
4
4
 
@@ -48,7 +48,7 @@ HEAD_DIGITS = rc(rb'\s*+\d+').match
48
48
  # Captions are optional and only one should be placed between table-start
49
49
  # and the first row. Others captions are not part of the table and will
50
50
  # be ignored.
51
- FIRST_NON_CAPTION_LINE = rc(br'\n[\t \0]*+(\|(?!\+)|!)').search
51
+ FIRST_NON_CAPTION_LINE = rc(rb'\n[\t \0]*+(\|(?!\+)|!)|\Z').search
52
52
 
53
53
 
54
54
  def head_int(value):
@@ -399,7 +399,7 @@ def _apply_attr_spans(
399
399
  # if not table_data:
400
400
  # return table_data
401
401
  # 11
402
- downward_growing_cells = [] # type: List[Tuple[Optional[T], int, int]]
402
+ downward_growing_cells: List[Tuple[Optional[T], int, int]] = []
403
403
  # 13, 18
404
404
  # Algorithm for processing rows
405
405
  for attrs_row, row in zip(table_attrs, table_data):
@@ -1,4 +1,4 @@
1
- """Define the Tag class and tag-related regular expressions.
1
+ """Define the Tag class and tag-related regular expressions.
2
2
 
3
3
  Unlike MediaWiki which has very strict HTML rules, regexes
4
4
  defined in this module don't follow those restrictions and allow finding
@@ -8,7 +8,7 @@ For more info see:
8
8
  * https://www.mediawiki.org/wiki/HTML_restriction
9
9
  """
10
10
 
11
- from typing import Any, Dict, List, Optional
11
+ from typing import Any, Dict, List, Optional, Tuple
12
12
 
13
13
  from regex import DOTALL, VERBOSE
14
14
 
@@ -28,18 +28,18 @@ from ._wikitext import SubWikiText, rc
28
28
  # ).finditer
29
29
  # Note that the following regex won't check for nested tags
30
30
  TAG_FULLMATCH = rc(
31
- rb'''
32
- <(?<name>[A-Za-z0-9]++)'''
31
+ rb"""
32
+ <(?<name>[A-Za-z0-9]++)"""
33
33
  + ATTRS_PATTERN
34
- + rb'''
35
- ['''
34
+ + rb"""
35
+ ["""
36
36
  + SPACE_CHARS
37
- + rb''']*+
37
+ + rb"""]*+
38
38
  (?>
39
- >(?<contents>.*)'''
39
+ >(?<contents>.*)"""
40
40
  + END_TAG_PATTERN.replace(rb'{name}', rb'(?<end_name>[A-Za-z0-9]++)')
41
- + rb'''|> # only start; no end tag; could be self-closing
42
- )''',
41
+ + rb"""|> # only start; no end tag; could be self-closing
42
+ )""",
43
43
  DOTALL | VERBOSE,
44
44
  ).fullmatch
45
45
 
@@ -215,5 +215,6 @@ class Tag(SubWikiTextWithAttrs):
215
215
  return super().get_tags(name)[1:]
216
216
 
217
217
  @property
218
- def _relative_contents_end(self) -> tuple:
219
- return self._match.span('contents')
218
+ def _content_span(self) -> Tuple[int, int]:
219
+ s = self.string
220
+ return s.find('>') + 1, s.rfind('<')
@@ -29,6 +29,10 @@ class Template(SubWikiTextWithArgs):
29
29
  _name_args_matcher = TL_NAME_ARGS_FULLMATCH
30
30
  _first_arg_sep = 124
31
31
 
32
+ @property
33
+ def _content_span(self) -> Tuple[int, int]:
34
+ return 2, -2
35
+
32
36
  def normal_name(
33
37
  self,
34
38
  rm_namespaces=('Template',),
@@ -1,7 +1,7 @@
1
- """Define the WikiLink class."""
1
+ """Define the WikiLink class."""
2
2
 
3
3
 
4
- from typing import List, Optional
4
+ from typing import List, Optional, Tuple
5
5
 
6
6
  from regex import DOTALL
7
7
 
@@ -22,6 +22,13 @@ FULLMATCH = rc(
22
22
  class WikiLink(SubWikiText):
23
23
  __slots__ = '_cached_match'
24
24
 
25
+ @property
26
+ def _content_span(self) -> Tuple[int, int]:
27
+ s = self.string
28
+ f = s.find
29
+ rf = s.rfind
30
+ return f('[', f('[') + 1) + 1, rf(']', None, rf(']'))
31
+
25
32
  @property
26
33
  def _match(self):
27
34
  shadow = self._shadow
@@ -141,7 +148,3 @@ class WikiLink(SubWikiText):
141
148
  @property
142
149
  def wikilinks(self) -> List['WikiLink']:
143
150
  return super().wikilinks[1:]
144
-
145
- @property
146
- def _relative_contents_end(self) -> tuple:
147
- return self._match.span(4)