wikitextparser 0.54.1.dev0__tar.gz → 0.55.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/CHANGELOG.rst +32 -2
- wikitextparser-0.55.8/MANIFEST.in +3 -0
- {wikitextparser-0.54.1.dev0/wikitextparser.egg-info → wikitextparser-0.55.8}/PKG-INFO +1 -3
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/pyproject.toml +14 -6
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/__init__.py +2 -2
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_argument.py +1 -1
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_cell.py +1 -1
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_comment_bold_italic.py +6 -6
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_config.py +127 -12
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_externallink.py +1 -1
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_parameter.py +3 -3
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_parser_function.py +5 -5
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_section.py +2 -2
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_spans.py +47 -36
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_table.py +3 -3
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_tag.py +13 -12
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_template.py +4 -0
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_wikilink.py +9 -6
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_wikitext.py +90 -33
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8/wikitextparser.egg-info}/PKG-INFO +1 -3
- wikitextparser-0.55.8/wikitextparser.egg-info/SOURCES.txt +27 -0
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser.egg-info/requires.txt +0 -2
- wikitextparser-0.54.1.dev0/dev/coverage.py +0 -10
- wikitextparser-0.54.1.dev0/dev/html_tag_names.py +0 -76
- wikitextparser-0.54.1.dev0/dev/profiles/pp_profile.py +0 -19
- wikitextparser-0.54.1.dev0/dev/profiles/table_profile.py +0 -19
- wikitextparser-0.54.1.dev0/dev/profiles/vs_mwpfh.py +0 -53
- wikitextparser-0.54.1.dev0/docs/conf.py +0 -183
- wikitextparser-0.54.1.dev0/tests/__main__.py +0 -3
- wikitextparser-0.54.1.dev0/tests/test_argument.py +0 -131
- wikitextparser-0.54.1.dev0/tests/test_cell.py +0 -102
- wikitextparser-0.54.1.dev0/tests/test_comment_bold_italic.py +0 -54
- wikitextparser-0.54.1.dev0/tests/test_config.py +0 -63
- wikitextparser-0.54.1.dev0/tests/test_externallink.py +0 -88
- wikitextparser-0.54.1.dev0/tests/test_parameter.py +0 -104
- wikitextparser-0.54.1.dev0/tests/test_parser_function.py +0 -74
- wikitextparser-0.54.1.dev0/tests/test_section.py +0 -106
- wikitextparser-0.54.1.dev0/tests/test_spans.py +0 -628
- wikitextparser-0.54.1.dev0/tests/test_table.py +0 -664
- wikitextparser-0.54.1.dev0/tests/test_tag.py +0 -197
- wikitextparser-0.54.1.dev0/tests/test_template.py +0 -336
- wikitextparser-0.54.1.dev0/tests/test_wikilink.py +0 -208
- wikitextparser-0.54.1.dev0/tests/test_wikilist.py +0 -163
- wikitextparser-0.54.1.dev0/tests/wikitext/test_external_links.py +0 -226
- wikitextparser-0.54.1.dev0/tests/wikitext/test_get_bolds_and_italics.py +0 -138
- wikitextparser-0.54.1.dev0/tests/wikitext/test_get_lists.py +0 -100
- wikitextparser-0.54.1.dev0/tests/wikitext/test_get_tags.py +0 -92
- wikitextparser-0.54.1.dev0/tests/wikitext/test_pformat.py +0 -406
- wikitextparser-0.54.1.dev0/tests/wikitext/test_plain_text.py +0 -186
- wikitextparser-0.54.1.dev0/tests/wikitext/test_sections.py +0 -107
- wikitextparser-0.54.1.dev0/tests/wikitext/test_tables.py +0 -181
- wikitextparser-0.54.1.dev0/tests/wikitext/test_wikitext.py +0 -377
- wikitextparser-0.54.1.dev0/wikitextparser.egg-info/SOURCES.txt +0 -56
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/LICENSE.md +0 -0
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/README.rst +0 -0
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/setup.cfg +0 -0
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser/_wikilist.py +0 -0
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser.egg-info/dependency_links.txt +0 -0
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser.egg-info/top_level.txt +0 -0
- {wikitextparser-0.54.1.dev0 → wikitextparser-0.55.8}/wikitextparser.egg-info/zip-safe +0 -0
|
@@ -1,5 +1,35 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
v0.55.8
|
|
2
|
+
-------
|
|
3
|
+
- Fixed: Equal signs in extension tag attributes are no longer confused with name-value separator in arguments. (#128)
|
|
4
|
+
|
|
5
|
+
v0.55.7
|
|
6
|
+
-------
|
|
7
|
+
- Fixed a bug in ``plain_text``. (#126)
|
|
8
|
+
- Fixed another bug in parsing tables that end without a ``|}`` mark. (#125)
|
|
9
|
+
|
|
10
|
+
v0.55.6
|
|
11
|
+
-------
|
|
12
|
+
- Fixed bug in parsing tables that end without a ``|}`` mark. (#124)
|
|
13
|
+
|
|
14
|
+
v0.55.5
|
|
15
|
+
-------
|
|
16
|
+
- Fixed: regression in ``plain_text`` where wikilinks containing only a fragment/anchor (and no title) were not handled.
|
|
17
|
+
|
|
18
|
+
v0.55.4
|
|
19
|
+
-------
|
|
20
|
+
- ``plain_text`` method now uses a more accurate image-detection algorithm.
|
|
21
|
+
|
|
22
|
+
v0.55.3
|
|
23
|
+
-------
|
|
24
|
+
- Fixed and improved handling of tables and images in ``plain_text`` (#122)
|
|
25
|
+
|
|
26
|
+
v0.55.0
|
|
27
|
+
-------
|
|
28
|
+
- Added: ``top_levels_only`` argument to ``get_sections``.
|
|
29
|
+
- Deprecated: Calling ``get_sections`` with positional arguments is now deprecated.
|
|
30
|
+
|
|
31
|
+
v0.54.1
|
|
32
|
+
-------
|
|
3
33
|
- Fixed some bugs in ``plain_text`` method. (#119, #120)
|
|
4
34
|
- Fixed bug in ``get_tags``. (#121)
|
|
5
35
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: wikitextparser
|
|
3
|
-
Version: 0.54.1.dev0
|
|
3
|
+
Version: 0.55.8
|
|
4
4
|
Summary: A simple parsing tool for MediaWiki's wikitext markup.
|
|
5
5
|
Author-email: 5j9 <5j9@users.noreply.github.com>
|
|
6
6
|
License: GNU General Public License v3 (GPLv3)
|
|
@@ -15,9 +15,7 @@ License-File: LICENSE.md
|
|
|
15
15
|
Requires-Dist: regex>=2022.9.11
|
|
16
16
|
Requires-Dist: wcwidth
|
|
17
17
|
Provides-Extra: dev
|
|
18
|
-
Requires-Dist: path; extra == "dev"
|
|
19
18
|
Requires-Dist: coverage; extra == "dev"
|
|
20
|
-
Requires-Dist: twine; extra == "dev"
|
|
21
19
|
Provides-Extra: tests
|
|
22
20
|
Requires-Dist: pytest; extra == "tests"
|
|
23
21
|
|
|
@@ -38,9 +38,7 @@ Homepage = "https://github.com/5j9/wikitextparser"
|
|
|
38
38
|
|
|
39
39
|
[project.optional-dependencies]
|
|
40
40
|
dev = [
|
|
41
|
-
"path",
|
|
42
41
|
"coverage",
|
|
43
|
-
"twine",
|
|
44
42
|
]
|
|
45
43
|
tests = [
|
|
46
44
|
"pytest",
|
|
@@ -56,7 +54,17 @@ namespaces = false
|
|
|
56
54
|
[tool.setuptools.dynamic.version]
|
|
57
55
|
attr = "wikitextparser.__version__"
|
|
58
56
|
|
|
59
|
-
[tool.
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
57
|
+
[tool.ruff]
|
|
58
|
+
line-length = 79
|
|
59
|
+
format.quote-style = 'single'
|
|
60
|
+
isort.combine-as-imports = true
|
|
61
|
+
extend-select = [
|
|
62
|
+
'I', # isort
|
|
63
|
+
'UP', # pyupgrade
|
|
64
|
+
]
|
|
65
|
+
ignore = [
|
|
66
|
+
'UP027', # list comprehensions are faster than generator expressions
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
[tool.pytest.ini_options]
|
|
70
|
+
addopts = '--quiet --tb=short'
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# Scheme: [N!]N(.N)*[{a|b|rc}N][.postN][.devN]
|
|
2
|
-
__version__ = '0.54.1.dev0'
|
|
2
|
+
__version__ = '0.55.8'
|
|
3
3
|
|
|
4
4
|
from . import _wikitext
|
|
5
|
-
from ._argument import Argument
|
|
5
|
+
from ._argument import Argument # noqa: F401
|
|
6
6
|
from ._comment_bold_italic import Bold, Comment, Italic
|
|
7
7
|
from ._externallink import ExternalLink
|
|
8
8
|
from ._parameter import Parameter
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
|
|
1
|
+
from typing import Dict, List, MutableSequence, Optional, Tuple, Union
|
|
2
2
|
|
|
3
3
|
from regex import DOTALL, MULTILINE
|
|
4
4
|
|
|
5
5
|
from ._wikitext import SubWikiText, rc
|
|
6
6
|
|
|
7
7
|
COMMENT_PATTERN = r'<!--[\s\S]*?(?>-->|\Z)'
|
|
8
|
-
COMMA_COMMENT = "'(?>" + COMMENT_PATTERN +
|
|
9
|
-
COMMENT_COMMA =
|
|
8
|
+
COMMA_COMMENT = "'(?>" + COMMENT_PATTERN + ')*+'
|
|
9
|
+
COMMENT_COMMA = '(?>' + COMMENT_PATTERN + ")*+'"
|
|
10
10
|
BOLD_FULLMATCH = rc(
|
|
11
|
-
COMMA_COMMENT * 2 + "'(.*?)(?>'" + COMMENT_COMMA * 2 +
|
|
11
|
+
COMMA_COMMENT * 2 + "'(.*?)(?>'" + COMMENT_COMMA * 2 + '|$)',
|
|
12
12
|
MULTILINE | DOTALL,
|
|
13
13
|
).fullmatch
|
|
14
14
|
ITALIC_FULLMATCH = rc(
|
|
15
|
-
COMMA_COMMENT + "'(.*?)(?>'" + COMMENT_COMMA +
|
|
15
|
+
COMMA_COMMENT + "'(.*?)(?>'" + COMMENT_COMMA + '|$)', DOTALL
|
|
16
16
|
).fullmatch
|
|
17
17
|
ITALIC_NOEND_FULLMATCH = rc(COMMA_COMMENT + "'(.*)", DOTALL).fullmatch
|
|
18
18
|
|
|
@@ -49,7 +49,7 @@ class BoldItalic(SubWikiText):
|
|
|
49
49
|
self[b:e] = s
|
|
50
50
|
|
|
51
51
|
@property
|
|
52
|
-
def
|
|
52
|
+
def _content_span(self) -> Tuple[int, int]:
|
|
53
53
|
# noinspection PyUnresolvedReferences
|
|
54
54
|
return self._match.span(1)
|
|
55
55
|
|
|
@@ -126,22 +126,97 @@ _tag_extensions = _parsable_tag_extensions | _unparsable_tag_extensions
|
|
|
126
126
|
# https://phabricator.wikimedia.org/source/mediawiki/browse/master/includes/DefaultSettings.php
|
|
127
127
|
# See also: https://www.mediawiki.org/wiki/Help:Links#External_links
|
|
128
128
|
_bare_external_link_schemes = {
|
|
129
|
-
'bitcoin:',
|
|
130
|
-
'
|
|
131
|
-
'
|
|
132
|
-
'
|
|
129
|
+
'bitcoin:',
|
|
130
|
+
'ftp://',
|
|
131
|
+
'ftps://',
|
|
132
|
+
'geo:',
|
|
133
|
+
'git://',
|
|
134
|
+
'gopher://',
|
|
135
|
+
'http://',
|
|
136
|
+
'https://',
|
|
137
|
+
'irc://',
|
|
138
|
+
'ircs://',
|
|
139
|
+
'magnet:',
|
|
140
|
+
'mailto:',
|
|
141
|
+
'mms://',
|
|
142
|
+
'news:',
|
|
143
|
+
'nntp://',
|
|
144
|
+
'redis://',
|
|
145
|
+
'sftp://',
|
|
146
|
+
'sip:',
|
|
147
|
+
'sips:',
|
|
148
|
+
'sms:',
|
|
149
|
+
'ssh://',
|
|
150
|
+
'svn://',
|
|
151
|
+
'tel:',
|
|
152
|
+
'telnet://',
|
|
153
|
+
'urn:',
|
|
154
|
+
'worldwind://',
|
|
155
|
+
'xmpp:', # '//'
|
|
133
156
|
}
|
|
134
157
|
|
|
135
158
|
# generated using dev/html_tag_names.py
|
|
136
159
|
_valid_html_tag_names = {
|
|
137
|
-
's',
|
|
138
|
-
'
|
|
139
|
-
'
|
|
140
|
-
'
|
|
141
|
-
'
|
|
142
|
-
'
|
|
143
|
-
'
|
|
144
|
-
|
|
160
|
+
's',
|
|
161
|
+
'ins',
|
|
162
|
+
'code',
|
|
163
|
+
'b',
|
|
164
|
+
'ol',
|
|
165
|
+
'i',
|
|
166
|
+
'h5',
|
|
167
|
+
'th',
|
|
168
|
+
'dt',
|
|
169
|
+
'td',
|
|
170
|
+
'wbr',
|
|
171
|
+
'div',
|
|
172
|
+
'big',
|
|
173
|
+
'p',
|
|
174
|
+
'small',
|
|
175
|
+
'h4',
|
|
176
|
+
'tt',
|
|
177
|
+
'span',
|
|
178
|
+
'font',
|
|
179
|
+
'ruby',
|
|
180
|
+
'h3',
|
|
181
|
+
'dfn',
|
|
182
|
+
'rb',
|
|
183
|
+
'li',
|
|
184
|
+
'h1',
|
|
185
|
+
'cite',
|
|
186
|
+
'dl',
|
|
187
|
+
'rtc',
|
|
188
|
+
'em',
|
|
189
|
+
'q',
|
|
190
|
+
'h2',
|
|
191
|
+
'samp',
|
|
192
|
+
'strike',
|
|
193
|
+
'time',
|
|
194
|
+
'blockquote',
|
|
195
|
+
'bdi',
|
|
196
|
+
'del',
|
|
197
|
+
'br',
|
|
198
|
+
'rp',
|
|
199
|
+
'hr',
|
|
200
|
+
'abbr',
|
|
201
|
+
'sub',
|
|
202
|
+
'u',
|
|
203
|
+
'kbd',
|
|
204
|
+
'table',
|
|
205
|
+
'rt',
|
|
206
|
+
'dd',
|
|
207
|
+
'var',
|
|
208
|
+
'ul',
|
|
209
|
+
'tr',
|
|
210
|
+
'center',
|
|
211
|
+
'data',
|
|
212
|
+
'strong',
|
|
213
|
+
'mark',
|
|
214
|
+
'h6',
|
|
215
|
+
'bdo',
|
|
216
|
+
'caption',
|
|
217
|
+
'sup',
|
|
218
|
+
}
|
|
219
|
+
_HTML_TAG_NAME = regex_pattern(_valid_html_tag_names) + rb'\b'
|
|
145
220
|
|
|
146
221
|
_parser_functions = {
|
|
147
222
|
'ARTICLEPAGENAME',
|
|
@@ -264,3 +339,43 @@ _parser_functions = {
|
|
|
264
339
|
'ucfirst',
|
|
265
340
|
'urlencode',
|
|
266
341
|
}
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# https://github.com/wikimedia/mediawiki/blob/de18cff244e8fab2e1ab2470c3b444e76b305e12/includes/libs/mime/MimeAnalyzer.php#L425
|
|
345
|
+
KNOWN_FILE_EXTENSIONS = {
|
|
346
|
+
'bmp',
|
|
347
|
+
'djvu',
|
|
348
|
+
'gif',
|
|
349
|
+
'iff',
|
|
350
|
+
'jb2',
|
|
351
|
+
'jp2',
|
|
352
|
+
'jpc',
|
|
353
|
+
'jpeg',
|
|
354
|
+
'jpg',
|
|
355
|
+
'jpx',
|
|
356
|
+
'mid',
|
|
357
|
+
'mka',
|
|
358
|
+
'mkv',
|
|
359
|
+
'mp3',
|
|
360
|
+
'oga',
|
|
361
|
+
'ogg',
|
|
362
|
+
'ogv',
|
|
363
|
+
'ogx',
|
|
364
|
+
'opus',
|
|
365
|
+
'pdf',
|
|
366
|
+
'png',
|
|
367
|
+
'psd',
|
|
368
|
+
'spx',
|
|
369
|
+
'stl',
|
|
370
|
+
'svg',
|
|
371
|
+
'swc',
|
|
372
|
+
'swf',
|
|
373
|
+
'tif',
|
|
374
|
+
'tiff',
|
|
375
|
+
'wbmp',
|
|
376
|
+
'webm',
|
|
377
|
+
'webp',
|
|
378
|
+
'wmf',
|
|
379
|
+
'xbm',
|
|
380
|
+
'xcf',
|
|
381
|
+
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
from typing import List, Optional, Tuple
|
|
2
2
|
|
|
3
3
|
from ._wikitext import WS, SubWikiText
|
|
4
4
|
|
|
@@ -97,12 +97,12 @@ class Parameter(SubWikiText):
|
|
|
97
97
|
len('{{{' + name + '|') : len(
|
|
98
98
|
'{{{' + name + '|' + innermost_default
|
|
99
99
|
)
|
|
100
|
-
] =
|
|
100
|
+
] = '{{{' + new_default_name + '|' + innermost_default + '}}}'
|
|
101
101
|
|
|
102
102
|
@property
|
|
103
103
|
def parameters(self) -> List['Parameter']:
|
|
104
104
|
return super().parameters[1:]
|
|
105
105
|
|
|
106
106
|
@property
|
|
107
|
-
def
|
|
107
|
+
def _content_span(self) -> Tuple[int, int]:
|
|
108
108
|
return 3, -3
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from bisect import insort
|
|
2
|
-
from typing import Iterable, List, Union
|
|
2
|
+
from typing import Iterable, List, Tuple, Union
|
|
3
3
|
|
|
4
4
|
from ._argument import Argument
|
|
5
5
|
from ._wikilist import WikiList
|
|
@@ -19,6 +19,10 @@ class SubWikiTextWithArgs(SubWikiText):
|
|
|
19
19
|
_name_args_matcher = NotImplemented
|
|
20
20
|
_first_arg_sep = 0
|
|
21
21
|
|
|
22
|
+
@property
|
|
23
|
+
def _content_span(self) -> Tuple[int, int]:
|
|
24
|
+
return 2, -2
|
|
25
|
+
|
|
22
26
|
@property
|
|
23
27
|
def nesting_level(self) -> int:
|
|
24
28
|
"""Return the nesting level of self.
|
|
@@ -95,10 +99,6 @@ class SubWikiTextWithArgs(SubWikiText):
|
|
|
95
99
|
def name(self, newname: str) -> None:
|
|
96
100
|
self[2 : 2 + len(self.name)] = newname
|
|
97
101
|
|
|
98
|
-
@property
|
|
99
|
-
def _relative_contents_end(self) -> tuple:
|
|
100
|
-
return 2, -2
|
|
101
|
-
|
|
102
102
|
|
|
103
103
|
class ParserFunction(SubWikiTextWithArgs):
|
|
104
104
|
__slots__ = ()
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
from typing import Optional
|
|
2
2
|
|
|
3
3
|
from ._wikitext import SubWikiText, rc
|
|
4
4
|
|
|
@@ -70,7 +70,7 @@ class Section(SubWikiText):
|
|
|
70
70
|
if m is None:
|
|
71
71
|
raise RuntimeError(
|
|
72
72
|
"Can't set title for a lead section. "
|
|
73
|
-
|
|
73
|
+
'Try adding it to contents.'
|
|
74
74
|
)
|
|
75
75
|
self[m.start(2) : m.end(2)] = value
|
|
76
76
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
"""Define the functions required for parsing wikitext into spans."""
|
|
2
2
|
from functools import partial
|
|
3
3
|
from typing import Callable, Dict, Optional
|
|
4
4
|
|
|
@@ -27,7 +27,9 @@ PF_TL_FINDITER = rc(
|
|
|
27
27
|
rb'[\s\0]*+'
|
|
28
28
|
rb'(?>'
|
|
29
29
|
rb'\#[^{}\s:|]++' # parser function
|
|
30
|
-
rb'|'
|
|
30
|
+
rb'|'
|
|
31
|
+
+ regex_pattern(_parser_functions)[3:] # )
|
|
32
|
+
+
|
|
31
33
|
# should not have any arguments or the arg should start with a :
|
|
32
34
|
rb'(?:'
|
|
33
35
|
rb':(?>[^{}]*+|}(?!})|{(?!{))*+'
|
|
@@ -92,9 +94,9 @@ WIKILINK_PARAM_FINDITER = rc(
|
|
|
92
94
|
REVERSE,
|
|
93
95
|
).finditer
|
|
94
96
|
|
|
95
|
-
MARKUP = b''.maketrans(b"
|
|
96
|
-
BRACES_PIPE_NEWLINE = b''.maketrans(b
|
|
97
|
-
BRACKETS = b''.maketrans(b
|
|
97
|
+
MARKUP = b''.maketrans(b"=|[]'{}", b'\1_\2\3___')
|
|
98
|
+
BRACES_PIPE_NEWLINE = b''.maketrans(b'|{}\n', b'____')
|
|
99
|
+
BRACKETS = b''.maketrans(b'[]', b'__')
|
|
98
100
|
|
|
99
101
|
PARSABLE_TAG_EXTENSION_NAME = regex_pattern(_parsable_tag_extensions)
|
|
100
102
|
UNPARSABLE_TAG_EXTENSION_NAME = regex_pattern(_unparsable_tag_extensions)
|
|
@@ -145,7 +147,7 @@ SPACE_CHARS = rb' \t\n\u000C\r\0' # \s - \v
|
|
|
145
147
|
CONTROL_CHARS = rb'\x00-\x1f\x7f-\x9f'
|
|
146
148
|
# https://www.w3.org/TR/html5/syntax.html#syntax-attributes
|
|
147
149
|
ATTR_NAME = rb'(?<attr_name>[^' + SPACE_CHARS + CONTROL_CHARS + rb'"\'>/=]++)'
|
|
148
|
-
EQ_WS = rb'
|
|
150
|
+
EQ_WS = rb'[=\1][' + SPACE_CHARS + rb']*+'
|
|
149
151
|
UNQUOTED_ATTR_VAL = rb'(?<attr_value>[^' + SPACE_CHARS + rb'"\'=<>`]++)'
|
|
150
152
|
QUOTED_ATTR_VAL = rb'(?<quote>[\'"])(?<attr_value>.*?)(?P=quote)'
|
|
151
153
|
# May include character references, but for now, ignore the fact that they
|
|
@@ -169,7 +171,12 @@ ATTR_VAL = (
|
|
|
169
171
|
# Ignore ambiguous ampersand for the sake of simplicity.
|
|
170
172
|
ATTRS_PATTERN = (
|
|
171
173
|
rb'(?<attr>'
|
|
172
|
-
rb'['
|
|
174
|
+
rb'['
|
|
175
|
+
+ SPACE_CHARS
|
|
176
|
+
+ rb']*+(?>'
|
|
177
|
+
+ ATTR_NAME
|
|
178
|
+
+ ATTR_VAL
|
|
179
|
+
+ rb')'
|
|
173
180
|
# See https://stackoverflow.com/a/3558200/2705757 for how HTML5
|
|
174
181
|
# treats self-closing marks.
|
|
175
182
|
+ rb'|[^>]++'
|
|
@@ -272,39 +279,42 @@ def extract_tag_extensions(
|
|
|
272
279
|
cms_append([s, e, None, byte_array[s:e]])
|
|
273
280
|
byte_array[s:e] = b'\0' * (e - s)
|
|
274
281
|
continue
|
|
282
|
+
|
|
275
283
|
s, e = span('u') # unparsable
|
|
276
284
|
if s != -1:
|
|
277
285
|
s -= 1 # <
|
|
278
286
|
ets_append([s, e, match, byte_array[s:e]])
|
|
279
287
|
byte_array[s:e] = (e - s) * b'_'
|
|
280
288
|
continue
|
|
289
|
+
|
|
281
290
|
s, e = span('p') # parsable
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
291
|
+
s -= 1 # <
|
|
292
|
+
ets_append([s, e, match, byte_array[s:e]])
|
|
293
|
+
cs, ce = span('c') # content
|
|
294
|
+
extract_tag_extensions(
|
|
295
|
+
byte_array,
|
|
296
|
+
ets_append,
|
|
297
|
+
cms_append,
|
|
298
|
+
cs,
|
|
299
|
+
ce,
|
|
300
|
+
pms_append,
|
|
301
|
+
pfs_append,
|
|
302
|
+
tls_append,
|
|
303
|
+
wls_append,
|
|
304
|
+
)
|
|
305
|
+
_parse_sub_spans(
|
|
306
|
+
byte_array,
|
|
307
|
+
s,
|
|
308
|
+
e,
|
|
309
|
+
pms_append,
|
|
310
|
+
pfs_append,
|
|
311
|
+
tls_append,
|
|
312
|
+
wls_append,
|
|
313
|
+
)
|
|
314
|
+
# Parsable extension tags are not nested but they create a separate
|
|
315
|
+
# environment for bolds, italics, and tables.
|
|
316
|
+
# Also equal signs are not name-value separators in arguments.
|
|
317
|
+
byte_array[s:e] = byte_array[s:e].translate(MARKUP)
|
|
308
318
|
|
|
309
319
|
|
|
310
320
|
def _parse_sub_spans(
|
|
@@ -316,9 +326,10 @@ def _parse_sub_spans(
|
|
|
316
326
|
tls_append: Callable,
|
|
317
327
|
wls_append: Callable,
|
|
318
328
|
) -> None:
|
|
319
|
-
start_and_end_tags =
|
|
320
|
-
byte_array, start, end
|
|
321
|
-
|
|
329
|
+
start_and_end_tags = (
|
|
330
|
+
*HTML_START_TAG_FINDITER(byte_array, start, end),
|
|
331
|
+
*HTML_END_TAG_FINDITER(byte_array, start, end),
|
|
332
|
+
)
|
|
322
333
|
for match in start_and_end_tags:
|
|
323
334
|
ms, me = match.span()
|
|
324
335
|
byte_array[ms:me] = byte_array[ms:me].translate(BRACKETS)
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
from bisect import insort_right
|
|
2
2
|
from collections.abc import Mapping
|
|
3
3
|
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
|
|
4
4
|
|
|
@@ -48,7 +48,7 @@ HEAD_DIGITS = rc(rb'\s*+\d+').match
|
|
|
48
48
|
# Captions are optional and only one should be placed between table-start
|
|
49
49
|
# and the first row. Other captions are not part of the table and will
|
|
50
50
|
# be ignored.
|
|
51
|
-
FIRST_NON_CAPTION_LINE = rc(
|
|
51
|
+
FIRST_NON_CAPTION_LINE = rc(rb'\n[\t \0]*+(\|(?!\+)|!)|\Z').search
|
|
52
52
|
|
|
53
53
|
|
|
54
54
|
def head_int(value):
|
|
@@ -399,7 +399,7 @@ def _apply_attr_spans(
|
|
|
399
399
|
# if not table_data:
|
|
400
400
|
# return table_data
|
|
401
401
|
# 11
|
|
402
|
-
downward_growing_cells
|
|
402
|
+
downward_growing_cells: List[Tuple[Optional[T], int, int]] = []
|
|
403
403
|
# 13, 18
|
|
404
404
|
# Algorithm for processing rows
|
|
405
405
|
for attrs_row, row in zip(table_attrs, table_data):
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
|
|
1
|
+
"""Define the Tag class and tag-related regular expressions.
|
|
2
2
|
|
|
3
3
|
Unlike MediaWiki which has very strict HTML rules, regexes
|
|
4
4
|
defined in this module don't follow those restrictions and allow finding
|
|
@@ -8,7 +8,7 @@ For more info see:
|
|
|
8
8
|
* https://www.mediawiki.org/wiki/HTML_restriction
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
|
-
from typing import Any, Dict, List, Optional
|
|
11
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
12
12
|
|
|
13
13
|
from regex import DOTALL, VERBOSE
|
|
14
14
|
|
|
@@ -28,18 +28,18 @@ from ._wikitext import SubWikiText, rc
|
|
|
28
28
|
# ).finditer
|
|
29
29
|
# Note that the following regex won't check for nested tags
|
|
30
30
|
TAG_FULLMATCH = rc(
|
|
31
|
-
rb
|
|
32
|
-
<(?<name>[A-Za-z0-9]++)
|
|
31
|
+
rb"""
|
|
32
|
+
<(?<name>[A-Za-z0-9]++)"""
|
|
33
33
|
+ ATTRS_PATTERN
|
|
34
|
-
+ rb
|
|
35
|
-
[
|
|
34
|
+
+ rb"""
|
|
35
|
+
["""
|
|
36
36
|
+ SPACE_CHARS
|
|
37
|
-
+ rb
|
|
37
|
+
+ rb"""]*+
|
|
38
38
|
(?>
|
|
39
|
-
>(?<contents>.*)
|
|
39
|
+
>(?<contents>.*)"""
|
|
40
40
|
+ END_TAG_PATTERN.replace(rb'{name}', rb'(?<end_name>[A-Za-z0-9]++)')
|
|
41
|
-
+ rb
|
|
42
|
-
)
|
|
41
|
+
+ rb"""|> # only start; no end tag; could be self-closing
|
|
42
|
+
)""",
|
|
43
43
|
DOTALL | VERBOSE,
|
|
44
44
|
).fullmatch
|
|
45
45
|
|
|
@@ -215,5 +215,6 @@ class Tag(SubWikiTextWithAttrs):
|
|
|
215
215
|
return super().get_tags(name)[1:]
|
|
216
216
|
|
|
217
217
|
@property
|
|
218
|
-
def
|
|
219
|
-
|
|
218
|
+
def _content_span(self) -> Tuple[int, int]:
|
|
219
|
+
s = self.string
|
|
220
|
+
return s.find('>') + 1, s.rfind('<')
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
|
|
1
|
+
"""Define the WikiLink class."""
|
|
2
2
|
|
|
3
3
|
|
|
4
|
-
from typing import List, Optional
|
|
4
|
+
from typing import List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from regex import DOTALL
|
|
7
7
|
|
|
@@ -22,6 +22,13 @@ FULLMATCH = rc(
|
|
|
22
22
|
class WikiLink(SubWikiText):
|
|
23
23
|
__slots__ = '_cached_match'
|
|
24
24
|
|
|
25
|
+
@property
|
|
26
|
+
def _content_span(self) -> Tuple[int, int]:
|
|
27
|
+
s = self.string
|
|
28
|
+
f = s.find
|
|
29
|
+
rf = s.rfind
|
|
30
|
+
return f('[', f('[') + 1) + 1, rf(']', None, rf(']'))
|
|
31
|
+
|
|
25
32
|
@property
|
|
26
33
|
def _match(self):
|
|
27
34
|
shadow = self._shadow
|
|
@@ -141,7 +148,3 @@ class WikiLink(SubWikiText):
|
|
|
141
148
|
@property
|
|
142
149
|
def wikilinks(self) -> List['WikiLink']:
|
|
143
150
|
return super().wikilinks[1:]
|
|
144
|
-
|
|
145
|
-
@property
|
|
146
|
-
def _relative_contents_end(self) -> tuple:
|
|
147
|
-
return self._match.span(4)
|