wikitextparser 0.55.11__tar.gz → 0.55.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/CHANGELOG.rst +8 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/PKG-INFO +1 -1
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/__init__.py +1 -1
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_parser_function.py +1 -4
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_section.py +2 -2
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_wikitext.py +76 -77
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/.coveragerc +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/.github/workflows/tests.yml +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/.gitignore +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/.readthedocs.yaml +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/LICENSE.md +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/README.rst +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/CHANGELOG.rst +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/Makefile +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/README.rst +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/conf.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/index.rst +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/docs/make.bat +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/pyproject.toml +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_argument.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_cell.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_comment_bold_italic.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_config.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_externallink.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_parameter.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_spans.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_table.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_tag.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_template.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_wikilink.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.13}/wikitextparser/_wikilist.py +0 -0
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
v0.55.13
|
|
2
|
+
--------
|
|
3
|
+
* Fixed a bug in ``Section.level`` resulting in malformed section titles when multiple levels are added. (#135)
|
|
4
|
+
|
|
5
|
+
v0.55.12
|
|
6
|
+
--------
|
|
7
|
+
* Performance improvements in extracting bold and italic nodes. (#133)
|
|
8
|
+
|
|
1
9
|
v0.55.11
|
|
2
10
|
--------
|
|
3
11
|
* Performance improvements in ``__setitem__``/``__delitem__`` and ``pformat``/``plain_text`` methods. (#131)
|
|
@@ -60,10 +60,7 @@ class SubWikiTextWithArgs(SubWikiText):
|
|
|
60
60
|
else:
|
|
61
61
|
arg_span = old_span
|
|
62
62
|
arg = Argument(lststr, type_to_spans, arg_span, type_, self)
|
|
63
|
-
arg.
|
|
64
|
-
lststr[0][s:e],
|
|
65
|
-
shadow[arg_self_start:arg_self_end],
|
|
66
|
-
)
|
|
63
|
+
arg._span_data[3] = shadow[arg_self_start:arg_self_end]
|
|
67
64
|
arguments_append(arg)
|
|
68
65
|
return arguments
|
|
69
66
|
|
|
@@ -42,9 +42,9 @@ class Section(SubWikiText):
|
|
|
42
42
|
if level_diff == 0:
|
|
43
43
|
return
|
|
44
44
|
if level_diff < 0:
|
|
45
|
-
new_equals = '=' *
|
|
45
|
+
new_equals = '=' * -level_diff
|
|
46
46
|
self.insert(0, new_equals)
|
|
47
|
-
self.insert(m.end(2)
|
|
47
|
+
self.insert(m.end(2) - level_diff, new_equals)
|
|
48
48
|
return
|
|
49
49
|
del self[:level_diff]
|
|
50
50
|
del self[m.end(2) : m.end(2) + level_diff]
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from bisect import bisect_left, bisect_right, insort_right
|
|
2
|
-
from copy import deepcopy
|
|
3
2
|
from html import unescape
|
|
4
3
|
from itertools import compress, islice
|
|
5
4
|
from operator import attrgetter
|
|
@@ -41,7 +40,6 @@ from ._spans import (
|
|
|
41
40
|
END_TAG_PATTERN,
|
|
42
41
|
EXTERNAL_LINK_URL_TAIL,
|
|
43
42
|
INVALID_URL_CHARS,
|
|
44
|
-
PARSABLE_TAG_EXTENSION_NAME,
|
|
45
43
|
START_TAG_PATTERN,
|
|
46
44
|
parse_to_spans,
|
|
47
45
|
rc,
|
|
@@ -53,10 +51,6 @@ NAME_CAPTURING_HTML_START_TAG_FINDITER = rc(
|
|
|
53
51
|
)
|
|
54
52
|
).finditer
|
|
55
53
|
|
|
56
|
-
PARSABLE_TAG_EXTENSIONS_MATCH = rc(
|
|
57
|
-
rb'<' + PARSABLE_TAG_EXTENSION_NAME + rb'\b', IGNORECASE
|
|
58
|
-
).match
|
|
59
|
-
|
|
60
54
|
# External links
|
|
61
55
|
BRACKET_EXTERNAL_LINK_SCHEMES = regex_pattern(
|
|
62
56
|
_bare_external_link_schemes | {'//'}
|
|
@@ -114,10 +108,7 @@ TABLE_FINDITER = rc(
|
|
|
114
108
|
DOTALL | MULTILINE | VERBOSE,
|
|
115
109
|
).finditer
|
|
116
110
|
|
|
117
|
-
|
|
118
|
-
rb"""((?>'\0*)*?)'\0*+'\0*+('\0*+('\0*+')?+)?+(?=[^']|$)|($)""",
|
|
119
|
-
MULTILINE | VERBOSE,
|
|
120
|
-
).finditer
|
|
111
|
+
substitute_apostrophes = rc(rb"('\0*+){2,}+(?=[^']|$)", MULTILINE).sub
|
|
121
112
|
|
|
122
113
|
BOLD_FINDITER = rc(
|
|
123
114
|
rb"""
|
|
@@ -213,7 +204,7 @@ class WikiText:
|
|
|
213
204
|
# The following class attribute acts as a default value.
|
|
214
205
|
_type = 'WikiText'
|
|
215
206
|
|
|
216
|
-
__slots__ = '_type_to_spans', '_lststr', '_span_data'
|
|
207
|
+
__slots__ = '_type_to_spans', '_lststr', '_span_data'
|
|
217
208
|
|
|
218
209
|
def __init__(
|
|
219
210
|
self,
|
|
@@ -241,7 +232,6 @@ class WikiText:
|
|
|
241
232
|
if _type not in SPAN_PARSER_TYPES:
|
|
242
233
|
type_to_spans = self._type_to_spans = parse_to_spans(byte_array)
|
|
243
234
|
type_to_spans[_type] = [span]
|
|
244
|
-
self._shadow_cache = string, byte_array
|
|
245
235
|
else:
|
|
246
236
|
# In SPAN_PARSER_TYPES, we can't pass the original byte_array to
|
|
247
237
|
# parser to generate the shadow because it will replace the whole
|
|
@@ -259,7 +249,6 @@ class WikiText:
|
|
|
259
249
|
byte_array[0] = 3
|
|
260
250
|
byte_array[-1] = 32
|
|
261
251
|
type_to_spans = parse_to_spans(byte_array)
|
|
262
|
-
self._shadow_cache = string, byte_array
|
|
263
252
|
type_to_spans[_type].insert(0, span)
|
|
264
253
|
self._type_to_spans = type_to_spans
|
|
265
254
|
if type(self) is Parameter:
|
|
@@ -621,15 +610,12 @@ class WikiText:
|
|
|
621
610
|
self.string.
|
|
622
611
|
"""
|
|
623
612
|
ss, se, _, _ = self._span_data
|
|
624
|
-
if ss == 0 and se == len(self._lststr[0]):
|
|
625
|
-
return deepcopy(self._type_to_spans)
|
|
626
613
|
return {
|
|
627
614
|
type_: [
|
|
628
615
|
[s - ss, e - ss, m, ba[:] if ba is not None else None]
|
|
629
616
|
for s, e, m, ba in spans[
|
|
630
|
-
|
|
617
|
+
bisect_right(spans, [ss]) : bisect_right(spans, [se])
|
|
631
618
|
]
|
|
632
|
-
if e <= se
|
|
633
619
|
]
|
|
634
620
|
for type_, spans in self._type_to_spans.items()
|
|
635
621
|
}
|
|
@@ -1012,66 +998,81 @@ class WikiText:
|
|
|
1012
998
|
]
|
|
1013
999
|
|
|
1014
1000
|
@property
|
|
1015
|
-
def _balanced_quotes_shadow(self):
|
|
1016
|
-
"""Return
|
|
1001
|
+
def _balanced_quotes_shadow(self) -> bytearray:
|
|
1002
|
+
"""Return a byte array with non-markup-apostrophes removed.
|
|
1017
1003
|
|
|
1018
1004
|
The comments at /includes/parser/Parser.php:doQuotes are helpful:
|
|
1019
1005
|
https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
|
|
1020
1006
|
https://phabricator.wikimedia.org/T15227#178834
|
|
1021
1007
|
"""
|
|
1022
|
-
|
|
1008
|
+
bold_starts: List[int] = []
|
|
1023
1009
|
odd_italics = False
|
|
1024
1010
|
odd_bold_italics = False
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
first_multi_letter_word
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1011
|
+
append_bold_start = bold_starts.append
|
|
1012
|
+
|
|
1013
|
+
def process_line(line: bytes) -> bytes:
|
|
1014
|
+
nonlocal odd_italics, odd_bold_italics
|
|
1015
|
+
if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
|
|
1016
|
+
# one of the bold marks needs to be interpreted as italic
|
|
1017
|
+
first_multi_letter_word = first_space = None
|
|
1018
|
+
for s in bold_starts:
|
|
1019
|
+
if line[s - 1] == 32: # space
|
|
1020
|
+
if first_space is None:
|
|
1021
|
+
first_space = s
|
|
1022
|
+
continue
|
|
1023
|
+
if line[s - 2] == 32: # space
|
|
1024
|
+
line = line[:s] + b' ' + line[s + 1 :]
|
|
1025
|
+
break # first_single_letter_word
|
|
1026
|
+
if first_multi_letter_word is None:
|
|
1027
|
+
first_multi_letter_word = s
|
|
1028
|
+
continue
|
|
1029
|
+
else: # there was no first_single_letter_word
|
|
1030
|
+
if first_multi_letter_word is not None:
|
|
1031
|
+
line = (
|
|
1032
|
+
line[:first_multi_letter_word]
|
|
1033
|
+
+ b'_'
|
|
1034
|
+
+ line[first_multi_letter_word + 1 :]
|
|
1035
|
+
)
|
|
1036
|
+
elif first_space is not None:
|
|
1037
|
+
line = (
|
|
1038
|
+
line[:first_space] + b'_' + line[first_space + 1 :]
|
|
1039
|
+
)
|
|
1040
|
+
# reset state for the next line
|
|
1041
|
+
bold_starts.clear()
|
|
1042
|
+
odd_italics = False
|
|
1043
|
+
odd_bold_italics = False
|
|
1044
|
+
return line
|
|
1045
|
+
|
|
1046
|
+
def process_apostrophes(m) -> bytes:
|
|
1047
|
+
nonlocal odd_italics, odd_bold_italics
|
|
1048
|
+
starts = m.starts(1)
|
|
1049
|
+
n = len(starts)
|
|
1050
|
+
if n == 2: # italic
|
|
1059
1051
|
odd_italics ^= True
|
|
1060
|
-
|
|
1061
|
-
if
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1052
|
+
return m[0]
|
|
1053
|
+
if n == 3: # bold
|
|
1054
|
+
append_bold_start(starts[0])
|
|
1055
|
+
return m[0]
|
|
1056
|
+
if n == 5:
|
|
1057
|
+
odd_bold_italics ^= True
|
|
1058
|
+
odd_italics ^= True
|
|
1059
|
+
return m[0]
|
|
1060
|
+
if n == 4: # four apostrophes -> hide the first one
|
|
1061
|
+
s = starts[1]
|
|
1062
|
+
append_bold_start(s)
|
|
1063
|
+
return b'_' * (s - starts[0]) + m.string[s : m.end()]
|
|
1064
|
+
# more than 5 apostrophes -> hide the prior ones
|
|
1072
1065
|
odd_bold_italics ^= True
|
|
1073
1066
|
odd_italics ^= True
|
|
1074
|
-
|
|
1067
|
+
s = starts[-5]
|
|
1068
|
+
return b'_' * (s - starts[0]) + m.string[s : m.end()]
|
|
1069
|
+
|
|
1070
|
+
return bytearray(b'\n').join(
|
|
1071
|
+
[
|
|
1072
|
+
process_line(substitute_apostrophes(process_apostrophes, line))
|
|
1073
|
+
for line in self._shadow.splitlines()
|
|
1074
|
+
]
|
|
1075
|
+
)
|
|
1075
1076
|
|
|
1076
1077
|
def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
|
|
1077
1078
|
for prop in (
|
|
@@ -1123,8 +1124,8 @@ class WikiText:
|
|
|
1123
1124
|
bold_spans = tts_setdefault('Bold', [])
|
|
1124
1125
|
get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
|
|
1125
1126
|
bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
|
|
1126
|
-
for
|
|
1127
|
-
ms, me =
|
|
1127
|
+
for m in bold_matches:
|
|
1128
|
+
ms, me = m.span()
|
|
1128
1129
|
b, e = s + ms, s + me
|
|
1129
1130
|
old_span = get_old_bold_span((b, e))
|
|
1130
1131
|
if old_span is None:
|
|
@@ -1146,16 +1147,16 @@ class WikiText:
|
|
|
1146
1147
|
# filter_cls is None or filter_cls is Italic
|
|
1147
1148
|
|
|
1148
1149
|
# remove bold tokens before searching for italics
|
|
1149
|
-
for
|
|
1150
|
-
ms, me =
|
|
1151
|
-
cs, ce =
|
|
1150
|
+
for m in bold_matches:
|
|
1151
|
+
ms, me = m.span()
|
|
1152
|
+
cs, ce = m.span(1) # content
|
|
1152
1153
|
balanced_shadow[ms:cs] = b'_' * (cs - ms)
|
|
1153
1154
|
balanced_shadow[ce:me] = b'_' * (me - ce)
|
|
1154
1155
|
|
|
1155
1156
|
italic_spans = tts_setdefault('Italic', [])
|
|
1156
1157
|
get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
|
|
1157
|
-
for
|
|
1158
|
-
ms, me =
|
|
1158
|
+
for m in ITALIC_FINDITER(balanced_shadow, rs, re):
|
|
1159
|
+
ms, me = m.span()
|
|
1159
1160
|
b, e = span = s + ms, s + me
|
|
1160
1161
|
old_span = get_old_italic_span(span)
|
|
1161
1162
|
if old_span is None:
|
|
@@ -1164,9 +1165,7 @@ class WikiText:
|
|
|
1164
1165
|
else:
|
|
1165
1166
|
span = old_span
|
|
1166
1167
|
append(
|
|
1167
|
-
Italic(
|
|
1168
|
-
_lststr, type_to_spans, span, 'Bold', me != match.end(1)
|
|
1169
|
-
)
|
|
1168
|
+
Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
|
|
1170
1169
|
)
|
|
1171
1170
|
if recursive and filter_cls is Italic:
|
|
1172
1171
|
self._bolds_italics_recurse(result, filter_cls)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|