wikitextparser 0.55.11__tar.gz → 0.55.12__tar.gz
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/CHANGELOG.rst +4 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/PKG-INFO +1 -1
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/__init__.py +1 -1
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_wikitext.py +81 -70
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/.coveragerc +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/.github/workflows/tests.yml +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/.gitignore +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/.readthedocs.yaml +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/LICENSE.md +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/README.rst +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/CHANGELOG.rst +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/Makefile +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/README.rst +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/conf.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/index.rst +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/docs/make.bat +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/pyproject.toml +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_argument.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_cell.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_comment_bold_italic.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_config.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_externallink.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_parameter.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_parser_function.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_section.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_spans.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_table.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_tag.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_template.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_wikilink.py +0 -0
- {wikitextparser-0.55.11 → wikitextparser-0.55.12}/wikitextparser/_wikilist.py +0 -0
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from bisect import bisect_left, bisect_right, insort_right
|
|
2
|
-
from copy import deepcopy
|
|
3
2
|
from html import unescape
|
|
4
3
|
from itertools import compress, islice
|
|
5
4
|
from operator import attrgetter
|
|
@@ -114,10 +113,11 @@ TABLE_FINDITER = rc(
|
|
|
114
113
|
DOTALL | MULTILINE | VERBOSE,
|
|
115
114
|
).finditer
|
|
116
115
|
|
|
117
|
-
|
|
118
|
-
rb"
|
|
116
|
+
substitute_apostrophes = rc( # bold-italic, bold, or italic tokens
|
|
117
|
+
rb"('\0*+){2,}+(?=[^']|$)",
|
|
119
118
|
MULTILINE | VERBOSE,
|
|
120
|
-
).
|
|
119
|
+
).sub
|
|
120
|
+
find_lines = rc(rb'(.*?)$').finditer
|
|
121
121
|
|
|
122
122
|
BOLD_FINDITER = rc(
|
|
123
123
|
rb"""
|
|
@@ -621,15 +621,12 @@ class WikiText:
|
|
|
621
621
|
self.string.
|
|
622
622
|
"""
|
|
623
623
|
ss, se, _, _ = self._span_data
|
|
624
|
-
if ss == 0 and se == len(self._lststr[0]):
|
|
625
|
-
return deepcopy(self._type_to_spans)
|
|
626
624
|
return {
|
|
627
625
|
type_: [
|
|
628
626
|
[s - ss, e - ss, m, ba[:] if ba is not None else None]
|
|
629
627
|
for s, e, m, ba in spans[
|
|
630
|
-
|
|
628
|
+
bisect_right(spans, [ss]) : bisect_right(spans, [se])
|
|
631
629
|
]
|
|
632
|
-
if e <= se
|
|
633
630
|
]
|
|
634
631
|
for type_, spans in self._type_to_spans.items()
|
|
635
632
|
}
|
|
@@ -1012,66 +1009,82 @@ class WikiText:
|
|
|
1012
1009
|
]
|
|
1013
1010
|
|
|
1014
1011
|
@property
|
|
1015
|
-
def _balanced_quotes_shadow(self):
|
|
1016
|
-
"""Return
|
|
1012
|
+
def _balanced_quotes_shadow(self) -> bytearray:
|
|
1013
|
+
"""Return a byte array with non-markup-apostrophes removed.
|
|
1017
1014
|
|
|
1018
1015
|
The comments at /includes/parser/Parser.php:doQuotes are helpful:
|
|
1019
1016
|
https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
|
|
1020
1017
|
https://phabricator.wikimedia.org/T15227#178834
|
|
1021
1018
|
"""
|
|
1022
|
-
|
|
1019
|
+
bold_starts: List[int] = []
|
|
1023
1020
|
odd_italics = False
|
|
1024
1021
|
odd_bold_italics = False
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
first_multi_letter_word
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1022
|
+
append_bold_start = bold_starts.append
|
|
1023
|
+
|
|
1024
|
+
def process_line(line: bytes) -> bytes:
|
|
1025
|
+
nonlocal odd_italics, odd_bold_italics
|
|
1026
|
+
if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
|
|
1027
|
+
# one of the bold marks needs to be interpreted as italic
|
|
1028
|
+
first_multi_letter_word = first_space = None
|
|
1029
|
+
for s in bold_starts:
|
|
1030
|
+
if line[s - 1] == 32: # space
|
|
1031
|
+
if first_space is None:
|
|
1032
|
+
first_space = s
|
|
1033
|
+
continue
|
|
1034
|
+
if line[s - 2] == 32: # space
|
|
1035
|
+
line = line[:s] + b' ' + line[s + 1 :]
|
|
1036
|
+
break # first_single_letter_word
|
|
1037
|
+
if first_multi_letter_word is None:
|
|
1038
|
+
first_multi_letter_word = s
|
|
1039
|
+
continue
|
|
1040
|
+
else: # there was no first_single_letter_word
|
|
1041
|
+
if first_multi_letter_word is not None:
|
|
1042
|
+
line = (
|
|
1043
|
+
line[:first_multi_letter_word]
|
|
1044
|
+
+ b'_'
|
|
1045
|
+
+ line[first_multi_letter_word + 1 :]
|
|
1046
|
+
)
|
|
1047
|
+
elif first_space is not None:
|
|
1048
|
+
line = (
|
|
1049
|
+
line[:first_space] + b'_' + line[first_space + 1 :]
|
|
1050
|
+
)
|
|
1051
|
+
# reset state for the next line
|
|
1052
|
+
bold_starts.clear()
|
|
1053
|
+
odd_italics = False
|
|
1054
|
+
odd_bold_italics = False
|
|
1055
|
+
return line
|
|
1056
|
+
|
|
1057
|
+
def process_apostrophes(m) -> bytes:
|
|
1058
|
+
nonlocal odd_italics, odd_bold_italics
|
|
1059
|
+
starts = m.starts(1)
|
|
1060
|
+
n = len(starts)
|
|
1061
|
+
if n == 2: # italic
|
|
1059
1062
|
odd_italics ^= True
|
|
1060
|
-
|
|
1061
|
-
if
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1063
|
+
return m[0]
|
|
1064
|
+
if n == 3: # bold
|
|
1065
|
+
append_bold_start(starts[0])
|
|
1066
|
+
return m[0]
|
|
1067
|
+
if n == 5:
|
|
1068
|
+
odd_bold_italics ^= True
|
|
1069
|
+
odd_italics ^= True
|
|
1070
|
+
return m[0]
|
|
1071
|
+
if n == 4: # four apostrophes -> hide the first one
|
|
1072
|
+
s = starts[1]
|
|
1073
|
+
append_bold_start(s)
|
|
1074
|
+
return b'_' * (s - starts[0]) + m.string[s : m.end()]
|
|
1075
|
+
if n > 5: # more than 5 apostrophes -> hide the prior ones
|
|
1076
|
+
odd_bold_italics ^= True
|
|
1077
|
+
odd_italics ^= True
|
|
1078
|
+
s = starts[-5]
|
|
1079
|
+
return b'_' * (s - starts[0]) + m.string[s : m.end()]
|
|
1080
|
+
raise # execution should never reach here
|
|
1081
|
+
|
|
1082
|
+
return bytearray(b'\n').join(
|
|
1083
|
+
[
|
|
1084
|
+
process_line(substitute_apostrophes(process_apostrophes, line))
|
|
1085
|
+
for line in self._shadow.splitlines()
|
|
1086
|
+
]
|
|
1087
|
+
)
|
|
1075
1088
|
|
|
1076
1089
|
def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
|
|
1077
1090
|
for prop in (
|
|
@@ -1123,8 +1136,8 @@ class WikiText:
|
|
|
1123
1136
|
bold_spans = tts_setdefault('Bold', [])
|
|
1124
1137
|
get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
|
|
1125
1138
|
bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
|
|
1126
|
-
for
|
|
1127
|
-
ms, me =
|
|
1139
|
+
for m in bold_matches:
|
|
1140
|
+
ms, me = m.span()
|
|
1128
1141
|
b, e = s + ms, s + me
|
|
1129
1142
|
old_span = get_old_bold_span((b, e))
|
|
1130
1143
|
if old_span is None:
|
|
@@ -1146,16 +1159,16 @@ class WikiText:
|
|
|
1146
1159
|
# filter_cls is None or filter_cls is Italic
|
|
1147
1160
|
|
|
1148
1161
|
# remove bold tokens before searching for italics
|
|
1149
|
-
for
|
|
1150
|
-
ms, me =
|
|
1151
|
-
cs, ce =
|
|
1162
|
+
for m in bold_matches:
|
|
1163
|
+
ms, me = m.span()
|
|
1164
|
+
cs, ce = m.span(1) # content
|
|
1152
1165
|
balanced_shadow[ms:cs] = b'_' * (cs - ms)
|
|
1153
1166
|
balanced_shadow[ce:me] = b'_' * (me - ce)
|
|
1154
1167
|
|
|
1155
1168
|
italic_spans = tts_setdefault('Italic', [])
|
|
1156
1169
|
get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
|
|
1157
|
-
for
|
|
1158
|
-
ms, me =
|
|
1170
|
+
for m in ITALIC_FINDITER(balanced_shadow, rs, re):
|
|
1171
|
+
ms, me = m.span()
|
|
1159
1172
|
b, e = span = s + ms, s + me
|
|
1160
1173
|
old_span = get_old_italic_span(span)
|
|
1161
1174
|
if old_span is None:
|
|
@@ -1164,9 +1177,7 @@ class WikiText:
|
|
|
1164
1177
|
else:
|
|
1165
1178
|
span = old_span
|
|
1166
1179
|
append(
|
|
1167
|
-
Italic(
|
|
1168
|
-
_lststr, type_to_spans, span, 'Bold', me != match.end(1)
|
|
1169
|
-
)
|
|
1180
|
+
Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
|
|
1170
1181
|
)
|
|
1171
1182
|
if recursive and filter_cls is Italic:
|
|
1172
1183
|
self._bolds_italics_recurse(result, filter_cls)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|