wikitextparser 0.55.10__tar.gz → 0.55.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/CHANGELOG.rst +8 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/PKG-INFO +1 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/pyproject.toml +3 -3
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/__init__.py +1 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_argument.py +0 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_config.py +0 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_parser_function.py +0 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_spans.py +1 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_table.py +3 -3
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_tag.py +0 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_template.py +0 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikilink.py +0 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikilist.py +0 -1
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_wikitext.py +87 -74
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/.coveragerc +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/.github/workflows/tests.yml +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/.gitignore +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/.readthedocs.yaml +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/LICENSE.md +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/README.rst +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/CHANGELOG.rst +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/Makefile +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/README.rst +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/conf.py +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/index.rst +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/docs/make.bat +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_cell.py +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_comment_bold_italic.py +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_externallink.py +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_parameter.py +0 -0
- {wikitextparser-0.55.10 → wikitextparser-0.55.12}/wikitextparser/_section.py +0 -0
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
v0.55.12
|
|
2
|
+
--------
|
|
3
|
+
* Performance improvements in extracting bold and italic nodes. (#133)
|
|
4
|
+
|
|
5
|
+
v0.55.11
|
|
6
|
+
--------
|
|
7
|
+
* Performance improvements in ``__setitem__``/``__delitem__`` and ``pformat``/``plain_text`` methods. (#131)
|
|
8
|
+
|
|
1
9
|
v0.55.10
|
|
2
10
|
--------
|
|
3
11
|
* Fixed a bug in ``plain_text`` causing ``IndexError`` when using a custom function to replace ``templates``/``parser_functions``.
|
|
@@ -48,12 +48,12 @@ exclude = ['tests/', 'doc/', 'dev/']
|
|
|
48
48
|
[tool.ruff]
|
|
49
49
|
line-length = 79
|
|
50
50
|
format.quote-style = 'single'
|
|
51
|
-
isort.combine-as-imports = true
|
|
52
|
-
extend-select = [
|
|
51
|
+
lint.isort.combine-as-imports = true
|
|
52
|
+
lint.extend-select = [
|
|
53
53
|
'I', # isort
|
|
54
54
|
'UP', # pyupgrade
|
|
55
55
|
]
|
|
56
|
-
ignore = [
|
|
56
|
+
lint.ignore = [
|
|
57
57
|
'UP027', # list comprehensions are faster than generator expressions
|
|
58
58
|
'E721', # Do not compare types, use `isinstance()`
|
|
59
59
|
]
|
|
@@ -296,9 +296,9 @@ class Table(SubWikiTextWithAttrs):
|
|
|
296
296
|
m = CAPTION_MATCH(shadow)
|
|
297
297
|
if m:
|
|
298
298
|
s = m.end('attrs')
|
|
299
|
-
self[
|
|
300
|
-
|
|
301
|
-
|
|
299
|
+
self[s if s != -1 else m.end('preattrs') : m.end('caption')] = (
|
|
300
|
+
newcaption
|
|
301
|
+
)
|
|
302
302
|
return
|
|
303
303
|
# There is no caption. Create one.
|
|
304
304
|
h, s, t = shadow.partition(b'\n')
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from bisect import bisect_left, bisect_right, insort_right
|
|
2
|
-
from copy import deepcopy
|
|
3
2
|
from html import unescape
|
|
4
3
|
from itertools import compress, islice
|
|
5
4
|
from operator import attrgetter
|
|
@@ -114,10 +113,11 @@ TABLE_FINDITER = rc(
|
|
|
114
113
|
DOTALL | MULTILINE | VERBOSE,
|
|
115
114
|
).finditer
|
|
116
115
|
|
|
117
|
-
|
|
118
|
-
rb"
|
|
116
|
+
substitute_apostrophes = rc( # bold-italic, bold, or italic tokens
|
|
117
|
+
rb"('\0*+){2,}+(?=[^']|$)",
|
|
119
118
|
MULTILINE | VERBOSE,
|
|
120
|
-
).
|
|
119
|
+
).sub
|
|
120
|
+
find_lines = rc(rb'(.*?)$').finditer
|
|
121
121
|
|
|
122
122
|
BOLD_FINDITER = rc(
|
|
123
123
|
rb"""
|
|
@@ -488,6 +488,7 @@ class WikiText:
|
|
|
488
488
|
# Note: The following algorithm won't work correctly if spans
|
|
489
489
|
# are not sorted.
|
|
490
490
|
# Note: No span should be removed from _type_to_spans.
|
|
491
|
+
rmlength = rmstop - rmstart
|
|
491
492
|
for spans in self._type_to_spans.values():
|
|
492
493
|
i = len(spans) - 1
|
|
493
494
|
while i >= 0:
|
|
@@ -495,7 +496,6 @@ class WikiText:
|
|
|
495
496
|
s, e, _, b = span = spans[i]
|
|
496
497
|
if rmstop <= s:
|
|
497
498
|
# rmstart <= rmstop <= s <= e
|
|
498
|
-
rmlength = rmstop - rmstart
|
|
499
499
|
# todo
|
|
500
500
|
span[:] = s - rmlength, e - rmlength, None, None
|
|
501
501
|
i -= 1
|
|
@@ -508,7 +508,7 @@ class WikiText:
|
|
|
508
508
|
if rmstop < e:
|
|
509
509
|
# rmstart < s <= rmstop < e
|
|
510
510
|
# todo: update byte_array instead
|
|
511
|
-
span[:] = rmstart, e
|
|
511
|
+
span[:] = rmstart, e - rmlength, None, None
|
|
512
512
|
i -= 1
|
|
513
513
|
if i < 0:
|
|
514
514
|
break
|
|
@@ -531,7 +531,7 @@ class WikiText:
|
|
|
531
531
|
s, e, _, _ = span = spans[i]
|
|
532
532
|
continue
|
|
533
533
|
# s <= rmstart <= rmstop <= e
|
|
534
|
-
span[1] -= rmstop - rmstart
|
|
534
|
+
span[1] -= rmlength
|
|
535
535
|
span[2] = None
|
|
536
536
|
# todo: update bytearray instead
|
|
537
537
|
span[3] = None
|
|
@@ -621,13 +621,12 @@ class WikiText:
|
|
|
621
621
|
self.string.
|
|
622
622
|
"""
|
|
623
623
|
ss, se, _, _ = self._span_data
|
|
624
|
-
if ss == 0 and se == len(self._lststr[0]):
|
|
625
|
-
return deepcopy(self._type_to_spans)
|
|
626
624
|
return {
|
|
627
625
|
type_: [
|
|
628
626
|
[s - ss, e - ss, m, ba[:] if ba is not None else None]
|
|
629
|
-
for s, e, m, ba in spans[
|
|
630
|
-
|
|
627
|
+
for s, e, m, ba in spans[
|
|
628
|
+
bisect_right(spans, [ss]) : bisect_right(spans, [se])
|
|
629
|
+
]
|
|
631
630
|
]
|
|
632
631
|
for type_, spans in self._type_to_spans.items()
|
|
633
632
|
}
|
|
@@ -1010,66 +1009,82 @@ class WikiText:
|
|
|
1010
1009
|
]
|
|
1011
1010
|
|
|
1012
1011
|
@property
|
|
1013
|
-
def _balanced_quotes_shadow(self):
|
|
1014
|
-
"""Return
|
|
1012
|
+
def _balanced_quotes_shadow(self) -> bytearray:
|
|
1013
|
+
"""Return a byte array with non-markup-apostrophes removed.
|
|
1015
1014
|
|
|
1016
1015
|
The comments at /includes/parser/Parser.php:doQuotes are helpful:
|
|
1017
1016
|
https://github.com/wikimedia/mediawiki/blob/master/includes/parser/Parser.php
|
|
1018
1017
|
https://phabricator.wikimedia.org/T15227#178834
|
|
1019
1018
|
"""
|
|
1020
|
-
|
|
1019
|
+
bold_starts: List[int] = []
|
|
1021
1020
|
odd_italics = False
|
|
1022
1021
|
odd_bold_italics = False
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
first_multi_letter_word
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1022
|
+
append_bold_start = bold_starts.append
|
|
1023
|
+
|
|
1024
|
+
def process_line(line: bytes) -> bytes:
|
|
1025
|
+
nonlocal odd_italics, odd_bold_italics
|
|
1026
|
+
if odd_italics and (len(bold_starts) + odd_bold_italics) % 2:
|
|
1027
|
+
# one of the bold marks needs to be interpreted as italic
|
|
1028
|
+
first_multi_letter_word = first_space = None
|
|
1029
|
+
for s in bold_starts:
|
|
1030
|
+
if line[s - 1] == 32: # space
|
|
1031
|
+
if first_space is None:
|
|
1032
|
+
first_space = s
|
|
1033
|
+
continue
|
|
1034
|
+
if line[s - 2] == 32: # space
|
|
1035
|
+
line = line[:s] + b' ' + line[s + 1 :]
|
|
1036
|
+
break # first_single_letter_word
|
|
1037
|
+
if first_multi_letter_word is None:
|
|
1038
|
+
first_multi_letter_word = s
|
|
1039
|
+
continue
|
|
1040
|
+
else: # there was no first_single_letter_word
|
|
1041
|
+
if first_multi_letter_word is not None:
|
|
1042
|
+
line = (
|
|
1043
|
+
line[:first_multi_letter_word]
|
|
1044
|
+
+ b'_'
|
|
1045
|
+
+ line[first_multi_letter_word + 1 :]
|
|
1046
|
+
)
|
|
1047
|
+
elif first_space is not None:
|
|
1048
|
+
line = (
|
|
1049
|
+
line[:first_space] + b'_' + line[first_space + 1 :]
|
|
1050
|
+
)
|
|
1051
|
+
# reset state for the next line
|
|
1052
|
+
bold_starts.clear()
|
|
1053
|
+
odd_italics = False
|
|
1054
|
+
odd_bold_italics = False
|
|
1055
|
+
return line
|
|
1056
|
+
|
|
1057
|
+
def process_apostrophes(m) -> bytes:
|
|
1058
|
+
nonlocal odd_italics, odd_bold_italics
|
|
1059
|
+
starts = m.starts(1)
|
|
1060
|
+
n = len(starts)
|
|
1061
|
+
if n == 2: # italic
|
|
1057
1062
|
odd_italics ^= True
|
|
1058
|
-
|
|
1059
|
-
if
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1063
|
+
return m[0]
|
|
1064
|
+
if n == 3: # bold
|
|
1065
|
+
append_bold_start(starts[0])
|
|
1066
|
+
return m[0]
|
|
1067
|
+
if n == 5:
|
|
1068
|
+
odd_bold_italics ^= True
|
|
1069
|
+
odd_italics ^= True
|
|
1070
|
+
return m[0]
|
|
1071
|
+
if n == 4: # four apostrophes -> hide the first one
|
|
1072
|
+
s = starts[1]
|
|
1073
|
+
append_bold_start(s)
|
|
1074
|
+
return b'_' * (s - starts[0]) + m.string[s : m.end()]
|
|
1075
|
+
if n > 5: # more than 5 apostrophes -> hide the prior ones
|
|
1076
|
+
odd_bold_italics ^= True
|
|
1077
|
+
odd_italics ^= True
|
|
1078
|
+
s = starts[-5]
|
|
1079
|
+
return b'_' * (s - starts[0]) + m.string[s : m.end()]
|
|
1080
|
+
raise # execution should never reach here
|
|
1081
|
+
|
|
1082
|
+
return bytearray(b'\n').join(
|
|
1083
|
+
[
|
|
1084
|
+
process_line(substitute_apostrophes(process_apostrophes, line))
|
|
1085
|
+
for line in self._shadow.splitlines()
|
|
1086
|
+
]
|
|
1087
|
+
)
|
|
1073
1088
|
|
|
1074
1089
|
def _bolds_italics_recurse(self, result: list, filter_cls: Optional[type]):
|
|
1075
1090
|
for prop in (
|
|
@@ -1121,8 +1136,8 @@ class WikiText:
|
|
|
1121
1136
|
bold_spans = tts_setdefault('Bold', [])
|
|
1122
1137
|
get_old_bold_span = {(s[0], s[1]): s for s in bold_spans}.get
|
|
1123
1138
|
bold_matches = list(BOLD_FINDITER(balanced_shadow, rs, re))
|
|
1124
|
-
for match in bold_matches:
|
|
1125
|
-
ms, me = match.span()
|
|
1139
|
+
for m in bold_matches:
|
|
1140
|
+
ms, me = m.span()
|
|
1126
1141
|
b, e = s + ms, s + me
|
|
1127
1142
|
old_span = get_old_bold_span((b, e))
|
|
1128
1143
|
if old_span is None:
|
|
@@ -1144,16 +1159,16 @@ class WikiText:
|
|
|
1144
1159
|
# filter_cls is None or filter_cls is Italic
|
|
1145
1160
|
|
|
1146
1161
|
# remove bold tokens before searching for italics
|
|
1147
|
-
for match in bold_matches:
|
|
1148
|
-
ms, me = match.span()
|
|
1149
|
-
cs, ce = match.span(1) # content
|
|
1162
|
+
for m in bold_matches:
|
|
1163
|
+
ms, me = m.span()
|
|
1164
|
+
cs, ce = m.span(1) # content
|
|
1150
1165
|
balanced_shadow[ms:cs] = b'_' * (cs - ms)
|
|
1151
1166
|
balanced_shadow[ce:me] = b'_' * (me - ce)
|
|
1152
1167
|
|
|
1153
1168
|
italic_spans = tts_setdefault('Italic', [])
|
|
1154
1169
|
get_old_italic_span = {(s[0], s[1]): s for s in italic_spans}.get
|
|
1155
|
-
for match in ITALIC_FINDITER(balanced_shadow, rs, re):
|
|
1156
|
-
ms, me = match.span()
|
|
1170
|
+
for m in ITALIC_FINDITER(balanced_shadow, rs, re):
|
|
1171
|
+
ms, me = m.span()
|
|
1157
1172
|
b, e = span = s + ms, s + me
|
|
1158
1173
|
old_span = get_old_italic_span(span)
|
|
1159
1174
|
if old_span is None:
|
|
@@ -1162,9 +1177,7 @@ class WikiText:
|
|
|
1162
1177
|
else:
|
|
1163
1178
|
span = old_span
|
|
1164
1179
|
append(
|
|
1165
|
-
Italic(
|
|
1166
|
-
_lststr, type_to_spans, span, 'Bold', me != match.end(1)
|
|
1167
|
-
)
|
|
1180
|
+
Italic(_lststr, type_to_spans, span, 'Bold', me != m.end(1))
|
|
1168
1181
|
)
|
|
1169
1182
|
if recursive and filter_cls is Italic:
|
|
1170
1183
|
self._bolds_italics_recurse(result, filter_cls)
|
|
@@ -1338,7 +1351,7 @@ class WikiText:
|
|
|
1338
1351
|
|
|
1339
1352
|
if level is not None:
|
|
1340
1353
|
section_spans = compress(
|
|
1341
|
-
section_spans, [
|
|
1354
|
+
section_spans, [lvl == level for lvl in levels]
|
|
1342
1355
|
)
|
|
1343
1356
|
|
|
1344
1357
|
return self._section_spans_to_sections(section_spans, shadow)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|