lxml-5.3.2-cp312-cp312-win32.whl → lxml-6.0.0-cp312-cp312-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/__init__.py +1 -1
- lxml/_elementpath.cp312-win32.pyd +0 -0
- lxml/_elementpath.py +3 -1
- lxml/apihelpers.pxi +25 -17
- lxml/builder.cp312-win32.pyd +0 -0
- lxml/builder.py +11 -0
- lxml/debug.pxi +0 -54
- lxml/etree.cp312-win32.pyd +0 -0
- lxml/etree.h +244 -248
- lxml/etree.pyx +154 -33
- lxml/etree_api.h +204 -195
- lxml/extensions.pxi +3 -6
- lxml/html/__init__.py +7 -3
- lxml/html/_difflib.cp312-win32.pyd +0 -0
- lxml/html/_difflib.py +2106 -0
- lxml/html/builder.py +40 -0
- lxml/html/defs.py +3 -3
- lxml/html/diff.cp312-win32.pyd +0 -0
- lxml/html/diff.py +406 -312
- lxml/includes/etree_defs.h +6 -6
- lxml/includes/lxml-version.h +1 -1
- lxml/includes/tree.pxd +10 -12
- lxml/includes/xmlparser.pxd +46 -8
- lxml/lxml.etree.h +24 -28
- lxml/lxml.etree_api.h +59 -50
- lxml/objectify.cp312-win32.pyd +0 -0
- lxml/objectify.pyx +11 -7
- lxml/parser.pxi +106 -47
- lxml/sax.cp312-win32.pyd +0 -0
- lxml/sax.py +11 -0
- lxml/saxparser.pxi +14 -14
- lxml/schematron.pxi +8 -3
- lxml/serializer.pxi +71 -3
- lxml/xslt.pxi +10 -3
- lxml-6.0.0.dist-info/METADATA +163 -0
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/RECORD +40 -38
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/WHEEL +1 -1
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/licenses/LICENSE.txt +3 -1
- lxml-5.3.2.dist-info/METADATA +0 -100
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/licenses/LICENSES.txt +0 -0
- {lxml-5.3.2.dist-info → lxml-6.0.0.dist-info}/top_level.txt +0 -0
lxml/html/diff.py
CHANGED
@@ -1,35 +1,74 @@
 # cython: language_level=3
 
+try:
+    import cython
+except ImportError:
+    class fake_cython:
+        compiled = False
+        def cfunc(self, func): return func
+        def cclass(self, func): return func
+        def declare(self, _, value): return value
+        def __getattr__(self, type_name): return "object"
+
+    cython = fake_cython()
+
+try:
+    from . import _difflib as difflib
+    import inspect
+    if inspect.isfunction(difflib.get_close_matches):
+        raise ImportError(
+            "Embedded difflib is not compiled to a fast binary, using the stdlib instead.")
+    from cython.cimports.lxml.html._difflib import SequenceMatcher
+except ImportError:
+    import difflib
+    if not cython.compiled:
+        from difflib import SequenceMatcher
+
+import itertools
+import functools
+import operator
+import re
 
-import difflib
 from lxml import etree
 from lxml.html import fragment_fromstring
-import re
+from . import defs
 
 __all__ = ['html_annotate', 'htmldiff']
 
-try:
-    from html import escape as html_escape
-except ImportError:
-    from cgi import escape as html_escape
-try:
-    _unicode = unicode
-except NameError:
-    # Python 3
-    _unicode = str
-try:
-    basestring
-except NameError:
-    # Python 3
-    basestring = str
+group_by_first_item = functools.partial(itertools.groupby, key=operator.itemgetter(0))
+
 
 ############################################################
 ## Annotation
 ############################################################
 
+@cython.cfunc
+def html_escape(text: str, _escapes: tuple = ('&amp;', '&lt;', '&gt;', '&quot;', '&#x27;')) -> str:
+    # Not so slow compiled version of 'html.escape()'.
+    # Most of the time, we replace little to nothing, so use a fast decision what needs to be done.
+    ch: cython.Py_UCS4
+    replace: cython.char[5] = [False] * 5
+    for ch in text:
+        replace[0] |= ch == '&'
+        replace[1] |= ch == '<'
+        replace[2] |= ch == '>'
+        replace[3] |= ch == '"'
+        replace[4] |= ch == "'"
+
+    for i in range(5):
+        if replace[i]:
+            text = text.replace('&<>"\''[i], _escapes[i])
+
+    return text
+
+
+if not cython.compiled:
+    from html import escape as html_escape
+
+
 def default_markup(text, version):
     return '<span title="%s">%s</span>' % (
-        html_escape(_unicode(version), 1), text)
+        html_escape(version), text)
 
 def html_annotate(doclist, markup=default_markup):
     """
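
Note (not part of the diff): the fake_cython fallback above is the standard trick for source files that are optionally compiled with Cython. A minimal, self-contained sketch of the same pattern, runnable under plain CPython:

    # Sketch of the optional-compilation shim; illustration only.
    try:
        import cython
    except ImportError:
        class fake_cython:
            compiled = False                             # plain CPython path
            def cfunc(self, func): return func           # decorator becomes a no-op
            def __getattr__(self, type_name): return "object"
        cython = fake_cython()

    @cython.cfunc                       # no-op when not compiled
    def double(x: cython.int) -> cython.int:
        return 2 * x

    print(double(21), cython.compiled)  # -> 42 False (when interpreted)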
@@ -71,15 +110,15 @@ def html_annotate(doclist, markup=default_markup):
     result = markup_serialize_tokens(cur_tokens, markup)
     return ''.join(result).strip()
 
-def tokenize_annotated(doc, annotation):
+def tokenize_annotated(doc, annotation):
     """Tokenize a document and add an annotation attribute to each token
     """
     tokens = tokenize(doc, include_hrefs=False)
-    for tok in tokens:
+    for tok in tokens:
         tok.annotation = annotation
     return tokens
 
-def html_annotate_merge_annotations(tokens_old, tokens_new):
+def html_annotate_merge_annotations(tokens_old, tokens_new):
     """Merge the annotations from tokens_old into tokens_new, when the
     tokens in the new document already existed in the old document.
     """
@@ -87,52 +126,50 @@ def html_annotate_merge_annotations(tokens_old, tokens_new):
     commands = s.get_opcodes()
 
     for command, i1, i2, j1, j2 in commands:
-        if command == 'equal':
+        if command == 'equal':
             eq_old = tokens_old[i1:i2]
             eq_new = tokens_new[j1:j2]
             copy_annotations(eq_old, eq_new)
 
-def copy_annotations(src, dest):
+def copy_annotations(src, dest):
     """
     Copy annotations from the tokens listed in src to the tokens in dest
     """
     assert len(src) == len(dest)
-    for src_tok, dest_tok in zip(src, dest):
+    for src_tok, dest_tok in zip(src, dest):
         dest_tok.annotation = src_tok.annotation
 
 def compress_tokens(tokens):
     """
-    Combine adjacent tokens when there is no HTML between the tokens,
+    Combine adjacent tokens when there is no HTML between the tokens,
     and they share an annotation
     """
-    result = [tokens[0]]
-    for tok in tokens[1:]:
-        if (not tok.pre_tags and
-            not result[-1].post_tags and
-            result[-1].annotation == tok.annotation):
+    result = [tokens[0]]
+    for tok in tokens[1:]:
+        if (not tok.pre_tags and
+                not result[-1].post_tags and
+                result[-1].annotation == tok.annotation):
             compress_merge_back(result, tok)
-        else:
+        else:
             result.append(tok)
     return result
 
-def compress_merge_back(tokens, tok):
+@cython.cfunc
+def compress_merge_back(tokens: list, tok):
     """ Merge tok into the last element of tokens (modifying the list of
     tokens in-place). """
     last = tokens[-1]
-    if type(last) is not token or type(tok) is not token:
+    if type(last) is not token or type(tok) is not token:
         tokens.append(tok)
     else:
-        text = _unicode(last)
-        if last.trailing_whitespace:
-            text += last.trailing_whitespace
-        text += tok
+        text = last + last.trailing_whitespace + tok
         merged = token(text,
                        pre_tags=last.pre_tags,
                        post_tags=tok.post_tags,
                        trailing_whitespace=tok.trailing_whitespace)
         merged.annotation = last.annotation
         tokens[-1] = merged
-
+
 def markup_serialize_tokens(tokens, markup_func):
     """
     Serialize the list of tokens into a list of text chunks, calling
@@ -141,9 +178,7 @@ def markup_serialize_tokens(tokens, markup_func):
     for token in tokens:
         yield from token.pre_tags
         html = token.html()
-        html = markup_func(html, token.annotation)
-        if token.trailing_whitespace:
-            html += token.trailing_whitespace
+        html = markup_func(html, token.annotation) + token.trailing_whitespace
         yield html
         yield from token.post_tags
 
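
Note (not part of the diff): the markup_func hook in this hunk receives each token's HTML plus its annotation. A hedged usage sketch; the class name and version labels are made up for illustration:

    # Illustration only: annotating two document versions with a custom
    # markup function; html_annotate passes (html, annotation) to the callback.
    from lxml.html.diff import html_annotate

    def markup(text, version):
        return '<span class="rev-%s">%s</span>' % (version, text)

    annotated = html_annotate(
        [('<p>Hello World</p>', 'v1'), ('<p>Hello Earth</p>', 'v2')],
        markup=markup)
    print(annotated)  # words introduced in v2 end up in rev-v2 spans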
@@ -160,7 +195,7 @@ def htmldiff(old_html, new_html):
     (i.e., no <html> tag).
 
     Returns HTML with <ins> and <del> tags added around the
-    appropriate text.
+    appropriate text.
 
     Markup is generally ignored, with the markup from new_html
     preserved, and possibly some markup from old_html (though it is
@@ -168,20 +203,25 @@
     words in the HTML are diffed.  The exception is <img> tags, which
     are treated like words, and the href attribute of <a> tags, which
     are noted inside the tag itself when there are changes.
-    """
+    """
     old_html_tokens = tokenize(old_html)
     new_html_tokens = tokenize(new_html)
     result = htmldiff_tokens(old_html_tokens, new_html_tokens)
-    result = ''.join(result).strip()
+    try:
+        result = ''.join(result).strip()
+    except (ValueError, TypeError) as exc:
+        print(exc)
+        result = ''
     return fixup_ins_del_tags(result)
 
+
 def htmldiff_tokens(html1_tokens, html2_tokens):
     """ Does a diff on the tokens themselves, returning a list of text
     chunks (not tokens).
     """
     # There are several passes as we do the differences.  The tokens
     # isolate the portion of the content we care to diff; difflib does
-    # all the actual hard work at that point.
+    # all the actual hard work at that point.
     #
     # Then we must create a valid document from pieces of both the old
     # document and the new document.  We generally prefer to take
@@ -205,14 +245,16 @@ def htmldiff_tokens(html1_tokens, html2_tokens):
         if command == 'delete' or command == 'replace':
             del_tokens = expand_tokens(html1_tokens[i1:i2])
             merge_delete(del_tokens, result)
+
     # If deletes were inserted directly as <del> then we'd have an
     # invalid document at this point.  Instead we put in special
     # markers, and when the complete diffed document has been created
     # we try to move the deletes around and resolve any problems.
-    result = cleanup_delete(result)
+    cleanup_delete(result)
 
     return result
 
+
 def expand_tokens(tokens, equal=False):
     """Given a list of tokens, return a generator of the chunks of
     text for the data in the tokens.
@@ -220,31 +262,64 @@ def expand_tokens(tokens, equal=False):
     for token in tokens:
         yield from token.pre_tags
         if not equal or not token.hide_when_equal:
-            if token.trailing_whitespace:
-                yield token.html() + token.trailing_whitespace
-            else:
-                yield token.html()
+            yield token.html() + token.trailing_whitespace
         yield from token.post_tags
 
-def merge_insert(ins_chunks, doc):
+
+def merge_insert(ins_chunks, doc: list):
     """ doc is the already-handled document (as a list of text chunks);
     here we add <ins>ins_chunks</ins> to the end of that. """
-    # Though we don't throw away unbalanced_start/end
+    # Though we don't throw away unbalanced start/end tags
     # (we assume there is accompanying markup later or earlier in the
     # document), we only put <ins> around the balanced portion.
-    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
-    doc.extend(unbalanced_start)
-    if doc and not doc[-1].endswith(' '):
-        # Fix up the case where the word before the insert didn't end with
-        # a space
-        doc[-1] += ' '
-    doc.append('<ins>')
-    if balanced and balanced[-1].endswith(' '):
-        # We move space outside of </ins>
-        balanced[-1] = balanced[-1][:-1]
-    doc.extend(balanced)
-    doc.append('</ins> ')
-    doc.extend(unbalanced_end)
+
+    # Legacy note: We make a choice here. Originally, we merged all sequences of
+    # unbalanced tags together into separate start and end tag groups. Now, we look at
+    # each sequence separately, leading to more fine-grained diffs but different
+    # tag structure than before.
+
+    item: tuple
+    for balanced, marked_chunks in group_by_first_item(mark_unbalanced(ins_chunks)):
+        chunks = [item[1] for item in marked_chunks]
+        if balanced == 'b':
+            if doc and not doc[-1].endswith(' '):
+                # Fix up the case where the word before the insert didn't end with a space.
+                doc[-1] += ' '
+            doc.append('<ins>')
+            doc.extend(chunks)
+            if doc[-1].endswith(' '):
+                # We move space outside of </ins>.
+                doc[-1] = doc[-1][:-1]
+            doc.append('</ins> ')
+        else:
+            # unmatched start or end
+            doc.extend(chunks)
+
+
+@cython.cfunc
+def tag_name_of_chunk(chunk: str) -> str:
+    i: cython.Py_ssize_t
+    ch: cython.Py_UCS4
+
+    if chunk[0] != '<':
+        return ""
+
+    start_pos = 1
+    for i, ch in enumerate(chunk):
+        if ch == '/':
+            start_pos = 2
+        elif ch == '>':
+            return chunk[start_pos:i]
+        elif ch.isspace():
+            return chunk[start_pos:i]
+
+    return chunk[start_pos:]
+
+
+if not cython.compiled:
+    # Avoid performance regression in Python due to string iteration.
+    def tag_name_of_chunk(chunk: str) -> str:
+        return chunk.split(None, 1)[0].strip('<>/')
+
 
 # These are sentinels to represent the start and end of a <del>
 # segment, until we do the cleanup phase to turn them into proper
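
For orientation (not part of the diff), the two tag_name_of_chunk variants above are meant to agree on tag chunks; a quick sketch:

    # Illustration only: both the compiled and the pure-Python variant
    # extract the bare tag name from a start or end tag chunk.
    assert tag_name_of_chunk('<div class="x">') == 'div'
    assert tag_name_of_chunk('</p>') == 'p'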
@@ -254,19 +329,18 @@ class DEL_START:
 class DEL_END:
     pass
 
-class NoDeletes(Exception):
-    """ Raised when the document no longer contains any pending deletes
-    (DEL_START/DEL_END) """
 
-def merge_delete(del_chunks, doc):
+def merge_delete(del_chunks, doc: list):
     """ Adds the text chunks in del_chunks to the document doc (another
     list of text chunks) with marker to show it is a delete.
     cleanup_delete later resolves these markers into <del> tags."""
+
     doc.append(DEL_START)
     doc.extend(del_chunks)
     doc.append(DEL_END)
 
-def cleanup_delete(chunks):
+
+def cleanup_delete(chunks: list):
     """ Cleans up any DEL_START/DEL_END markers in the document, replacing
     them with <del></del>.  To do this while keeping the document
     valid, it may need to drop some tags (either start or end tags).
@@ -274,166 +348,192 @@ def cleanup_delete(chunks):
     It may also move the del into adjacent tags to try to move it to a
     similar location where it was originally located (e.g., moving a
     delete into preceding <div> tag, if the del looks like (DEL_START,
-    'Text</div>', DEL_END)"""
+    'Text</div>', DEL_END)
+    """
+    chunk_count = len(chunks)
+
+    i: cython.Py_ssize_t
+    del_start: cython.Py_ssize_t
+    del_end: cython.Py_ssize_t
+    shift_start_right: cython.Py_ssize_t
+    shift_end_left: cython.Py_ssize_t
+    unbalanced_start: cython.Py_ssize_t
+    unbalanced_end: cython.Py_ssize_t
+    pos: cython.Py_ssize_t
+    start_pos: cython.Py_ssize_t
+    chunk: str
+
+    start_pos = 0
     while 1:
         # Find a pending DEL_START/DEL_END, splitting the document
         # into stuff-preceding-DEL_START, stuff-inside, and
         # stuff-following-DEL_END
         try:
-            pre_delete, delete, post_delete = split_delete(chunks)
-        except NoDeletes:
+            del_start = chunks.index(DEL_START, start_pos)
+        except ValueError:
             # Nothing found, we've cleaned up the entire doc
             break
-        # The stuff-inside-DEL_START/END may not be well balanced
-        # markup.  First we figure out what unbalanced portions there are:
-        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
-        # Then we move the span forward and/or backward based on these
-        # unbalanced portions:
-        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
-        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
-        doc = pre_delete
-        if doc and not doc[-1].endswith(' '):
-            # Fix up case where the word before us didn't have a trailing space
-            doc[-1] += ' '
-        doc.append('<del>')
-        if balanced and balanced[-1].endswith(' '):
-            # We move space outside of </del>
-            balanced[-1] = balanced[-1][:-1]
-        doc.extend(balanced)
-        doc.append('</del> ')
-        doc.extend(post_delete)
-        chunks = doc
-    return chunks
-
-def split_unbalanced(chunks):
-    """Return (unbalanced_start, balanced, unbalanced_end), where each is
-    a list of text and tag chunks.
-
-    unbalanced_start is a list of all the tags that are opened, but
-    not closed in this span.  Similarly, unbalanced_end is a list of
-    tags that are closed but were not opened.  Extracting these might
-    mean some reordering of the chunks."""
-    start = []
-    end = []
+        else:
+            del_end = chunks.index(DEL_END, del_start + 1)
+
+        shift_end_left = shift_start_right = 0
+        unbalanced_start = unbalanced_end = 0
+        deleted_chunks = mark_unbalanced(chunks[del_start+1:del_end])
+
+        # For unbalanced start tags at the beginning, find matching (non-deleted)
+        # end tags after the current DEL_END and move the start tag outside.
+        for balanced, del_chunk in deleted_chunks:
+            if balanced != 'us':
+                break
+            unbalanced_start += 1
+            unbalanced_start_name = tag_name_of_chunk(del_chunk)
+            for i in range(del_end+1, chunk_count):
+                if chunks[i] is DEL_START:
+                    break
+                chunk = chunks[i]
+                if chunk[0] != '<' or chunk[1] == '/':
+                    # Reached a word or closing tag.
+                    break
+                name = tag_name_of_chunk(chunk)
+                if name == 'ins':
+                    # Cannot move into an insert.
+                    break
+                assert name != 'del', f"Unexpected delete tag: {chunk!r}"
+                if name != unbalanced_start_name:
+                    # Avoid mixing in other start tags.
+                    break
+                # Exclude start tag to balance the end tag.
+                shift_start_right += 1
+
+        # For unbalanced end tags at the end, find matching (non-deleted)
+        # start tags before the currend DEL_START and move the end tag outside.
+        for balanced, del_chunk in reversed(deleted_chunks):
+            if balanced != 'ue':
+                break
+            unbalanced_end += 1
+            unbalanced_end_name = tag_name_of_chunk(del_chunk)
+            for i in range(del_start - 1, -1, -1):
+                if chunks[i] is DEL_END:
+                    break
+                chunk = chunks[i]
+                if chunk[0] == '<' and chunk[1] != '/':
+                    # Reached an opening tag, can we go further? Maybe not...
+                    break
+                name = tag_name_of_chunk(chunk)
+                if name == 'ins' or name == 'del':
+                    # Cannot move into an insert or delete.
+                    break
+                if name != unbalanced_end_name:
+                    # Avoid mixing in other start tags.
+                    break
+                # Exclude end tag to balance the start tag.
+                shift_end_left += 1
+
+        """
+        # This is what we do below in loops, spelled out using slicing and list copying:
+
+        chunks[del_start - shift_end_left : del_end + shift_start_right + 1] = [
+            *chunks[del_start + 1: del_start + shift_start_right + 1],
+            '<del>',
+            *chunks[del_start + unbalanced_start + 1 : del_end - unbalanced_end],
+            '</del> ',
+            *chunks[del_end - shift_end_left: del_end],
+        ]
+
+        new_del_end = del_end - 2 * shift_end_left
+        assert chunks[new_del_end] == '</del> '
+        del_end = new_del_end
+
+        if new_del_start > 0 and not chunks[new_del_start - 1].endswith(' '):
+            # Fix up case where the word before us didn't have a trailing space.
+            chunks[new_del_start - 1] += ' '
+        if new_del_end > 0 and chunks[new_del_end - 1].endswith(' '):
+            # Move space outside of </del>.
+            chunks[new_del_end - 1] = chunks[new_del_end - 1][:-1]
+        """
+        pos = del_start - shift_end_left
+        # Move re-balanced start tags before the '<del>'.
+        for i in range(del_start + 1, del_start + shift_start_right + 1):
+            chunks[pos] = chunks[i]
+            pos += 1
+        if pos and not chunks[pos - 1].endswith(' '):
+            # Fix up the case where the word before '<del>' didn't have a trailing space.
+            chunks[pos - 1] += ' '
+        chunks[pos] = '<del>'
+        pos += 1
+        # Copy only the balanced deleted content between '<del>' and '</del>'.
+        for i in range(del_start + unbalanced_start + 1, del_end - unbalanced_end):
+            chunks[pos] = chunks[i]
+            pos += 1
+        if chunks[pos - 1].endswith(' '):
+            # Move trailing space outside of </del>.
+            chunks[pos - 1] = chunks[pos - 1][:-1]
+        chunks[pos] = '</del> '
+        pos += 1
+        # Move re-balanced end tags after the '</del>'.
+        for i in range(del_end - shift_end_left, del_end):
+            chunks[pos] = chunks[i]
+            pos += 1
+        # Adjust the length of the processed part in 'chunks'.
+        del chunks[pos : del_end + shift_start_right + 1]
+        start_pos = pos
+
+
+@cython.cfunc
+def mark_unbalanced(chunks) -> list:
     tag_stack = []
-    balanced = []
+    marked = []
+
+    chunk: str
+    parents: list
+
     for chunk in chunks:
         if not chunk.startswith('<'):
-            balanced.append(chunk)
+            marked.append(('b', chunk))
             continue
-        endtag = chunk[1] == '/'
-        name = chunk.split()[0].strip('<>/')
+
+        name = tag_name_of_chunk(chunk)
         if name in empty_tags:
-            balanced.append(chunk)
+            marked.append(('b', chunk))
             continue
-        if endtag:
-            if tag_stack and tag_stack[-1][0] == name:
-                balanced.append(chunk)
-                name, pos, tag = tag_stack.pop()
-                balanced[pos] = tag
-            elif tag_stack:
-                start.extend([tag for name, pos, tag in tag_stack])
-                tag_stack = []
-                end.append(chunk)
-            else:
-                end.append(chunk)
-        else:
-            tag_stack.append((name, len(balanced), chunk))
-            balanced.append(None)
-    start.extend(
-        [chunk for name, pos, chunk in tag_stack])
-    balanced = [chunk for chunk in balanced if chunk is not None]
-    return start, balanced, end
-
-def split_delete(chunks):
-    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
-    stuff_after_DEL_END).  Returns the first case found (there may be
-    more DEL_STARTs in stuff_after_DEL_END).  Raises NoDeletes if
-    there's no DEL_START found. """
-    try:
-        pos = chunks.index(DEL_START)
-    except ValueError:
-        raise NoDeletes
-    pos2 = chunks.index(DEL_END)
-    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
-
-def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
-    """ pre_delete and post_delete implicitly point to a place in the
-    document (where the two were split).  This moves that point (by
-    popping items from one and pushing them onto the other).  It moves
-    the point to try to find a place where unbalanced_start applies.
-
-    As an example::
-
-        >>> unbalanced_start = ['<div>']
-        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
-        >>> pre, post = doc[:3], doc[3:]
-        >>> pre, post
-        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
-        >>> locate_unbalanced_start(unbalanced_start, pre, post)
-        >>> pre, post
-        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
-
-    As you can see, we moved the point so that the dangling <div> that
-    we found will be effectively replaced by the div in the original
-    document.  If this doesn't work out, we just throw away
-    unbalanced_start without doing anything.
-    """
-    while 1:
-        if not unbalanced_start:
-            # We have totally succeeded in finding the position
-            break
-        finding = unbalanced_start[0]
-        finding_name = finding.split()[0].strip('<>')
-        if not post_delete:
-            break
-        next = post_delete[0]
-        if next is DEL_START or not next.startswith('<'):
-            # Reached a word, we can't move the delete text forward
-            break
-        if next[1] == '/':
-            # Reached a closing tag, can we go further?  Maybe not...
-            break
-        name = next.split()[0].strip('<>')
-        if name == 'ins':
-            # Can't move into an insert
-            break
-        assert name != 'del', (
-            "Unexpected delete tag: %r" % next)
-        if name == finding_name:
-            unbalanced_start.pop(0)
-            pre_delete.append(post_delete.pop(0))
-        else:
-            # Found a tag that doesn't match
-            break
 
-def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
-    """ like locate_unbalanced_start, except handling end tags and
-    possibly moving the point earlier in the document.  """
-    while 1:
-        if not unbalanced_end:
-            # Success
-            break
-        finding = unbalanced_end[-1]
-        finding_name = finding.split()[0].strip('<>/')
-        if not pre_delete:
-            break
-        next = pre_delete[-1]
-        if next is DEL_END or not next.startswith('</'):
-            # A word or a start tag
-            break
-        name = next.split()[0].strip('<>/')
-        if name == 'ins' or name == 'del':
-            # Can't move into an insert or delete
-            break
-        if name == finding_name:
-            unbalanced_end.pop()
-            post_delete.insert(0, pre_delete.pop())
+        if chunk[1] == '/':
+            # closing tag found, unwind tag stack
+            while tag_stack:
+                start_name, start_chunk, parents = tag_stack.pop()
+                if start_name == name:
+                    # balanced tag closing, keep rest of stack intact
+                    parents.append(('b', start_chunk))
+                    parents.extend(marked)
+                    parents.append(('b', chunk))
+                    marked = parents
+                    chunk = None
+                    break
+                else:
+                    # unmatched start tag
+                    parents.append(('us', start_chunk))
+                    parents.extend(marked)
+                    marked = parents
+
+            if chunk is not None:
+                # unmatched end tag left after clearing the stack
+                marked.append(('ue', chunk))
         else:
-            # Found a tag that doesn't match
-            break
+            # new start tag found
+            tag_stack.append((name, chunk, marked))
+            marked = []
 
-class token(_unicode):
+    # add any unbalanced start tags
+    while tag_stack:
+        _, start_chunk, parents = tag_stack.pop()
+        parents.append(('us', start_chunk))
+        parents.extend(marked)
+        marked = parents
+
+    return marked
+
+
+class token(str):
     """ Represents a diffable token, generally a word that is displayed to
     the user.  Opening tags are attached to this token when they are
     adjacent (pre_tags) and closing tags that follow the word
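
Illustration (not part of the diff) of the marker scheme used by mark_unbalanced, assuming the uncompiled code path where the helper is a plain function:

    # 'b' = balanced, 'us' = unmatched start tag, 'ue' = unmatched end tag.
    chunks = ['</div>', '<p>', 'word ', '</p>', '<span>']
    # mark_unbalanced(chunks) is expected to yield:
    # [('ue', '</div>'), ('b', '<p>'), ('b', 'word '), ('b', '</p>'), ('us', '<span>')]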
@@ -451,28 +551,20 @@ class token(_unicode):
     hide_when_equal = False
 
     def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
-        obj = _unicode.__new__(cls, text)
-
-        if pre_tags is not None:
-            obj.pre_tags = pre_tags
-        else:
-            obj.pre_tags = []
-
-        if post_tags is not None:
-            obj.post_tags = post_tags
-        else:
-            obj.post_tags = []
+        obj = str.__new__(cls, text)
 
+        obj.pre_tags = pre_tags if pre_tags is not None else []
+        obj.post_tags = post_tags if post_tags is not None else []
         obj.trailing_whitespace = trailing_whitespace
 
         return obj
 
     def __repr__(self):
-        return 'token(%s, %r, %r, %r)' % (
-            _unicode.__repr__(self), self.pre_tags, self.post_tags, self.trailing_whitespace)
+        return 'token(%s, %r, %r, %r)' % (
+            str.__repr__(self), self.pre_tags, self.post_tags, self.trailing_whitespace)
 
     def html(self):
-        return _unicode(self)
+        return str(self)
 
 class tag_token(token):
 
@@ -480,11 +572,11 @@ class tag_token(token):
     the <img> tag, which takes up visible space just like a word but
     is only represented in a document by a tag.  """
 
-    def __new__(cls, tag, data, html_repr, pre_tags=None,
+    def __new__(cls, tag, data, html_repr, pre_tags=None,
                 post_tags=None, trailing_whitespace=""):
-        obj = token.__new__(cls, "%s: %s" % (type, data),
-                            pre_tags=pre_tags,
-                            post_tags=post_tags,
+        obj = token.__new__(cls, f"{type}: {data}",
+                            pre_tags=pre_tags,
+                            post_tags=post_tags,
                             trailing_whitespace=trailing_whitespace)
         obj.tag = tag
         obj.data = data
@@ -493,11 +585,11 @@ class tag_token(token):
 
     def __repr__(self):
         return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
-            self.tag,
-            self.data,
-            self.html_repr,
-            self.pre_tags,
-            self.post_tags,
+            self.tag,
+            self.data,
+            self.html_repr,
+            self.pre_tags,
+            self.post_tags,
             self.trailing_whitespace)
     def html(self):
         return self.html_repr
@@ -512,6 +604,7 @@ class href_token(token):
     def html(self):
         return ' Link: %s' % self
 
+
 def tokenize(html, include_hrefs=True):
     """
     Parse the given HTML and returns token objects (words with attached tags).
@@ -536,6 +629,7 @@ def tokenize(html, include_hrefs=True):
     # Finally re-joining them into token objects:
     return fixup_chunks(chunks)
 
+
 def parse_html(html, cleanup=True):
     """
     Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
@@ -549,25 +643,24 @@ def parse_html(html, cleanup=True):
         html = cleanup_html(html)
     return fragment_fromstring(html, create_parent=True)
 
-_body_re = re.compile(r'<body.*?>', re.I|re.S)
-_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
-_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
+
+_search_body = re.compile(r'<body.*?>', re.I|re.S).search
+_search_end_body = re.compile(r'</body.*?>', re.I|re.S).search
+_replace_ins_del = re.compile(r'</?(ins|del).*?>', re.I|re.S).sub
 
 def cleanup_html(html):
     """ This 'cleans' the HTML, meaning that any page structure is removed
     (only the contents of <body> are used, if there is any <body).
     Also <ins> and <del> tags are removed.  """
-    match = _body_re.search(html)
+    match = _search_body(html)
     if match:
         html = html[match.end():]
-    match = _end_body_re.search(html)
+    match = _search_end_body(html)
     if match:
         html = html[:match.start()]
-    html = _ins_del_re.sub('', html)
+    html = _replace_ins_del('', html)
     return html
-
 
-end_whitespace_re = re.compile(r'[ \t\n\r]$')
 
 def split_trailing_whitespace(word):
     """
@@ -631,11 +724,9 @@ def fixup_chunks(chunks):
 
 
 # All the tags in HTML that don't require end tags:
-empty_tags = (
-    'param', 'img', 'area', 'br', 'basefont', 'input',
-    'base', 'meta', 'link', 'col')
+empty_tags = cython.declare(frozenset, defs.empty_tags)
 
-block_level_tags = (
+block_level_tags = cython.declare(frozenset, frozenset([
     'address',
     'blockquote',
     'center',
@@ -660,9 +751,9 @@ block_level_tags = (
     'pre',
     'table',
     'ul',
-    )
+    ]))
 
-block_level_container_tags = (
+block_level_container_tags = cython.declare(frozenset, frozenset([
     'dd',
     'dt',
     'frameset',
@@ -673,7 +764,11 @@ block_level_container_tags = (
     'th',
     'thead',
     'tr',
-    )
+    ]))
+
+any_block_level_tag = cython.declare(tuple, tuple(sorted(
+    block_level_tags | block_level_container_tags))
+)
 
 
 def flatten_el(el, include_hrefs, skip_tag=False):
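
Side note (not part of the diff): any_block_level_tag is materialised as a tuple because Element.iter() takes tag names as positional arguments. Illustration only:

    # Illustration only: iter() with several tag names, as used by
    # _contains_block_level_tag() further down in this diff.
    from lxml import etree
    root = etree.fromstring('<ins>text <p>para</p></ins>')
    print([el.tag for el in root.iter('p', 'div')])  # -> ['p']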
@@ -703,7 +798,7 @@ def flatten_el(el, include_hrefs, skip_tag=False):
     for word in end_words:
         yield html_escape(word)
 
-split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
+_find_words = re.compile(r'\S+(?:\s+|$)', re.U).findall
 
 def split_words(text):
     """ Splits some text into words.  Includes trailing whitespace
@@ -711,27 +806,27 @@ def split_words(text):
     if not text or not text.strip():
         return []
 
-    words = split_words_re.findall(text)
+    words = _find_words(text)
     return words
 
-start_whitespace_re = re.compile(r'^[ \t\n\r]')
+_has_start_whitespace = re.compile(r'^[ \t\n\r]').match
 
 def start_tag(el):
     """
     The text representation of the start tag for a tag.
     """
-    return '<%s%s>' % (
-        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
-                         for name, value in el.attrib.items()]))
+    attributes = ''.join([
+        f' {name}="{html_escape(value)}"'
+        for name, value in el.attrib.items()
+    ])
+    return f'<{el.tag}{attributes}>'
 
 def end_tag(el):
     """ The text representation of an end tag for a tag.  Includes
     trailing whitespace when appropriate.  """
-    if el.tail and start_whitespace_re.search(el.tail):
-        extra = ' '
-    else:
-        extra = ''
-    return '</%s>%s' % (el.tag, extra)
+    tail = el.tail
+    extra = ' ' if tail and _has_start_whitespace(tail) else ''
+    return f'</{el.tag}>{extra}'
 
 def is_word(tok):
     return not tok.startswith('<')
@@ -753,13 +848,13 @@ def fixup_ins_del_tags(html):
 
 def serialize_html_fragment(el, skip_outer=False):
     """ Serialize a single lxml element as HTML.  The serialized form
-    includes the elements tail.
+    includes the elements tail.
 
     If skip_outer is true, then don't serialize the outermost tag
     """
-    assert not isinstance(el, basestring), (
-        "You should pass in an element, not a string like %r" % el)
-    html = etree.tostring(el, method="html", encoding=_unicode)
+    assert not isinstance(el, str), (
+        f"You should pass in an element, not a string like {el!r}")
+    html = etree.tostring(el, method="html", encoding='unicode')
     if skip_outer:
         # Get rid of the extra starting tag:
         html = html[html.find('>')+1:]
@@ -769,59 +864,64 @@ def serialize_html_fragment(el, skip_outer=False):
     else:
         return html
 
+
+@cython.cfunc
 def _fixup_ins_del_tags(doc):
     """fixup_ins_del_tags that works on an lxml document in-place
     """
-    for tag in ['ins', 'del']:
-        for el in doc.xpath('descendant-or-self::%s' % tag):
-            if not _contains_block_level_tag(el):
-                continue
-            _move_el_inside_block(el, tag=tag)
-            el.drop_tag()
-            #_merge_element_contents(el)
+    for el in list(doc.iter('ins', 'del')):
+        if not _contains_block_level_tag(el):
+            continue
+        _move_el_inside_block(el, tag=el.tag)
+        el.drop_tag()
+        #_merge_element_contents(el)
+
 
+@cython.cfunc
 def _contains_block_level_tag(el):
     """True if the element contains any block-level elements, like <p>, <td>, etc.
     """
-    if el.tag in block_level_tags or el.tag in block_level_container_tags:
+    for el in el.iter(*any_block_level_tag):
         return True
-    for child in el:
-        if _contains_block_level_tag(child):
-            return True
     return False
 
+
+@cython.cfunc
 def _move_el_inside_block(el, tag):
     """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
     and moves them inside any block-level tags.  """
-    for child in el:
-        if _contains_block_level_tag(child):
+    makeelement = el.makeelement
+    for block_level_el in el.iter(*any_block_level_tag):
+        if block_level_el is not el:
             break
     else:
         # No block-level tags in any child
-        children_tag = etree.Element(tag)
+        children_tag = makeelement(tag)
         children_tag.text = el.text
         el.text = None
-        children_tag.extend(list(el))
+        children_tag.extend(iter(el))
         el[:] = [children_tag]
         return
+
     for child in list(el):
         if _contains_block_level_tag(child):
             _move_el_inside_block(child, tag)
             if child.tail:
-                tail_tag = etree.Element(tag)
+                tail_tag = makeelement(tag)
                 tail_tag.text = child.tail
                 child.tail = None
-                el.insert(el.index(child)+1, tail_tag)
+                child.addnext(tail_tag)
         else:
-            child_tag = etree.Element(tag)
+            child_tag = makeelement(tag)
             el.replace(child, child_tag)
             child_tag.append(child)
     if el.text:
-        text_tag = etree.Element(tag)
+        text_tag = makeelement(tag)
         text_tag.text = el.text
         el.text = None
         el.insert(0, text_tag)
-
+
+
 def _merge_element_contents(el):
     """
     Removes an element, but merges its contents into its place, e.g.,
@@ -829,50 +929,44 @@ def _merge_element_contents(el):
     <p>Hi there!</p>
     """
    parent = el.getparent()
-    text = el.text or ''
-    if el.tail:
+    text = el.text
+    tail = el.tail
+    if tail:
         if not len(el):
-            text += el.tail
+            text = (text or '') + tail
         else:
-            if el[-1].tail:
-                el[-1].tail += el.tail
-            else:
-                el[-1].tail = el.tail
+            el[-1].tail = (el[-1].tail or '') + tail
     index = parent.index(el)
     if text:
-        if index == 0:
-            previous = None
-        else:
-            previous = parent[index-1]
+        previous = el.getprevious()
         if previous is None:
-            if parent.text:
-                parent.text += text
-            else:
-                parent.text = text
+            parent.text = (parent.text or '') + text
         else:
-            if previous.tail:
-                previous.tail += text
-            else:
-                previous.tail = text
+            previous.tail = (previous.tail or '') + text
     parent[index:index+1] = el.getchildren()
 
-class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
+
+@cython.final
+@cython.cclass
+class InsensitiveSequenceMatcher(SequenceMatcher):
     """
     Acts like SequenceMatcher, but tries not to find very small equal
     blocks amidst large spans of changes
     """
 
     threshold = 2
-
-    def get_matching_blocks(self):
-        size = min(len(self.b), len(self.b))
-        threshold = min(self.threshold, size / 4)
-        actual = difflib.SequenceMatcher.get_matching_blocks(self)
+
+    @cython.cfunc
+    def get_matching_blocks(self) -> list:
+        size: cython.Py_ssize_t = min(len(self.b), len(self.b))
+        threshold: cython.Py_ssize_t = self.threshold
+        threshold = min(threshold, size // 4)
+        actual = SequenceMatcher.get_matching_blocks(self)
         return [item for item in actual
                 if item[2] > threshold
                 or not item[2]]
 
+
 if __name__ == '__main__':
     from lxml.html import _diffcommand
     _diffcommand.main()
-
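
For context (not part of the diff), the public entry point exercised by most of the changes above:

    # Illustration only: typical htmldiff() usage; exact output spacing may vary.
    from lxml.html.diff import htmldiff

    old = '<p>Coffee is <b>good</b></p>'
    new = '<p>Tea is <b>good</b></p>'
    print(htmldiff(old, new))
    # e.g. '<p><ins>Tea</ins> <del>Coffee</del> is <b>good</b></p>'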