lxml-6.0.0-cp311-cp311-manylinux_2_31_armv7l.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lxml/ElementInclude.py +244 -0
- lxml/__init__.py +22 -0
- lxml/_elementpath.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/_elementpath.py +343 -0
- lxml/apihelpers.pxi +1801 -0
- lxml/builder.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/builder.py +243 -0
- lxml/classlookup.pxi +580 -0
- lxml/cleanup.pxi +215 -0
- lxml/cssselect.py +101 -0
- lxml/debug.pxi +36 -0
- lxml/docloader.pxi +178 -0
- lxml/doctestcompare.py +488 -0
- lxml/dtd.pxi +479 -0
- lxml/etree.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/etree.h +244 -0
- lxml/etree.pyx +3853 -0
- lxml/etree_api.h +204 -0
- lxml/extensions.pxi +830 -0
- lxml/html/ElementSoup.py +10 -0
- lxml/html/__init__.py +1927 -0
- lxml/html/_diffcommand.py +86 -0
- lxml/html/_difflib.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/html/_difflib.py +2106 -0
- lxml/html/_html5builder.py +100 -0
- lxml/html/_setmixin.py +56 -0
- lxml/html/builder.py +173 -0
- lxml/html/clean.py +21 -0
- lxml/html/defs.py +135 -0
- lxml/html/diff.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/html/diff.py +972 -0
- lxml/html/formfill.py +299 -0
- lxml/html/html5parser.py +260 -0
- lxml/html/soupparser.py +314 -0
- lxml/html/usedoctest.py +13 -0
- lxml/includes/__init__.pxd +0 -0
- lxml/includes/__init__.py +0 -0
- lxml/includes/c14n.pxd +25 -0
- lxml/includes/config.pxd +3 -0
- lxml/includes/dtdvalid.pxd +18 -0
- lxml/includes/etree_defs.h +379 -0
- lxml/includes/etreepublic.pxd +237 -0
- lxml/includes/extlibs/__init__.py +0 -0
- lxml/includes/extlibs/libcharset.h +45 -0
- lxml/includes/extlibs/localcharset.h +137 -0
- lxml/includes/extlibs/zconf.h +543 -0
- lxml/includes/extlibs/zlib.h +1938 -0
- lxml/includes/htmlparser.pxd +56 -0
- lxml/includes/libexslt/__init__.py +0 -0
- lxml/includes/libexslt/exslt.h +108 -0
- lxml/includes/libexslt/exsltconfig.h +70 -0
- lxml/includes/libexslt/exsltexports.h +63 -0
- lxml/includes/libxml/HTMLparser.h +339 -0
- lxml/includes/libxml/HTMLtree.h +148 -0
- lxml/includes/libxml/SAX.h +18 -0
- lxml/includes/libxml/SAX2.h +170 -0
- lxml/includes/libxml/__init__.py +0 -0
- lxml/includes/libxml/c14n.h +115 -0
- lxml/includes/libxml/catalog.h +183 -0
- lxml/includes/libxml/chvalid.h +230 -0
- lxml/includes/libxml/debugXML.h +79 -0
- lxml/includes/libxml/dict.h +82 -0
- lxml/includes/libxml/encoding.h +307 -0
- lxml/includes/libxml/entities.h +147 -0
- lxml/includes/libxml/globals.h +25 -0
- lxml/includes/libxml/hash.h +251 -0
- lxml/includes/libxml/list.h +137 -0
- lxml/includes/libxml/nanoftp.h +16 -0
- lxml/includes/libxml/nanohttp.h +98 -0
- lxml/includes/libxml/parser.h +1633 -0
- lxml/includes/libxml/parserInternals.h +591 -0
- lxml/includes/libxml/relaxng.h +224 -0
- lxml/includes/libxml/schemasInternals.h +959 -0
- lxml/includes/libxml/schematron.h +143 -0
- lxml/includes/libxml/threads.h +81 -0
- lxml/includes/libxml/tree.h +1326 -0
- lxml/includes/libxml/uri.h +106 -0
- lxml/includes/libxml/valid.h +485 -0
- lxml/includes/libxml/xinclude.h +141 -0
- lxml/includes/libxml/xlink.h +193 -0
- lxml/includes/libxml/xmlIO.h +419 -0
- lxml/includes/libxml/xmlautomata.h +163 -0
- lxml/includes/libxml/xmlerror.h +962 -0
- lxml/includes/libxml/xmlexports.h +96 -0
- lxml/includes/libxml/xmlmemory.h +188 -0
- lxml/includes/libxml/xmlmodule.h +61 -0
- lxml/includes/libxml/xmlreader.h +444 -0
- lxml/includes/libxml/xmlregexp.h +116 -0
- lxml/includes/libxml/xmlsave.h +111 -0
- lxml/includes/libxml/xmlschemas.h +254 -0
- lxml/includes/libxml/xmlschemastypes.h +152 -0
- lxml/includes/libxml/xmlstring.h +140 -0
- lxml/includes/libxml/xmlunicode.h +15 -0
- lxml/includes/libxml/xmlversion.h +332 -0
- lxml/includes/libxml/xmlwriter.h +489 -0
- lxml/includes/libxml/xpath.h +569 -0
- lxml/includes/libxml/xpathInternals.h +639 -0
- lxml/includes/libxml/xpointer.h +48 -0
- lxml/includes/libxslt/__init__.py +0 -0
- lxml/includes/libxslt/attributes.h +39 -0
- lxml/includes/libxslt/documents.h +93 -0
- lxml/includes/libxslt/extensions.h +262 -0
- lxml/includes/libxslt/extra.h +72 -0
- lxml/includes/libxslt/functions.h +78 -0
- lxml/includes/libxslt/imports.h +75 -0
- lxml/includes/libxslt/keys.h +53 -0
- lxml/includes/libxslt/namespaces.h +68 -0
- lxml/includes/libxslt/numbersInternals.h +73 -0
- lxml/includes/libxslt/pattern.h +84 -0
- lxml/includes/libxslt/preproc.h +43 -0
- lxml/includes/libxslt/security.h +104 -0
- lxml/includes/libxslt/templates.h +77 -0
- lxml/includes/libxslt/transform.h +207 -0
- lxml/includes/libxslt/variables.h +118 -0
- lxml/includes/libxslt/xslt.h +110 -0
- lxml/includes/libxslt/xsltInternals.h +1995 -0
- lxml/includes/libxslt/xsltconfig.h +146 -0
- lxml/includes/libxslt/xsltexports.h +64 -0
- lxml/includes/libxslt/xsltlocale.h +44 -0
- lxml/includes/libxslt/xsltutils.h +343 -0
- lxml/includes/lxml-version.h +3 -0
- lxml/includes/relaxng.pxd +64 -0
- lxml/includes/schematron.pxd +34 -0
- lxml/includes/tree.pxd +492 -0
- lxml/includes/uri.pxd +5 -0
- lxml/includes/xinclude.pxd +22 -0
- lxml/includes/xmlerror.pxd +852 -0
- lxml/includes/xmlparser.pxd +303 -0
- lxml/includes/xmlschema.pxd +35 -0
- lxml/includes/xpath.pxd +136 -0
- lxml/includes/xslt.pxd +190 -0
- lxml/isoschematron/__init__.py +348 -0
- lxml/isoschematron/resources/rng/iso-schematron.rng +709 -0
- lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl +75 -0
- lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl +77 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl +313 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl +1160 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl +55 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl +1796 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl +588 -0
- lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt +84 -0
- lxml/iterparse.pxi +438 -0
- lxml/lxml.etree.h +244 -0
- lxml/lxml.etree_api.h +204 -0
- lxml/nsclasses.pxi +281 -0
- lxml/objectify.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/objectify.pyx +2149 -0
- lxml/objectpath.pxi +332 -0
- lxml/parser.pxi +2059 -0
- lxml/parsertarget.pxi +180 -0
- lxml/proxy.pxi +619 -0
- lxml/public-api.pxi +178 -0
- lxml/pyclasslookup.py +3 -0
- lxml/readonlytree.pxi +565 -0
- lxml/relaxng.pxi +165 -0
- lxml/sax.cpython-311-arm-linux-gnueabihf.so +0 -0
- lxml/sax.py +286 -0
- lxml/saxparser.pxi +875 -0
- lxml/schematron.pxi +173 -0
- lxml/serializer.pxi +1849 -0
- lxml/usedoctest.py +13 -0
- lxml/xinclude.pxi +67 -0
- lxml/xmlerror.pxi +1654 -0
- lxml/xmlid.pxi +179 -0
- lxml/xmlschema.pxi +215 -0
- lxml/xpath.pxi +487 -0
- lxml/xslt.pxi +957 -0
- lxml/xsltext.pxi +242 -0
- lxml-6.0.0.dist-info/METADATA +163 -0
- lxml-6.0.0.dist-info/RECORD +174 -0
- lxml-6.0.0.dist-info/WHEEL +5 -0
- lxml-6.0.0.dist-info/licenses/LICENSE.txt +31 -0
- lxml-6.0.0.dist-info/licenses/LICENSES.txt +29 -0
- lxml-6.0.0.dist-info/top_level.txt +1 -0
lxml/html/diff.py
ADDED
@@ -0,0 +1,972 @@
# cython: language_level=3

try:
    import cython
except ImportError:
    class fake_cython:
        compiled = False
        def cfunc(self, func): return func
        def cclass(self, func): return func
        def declare(self, _, value): return value
        def __getattr__(self, type_name): return "object"

    cython = fake_cython()

try:
    from . import _difflib as difflib
    import inspect
    if inspect.isfunction(difflib.get_close_matches):
        raise ImportError(
            "Embedded difflib is not compiled to a fast binary, using the stdlib instead.")
    from cython.cimports.lxml.html._difflib import SequenceMatcher
except ImportError:
    import difflib
    if not cython.compiled:
        from difflib import SequenceMatcher

import itertools
import functools
import operator
import re

from lxml import etree
from lxml.html import fragment_fromstring
from . import defs

__all__ = ['html_annotate', 'htmldiff']

group_by_first_item = functools.partial(itertools.groupby, key=operator.itemgetter(0))


############################################################
## Annotation
############################################################

@cython.cfunc
def html_escape(text: str, _escapes: tuple = ('&amp;', '&lt;', '&gt;', '&quot;', '&#39;')) -> str:
    # Not so slow compiled version of 'html.escape()'.
    # Most of the time, we replace little to nothing, so use a fast decision what needs to be done.
    ch: cython.Py_UCS4
    replace: cython.char[5] = [False] * 5
    for ch in text:
        replace[0] |= ch == '&'
        replace[1] |= ch == '<'
        replace[2] |= ch == '>'
        replace[3] |= ch == '"'
        replace[4] |= ch == "'"

    for i in range(5):
        if replace[i]:
            text = text.replace('&<>"\''[i], _escapes[i])

    return text


if not cython.compiled:
    from html import escape as html_escape


def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        html_escape(version), text)

def html_annotate(doclist, markup=default_markup):
    """
    doclist should be ordered from oldest to newest, like::

        >>> version1 = 'Hello World'
        >>> version2 = 'Goodbye World'
        >>> print(html_annotate([(version1, 'version 1'),
        ...                      (version2, 'version 2')]))
        <span title="version 2">Goodbye</span> <span title="version 1">World</span>

    The documents must be *fragments* (str/UTF8 or unicode), not
    complete documents

    The markup argument is a function to markup the spans of words.
    This function is called like markup('Hello', 'version 2'), and
    returns HTML.  The first argument is text and never includes any
    markup.  The default uses a span with a title:

        >>> print(default_markup('Some Text', 'by Joe'))
        <span title="by Joe">Some Text</span>
    """
    # The basic strategy we have is to split the documents up into
    # logical tokens (which are words with attached markup).  We then
    # do diffs of each of the versions to track when a token first
    # appeared in the document; the annotation attached to the token
    # is the version where it first appeared.
    tokenlist = [tokenize_annotated(doc, version)
                 for doc, version in doclist]
    cur_tokens = tokenlist[0]
    for tokens in tokenlist[1:]:
        html_annotate_merge_annotations(cur_tokens, tokens)
        cur_tokens = tokens

    # After we've tracked all the tokens, we can combine spans of text
    # that are adjacent and have the same annotation
    cur_tokens = compress_tokens(cur_tokens)
    # And finally add markup
    result = markup_serialize_tokens(cur_tokens, markup)
    return ''.join(result).strip()

def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
    """
    tokens = tokenize(doc, include_hrefs=False)
    for tok in tokens:
        tok.annotation = annotation
    return tokens

def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
    tokens in the new document already existed in the old document.
    """
    s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
    commands = s.get_opcodes()

    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            eq_old = tokens_old[i1:i2]
            eq_new = tokens_new[j1:j2]
            copy_annotations(eq_old, eq_new)

def copy_annotations(src, dest):
    """
    Copy annotations from the tokens listed in src to the tokens in dest
    """
    assert len(src) == len(dest)
    for src_tok, dest_tok in zip(src, dest):
        dest_tok.annotation = src_tok.annotation

def compress_tokens(tokens):
    """
    Combine adjacent tokens when there is no HTML between the tokens,
    and they share an annotation
    """
    result = [tokens[0]]
    for tok in tokens[1:]:
        if (not tok.pre_tags and
                not result[-1].post_tags and
                result[-1].annotation == tok.annotation):
            compress_merge_back(result, tok)
        else:
            result.append(tok)
    return result

@cython.cfunc
def compress_merge_back(tokens: list, tok):
    """ Merge tok into the last element of tokens (modifying the list of
    tokens in-place).  """
    last = tokens[-1]
    if type(last) is not token or type(tok) is not token:
        tokens.append(tok)
    else:
        text = last + last.trailing_whitespace + tok
        merged = token(text,
                       pre_tags=last.pre_tags,
                       post_tags=tok.post_tags,
                       trailing_whitespace=tok.trailing_whitespace)
        merged.annotation = last.annotation
        tokens[-1] = merged

def markup_serialize_tokens(tokens, markup_func):
    """
    Serialize the list of tokens into a list of text chunks, calling
    markup_func around text to add annotations.
    """
    for token in tokens:
        yield from token.pre_tags
        html = token.html()
        html = markup_func(html, token.annotation) + token.trailing_whitespace
        yield html
        yield from token.post_tags


############################################################
## HTML Diffs
############################################################

def htmldiff(old_html, new_html):
    ## FIXME: this should take parsed documents too, and use their body
    ## or other content.
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode), they are not complete documents
    (i.e., no <html> tag).

    Returns HTML with <ins> and <del> tags added around the
    appropriate text.

    Markup is generally ignored, with the markup from new_html
    preserved, and possibly some markup from old_html (though it is
    considered acceptable to lose some of the old markup).  Only the
    words in the HTML are diffed.  The exception is <img> tags, which
    are treated like words, and the href attribute of <a> tags, which
    are noted inside the tag itself when there are changes.
    """
    old_html_tokens = tokenize(old_html)
    new_html_tokens = tokenize(new_html)
    result = htmldiff_tokens(old_html_tokens, new_html_tokens)
    try:
        result = ''.join(result).strip()
    except (ValueError, TypeError) as exc:
        print(exc)
        result = ''
    return fixup_ins_del_tags(result)


def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
    chunks (not tokens).
    """
    # There are several passes as we do the differences.  The tokens
    # isolate the portion of the content we care to diff; difflib does
    # all the actual hard work at that point.
    #
    # Then we must create a valid document from pieces of both the old
    # document and the new document.  We generally prefer to take
    # markup from the new document, and only do a best effort attempt
    # to keep markup from the old document; anything that we can't
    # resolve we throw away.  Also we try to put the deletes as close
    # to the location where we think they would have been -- because
    # we are only keeping the markup from the new document, it can be
    # fuzzy where in the new document the old text would have gone.
    # Again we just do a best effort attempt.
    s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
    commands = s.get_opcodes()
    result = []
    for command, i1, i2, j1, j2 in commands:
        if command == 'equal':
            result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
            continue
        if command == 'insert' or command == 'replace':
            ins_tokens = expand_tokens(html2_tokens[j1:j2])
            merge_insert(ins_tokens, result)
        if command == 'delete' or command == 'replace':
            del_tokens = expand_tokens(html1_tokens[i1:i2])
            merge_delete(del_tokens, result)

    # If deletes were inserted directly as <del> then we'd have an
    # invalid document at this point.  Instead we put in special
    # markers, and when the complete diffed document has been created
    # we try to move the deletes around and resolve any problems.
    cleanup_delete(result)

    return result


def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
    text for the data in the tokens.
    """
    for token in tokens:
        yield from token.pre_tags
        if not equal or not token.hide_when_equal:
            yield token.html() + token.trailing_whitespace
        yield from token.post_tags


def merge_insert(ins_chunks, doc: list):
    """ doc is the already-handled document (as a list of text chunks);
    here we add <ins>ins_chunks</ins> to the end of that.  """
    # Though we don't throw away unbalanced start/end tags
    # (we assume there is accompanying markup later or earlier in the
    # document), we only put <ins> around the balanced portion.

    # Legacy note: We make a choice here. Originally, we merged all sequences of
    # unbalanced tags together into separate start and end tag groups. Now, we look at
    # each sequence separately, leading to more fine-grained diffs but different
    # tag structure than before.

    item: tuple
    for balanced, marked_chunks in group_by_first_item(mark_unbalanced(ins_chunks)):
        chunks = [item[1] for item in marked_chunks]
        if balanced == 'b':
            if doc and not doc[-1].endswith(' '):
                # Fix up the case where the word before the insert didn't end with a space.
                doc[-1] += ' '
            doc.append('<ins>')
            doc.extend(chunks)
            if doc[-1].endswith(' '):
                # We move space outside of </ins>.
                doc[-1] = doc[-1][:-1]
            doc.append('</ins> ')
        else:
            # unmatched start or end
            doc.extend(chunks)


@cython.cfunc
def tag_name_of_chunk(chunk: str) -> str:
    i: cython.Py_ssize_t
    ch: cython.Py_UCS4

    if chunk[0] != '<':
        return ""

    start_pos = 1
    for i, ch in enumerate(chunk):
        if ch == '/':
            start_pos = 2
        elif ch == '>':
            return chunk[start_pos:i]
        elif ch.isspace():
            return chunk[start_pos:i]

    return chunk[start_pos:]

if not cython.compiled:
    # Avoid performance regression in Python due to string iteration.
    def tag_name_of_chunk(chunk: str) -> str:
        return chunk.split(None, 1)[0].strip('<>/')


# These are sentinels to represent the start and end of a <del>
# segment, until we do the cleanup phase to turn them into proper
# markup:
class DEL_START:
    pass
class DEL_END:
    pass


def merge_delete(del_chunks, doc: list):
    """ Adds the text chunks in del_chunks to the document doc (another
    list of text chunks) with marker to show it is a delete.
    cleanup_delete later resolves these markers into <del> tags."""

    doc.append(DEL_START)
    doc.extend(del_chunks)
    doc.append(DEL_END)


def cleanup_delete(chunks: list):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
    them with <del></del>.  To do this while keeping the document
    valid, it may need to drop some tags (either start or end tags).

    It may also move the del into adjacent tags to try to move it to a
    similar location where it was originally located (e.g., moving a
    delete into preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)
    """
    chunk_count = len(chunks)

    i: cython.Py_ssize_t
    del_start: cython.Py_ssize_t
    del_end: cython.Py_ssize_t
    shift_start_right: cython.Py_ssize_t
    shift_end_left: cython.Py_ssize_t
    unbalanced_start: cython.Py_ssize_t
    unbalanced_end: cython.Py_ssize_t
    pos: cython.Py_ssize_t
    start_pos: cython.Py_ssize_t
    chunk: str

    start_pos = 0
    while 1:
        # Find a pending DEL_START/DEL_END, splitting the document
        # into stuff-preceding-DEL_START, stuff-inside, and
        # stuff-following-DEL_END
        try:
            del_start = chunks.index(DEL_START, start_pos)
        except ValueError:
            # Nothing found, we've cleaned up the entire doc
            break
        else:
            del_end = chunks.index(DEL_END, del_start + 1)

        shift_end_left = shift_start_right = 0
        unbalanced_start = unbalanced_end = 0
        deleted_chunks = mark_unbalanced(chunks[del_start+1:del_end])

        # For unbalanced start tags at the beginning, find matching (non-deleted)
        # end tags after the current DEL_END and move the start tag outside.
        for balanced, del_chunk in deleted_chunks:
            if balanced != 'us':
                break
            unbalanced_start += 1
            unbalanced_start_name = tag_name_of_chunk(del_chunk)
            for i in range(del_end+1, chunk_count):
                if chunks[i] is DEL_START:
                    break
                chunk = chunks[i]
                if chunk[0] != '<' or chunk[1] == '/':
                    # Reached a word or closing tag.
                    break
                name = tag_name_of_chunk(chunk)
                if name == 'ins':
                    # Cannot move into an insert.
                    break
                assert name != 'del', f"Unexpected delete tag: {chunk!r}"
                if name != unbalanced_start_name:
                    # Avoid mixing in other start tags.
                    break
                # Exclude start tag to balance the end tag.
                shift_start_right += 1

        # For unbalanced end tags at the end, find matching (non-deleted)
        # start tags before the currend DEL_START and move the end tag outside.
        for balanced, del_chunk in reversed(deleted_chunks):
            if balanced != 'ue':
                break
            unbalanced_end += 1
            unbalanced_end_name = tag_name_of_chunk(del_chunk)
            for i in range(del_start - 1, -1, -1):
                if chunks[i] is DEL_END:
                    break
                chunk = chunks[i]
                if chunk[0] == '<' and chunk[1] != '/':
                    # Reached an opening tag, can we go further?  Maybe not...
                    break
                name = tag_name_of_chunk(chunk)
                if name == 'ins' or name == 'del':
                    # Cannot move into an insert or delete.
                    break
                if name != unbalanced_end_name:
                    # Avoid mixing in other start tags.
                    break
                # Exclude end tag to balance the start tag.
                shift_end_left += 1

        """
        # This is what we do below in loops, spelled out using slicing and list copying:

        chunks[del_start - shift_end_left : del_end + shift_start_right + 1] = [
            *chunks[del_start + 1: del_start + shift_start_right + 1],
            '<del>',
            *chunks[del_start + unbalanced_start + 1 : del_end - unbalanced_end],
            '</del> ',
            *chunks[del_end - shift_end_left: del_end],
        ]

        new_del_end = del_end - 2 * shift_end_left
        assert chunks[new_del_end] == '</del> '
        del_end = new_del_end

        if new_del_start > 0 and not chunks[new_del_start - 1].endswith(' '):
            # Fix up case where the word before us didn't have a trailing space.
            chunks[new_del_start - 1] += ' '
        if new_del_end > 0 and chunks[new_del_end - 1].endswith(' '):
            # Move space outside of </del>.
            chunks[new_del_end - 1] = chunks[new_del_end - 1][:-1]
        """
        pos = del_start - shift_end_left
        # Move re-balanced start tags before the '<del>'.
        for i in range(del_start + 1, del_start + shift_start_right + 1):
            chunks[pos] = chunks[i]
            pos += 1
        if pos and not chunks[pos - 1].endswith(' '):
            # Fix up the case where the word before '<del>' didn't have a trailing space.
            chunks[pos - 1] += ' '
        chunks[pos] = '<del>'
        pos += 1
        # Copy only the balanced deleted content between '<del>' and '</del>'.
        for i in range(del_start + unbalanced_start + 1, del_end - unbalanced_end):
            chunks[pos] = chunks[i]
            pos += 1
        if chunks[pos - 1].endswith(' '):
            # Move trailing space outside of </del>.
            chunks[pos - 1] = chunks[pos - 1][:-1]
        chunks[pos] = '</del> '
        pos += 1
        # Move re-balanced end tags after the '</del>'.
        for i in range(del_end - shift_end_left, del_end):
            chunks[pos] = chunks[i]
            pos += 1
        # Adjust the length of the processed part in 'chunks'.
        del chunks[pos : del_end + shift_start_right + 1]
        start_pos = pos


@cython.cfunc
def mark_unbalanced(chunks) -> list:
    tag_stack = []
    marked = []

    chunk: str
    parents: list

    for chunk in chunks:
        if not chunk.startswith('<'):
            marked.append(('b', chunk))
            continue

        name = tag_name_of_chunk(chunk)
        if name in empty_tags:
            marked.append(('b', chunk))
            continue

        if chunk[1] == '/':
            # closing tag found, unwind tag stack
            while tag_stack:
                start_name, start_chunk, parents = tag_stack.pop()
                if start_name == name:
                    # balanced tag closing, keep rest of stack intact
                    parents.append(('b', start_chunk))
                    parents.extend(marked)
                    parents.append(('b', chunk))
                    marked = parents
                    chunk = None
                    break
                else:
                    # unmatched start tag
                    parents.append(('us', start_chunk))
                    parents.extend(marked)
                    marked = parents

            if chunk is not None:
                # unmatched end tag left after clearing the stack
                marked.append(('ue', chunk))
        else:
            # new start tag found
            tag_stack.append((name, chunk, marked))
            marked = []

    # add any unbalanced start tags
    while tag_stack:
        _, start_chunk, parents = tag_stack.pop()
        parents.append(('us', start_chunk))
        parents.extend(marked)
        marked = parents

    return marked


class token(str):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.

    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""

    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False

    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = str.__new__(cls, text)

        obj.pre_tags = pre_tags if pre_tags is not None else []
        obj.post_tags = post_tags if post_tags is not None else []
        obj.trailing_whitespace = trailing_whitespace

        return obj

    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (
            str.__repr__(self), self.pre_tags, self.post_tags, self.trailing_whitespace)

    def html(self):
        return str(self)

class tag_token(token):

    """ Represents a token that is actually a tag.  Currently this is just
    the <img> tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """

    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, f"{type}: {data}",
                            pre_tags=pre_tags,
                            post_tags=post_tags,
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj

    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag,
            self.data,
            self.html_repr,
            self.pre_tags,
            self.post_tags,
            self.trailing_whitespace)
    def html(self):
        return self.html_repr

class href_token(token):

    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """

    hide_when_equal = True

    def html(self):
        return ' Link: %s' % self


def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and returns token objects (words with attached tags).

    This parses only the content of a page; anything in the head is
    ignored, and the <head> and <body> elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particular bad).

    <ins> and <del> tags are also eliminated from the document, as
    that gets confusing.

    If include_hrefs is true, then the href attribute of <a> tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)


def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a <div> tag that was not in the original document.

    If cleanup is true, make sure there's no <head> or <body>, and get
    rid of any <ins> and <del> tags.
    """
    if cleanup:
        # This removes any extra markup or structure like <head>:
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)


_search_body = re.compile(r'<body.*?>', re.I|re.S).search
_search_end_body = re.compile(r'</body.*?>', re.I|re.S).search
_replace_ins_del = re.compile(r'</?(ins|del).*?>', re.I|re.S).sub

def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is any <body).
    Also <ins> and <del> tags are removed.  """
    match = _search_body(html)
    if match:
        html = html[match.end():]
    match = _search_end_body(html)
    if match:
        html = html[:match.start()]
    html = _replace_ins_del('', html)
    return html


def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]


def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)

            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue

        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)

        elif is_start_tag(chunk):
            tag_accum.append(chunk)

        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False

    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)

    return result


# All the tags in HTML that don't require end tags:
empty_tags = cython.declare(frozenset, defs.empty_tags)

block_level_tags = cython.declare(frozenset, frozenset([
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    ]))

block_level_container_tags = cython.declare(frozenset, frozenset([
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    ]))

any_block_level_tag = cython.declare(tuple, tuple(sorted(
    block_level_tags | block_level_container_tags))
)


def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.

    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        yield from flatten_el(child, include_hrefs=include_hrefs)
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)

_find_words = re.compile(r'\S+(?:\s+|$)', re.U).findall

def split_words(text):
    """ Splits some text into words. Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []

    words = _find_words(text)
    return words

_has_start_whitespace = re.compile(r'^[ \t\n\r]').match

def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    attributes = ''.join([
        f' {name}="{html_escape(value)}"'
        for name, value in el.attrib.items()
    ])
    return f'<{el.tag}{attributes}>'

def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    tail = el.tail
    extra = ' ' if tail and _has_start_whitespace(tail) else ''
    return f'</{el.tag}>{extra}'

def is_word(tok):
    return not tok.startswith('<')

def is_end_tag(tok):
    return tok.startswith('</')

def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')

def fixup_ins_del_tags(html):
    """ Given an html string, move any <ins> or <del> tags inside of any
    block-level elements, e.g. transform <ins><p>word</p></ins> to
    <p><ins>word</ins></p> """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html

def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the elements tail.

    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, str), (
        f"You should pass in an element, not a string like {el!r}")
    html = etree.tostring(el, method="html", encoding='unicode')
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html


@cython.cfunc
def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for el in list(doc.iter('ins', 'del')):
        if not _contains_block_level_tag(el):
            continue
        _move_el_inside_block(el, tag=el.tag)
        el.drop_tag()
        #_merge_element_contents(el)


@cython.cfunc
def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like <p>, <td>, etc.
    """
    for el in el.iter(*any_block_level_tag):
        return True
    return False


@cython.cfunc
def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
    and moves them inside any block-level tags.  """
    makeelement = el.makeelement
    for block_level_el in el.iter(*any_block_level_tag):
        if block_level_el is not el:
            break
    else:
        # No block-level tags in any child
        children_tag = makeelement(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(iter(el))
        el[:] = [children_tag]
        return

    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = makeelement(tag)
                tail_tag.text = child.tail
                child.tail = None
                child.addnext(tail_tag)
        else:
            child_tag = makeelement(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = makeelement(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)


def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
    <p>Hi there!</p>
    """
    parent = el.getparent()
    text = el.text
    tail = el.tail
    if tail:
        if not len(el):
            text = (text or '') + tail
        else:
            el[-1].tail = (el[-1].tail or '') + tail
    index = parent.index(el)
    if text:
        previous = el.getprevious()
        if previous is None:
            parent.text = (parent.text or '') + text
        else:
            previous.tail = (previous.tail or '') + text
    parent[index:index+1] = el.getchildren()


@cython.final
@cython.cclass
class InsensitiveSequenceMatcher(SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """

    threshold = 2

    @cython.cfunc
    def get_matching_blocks(self) -> list:
        size: cython.Py_ssize_t = min(len(self.b), len(self.b))
        threshold: cython.Py_ssize_t = self.threshold
        threshold = min(threshold, size // 4)
        actual = SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]


if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main()