lxml-5.3.2-cp38-cp38-win32.whl → lxml-6.0.0-cp38-cp38-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lxml/html/diff.py CHANGED
@@ -1,35 +1,74 @@
 # cython: language_level=3
 
+try:
+    import cython
+except ImportError:
+    class fake_cython:
+        compiled = False
+        def cfunc(self, func): return func
+        def cclass(self, func): return func
+        def declare(self, _, value): return value
+        def __getattr__(self, type_name): return "object"
+
+    cython = fake_cython()
+
+try:
+    from . import _difflib as difflib
+    import inspect
+    if inspect.isfunction(difflib.get_close_matches):
+        raise ImportError(
+            "Embedded difflib is not compiled to a fast binary, using the stdlib instead.")
+    from cython.cimports.lxml.html._difflib import SequenceMatcher
+except ImportError:
+    import difflib
+    if not cython.compiled:
+        from difflib import SequenceMatcher
+
+import itertools
+import functools
+import operator
+import re
 
-import difflib
 from lxml import etree
 from lxml.html import fragment_fromstring
-import re
+from . import defs
 
 __all__ = ['html_annotate', 'htmldiff']
 
-try:
-    from html import escape as html_escape
-except ImportError:
-    from cgi import escape as html_escape
-try:
-    _unicode = unicode
-except NameError:
-    # Python 3
-    _unicode = str
-try:
-    basestring
-except NameError:
-    # Python 3
-    basestring = str
+group_by_first_item = functools.partial(itertools.groupby, key=operator.itemgetter(0))
+
 
 ############################################################
 ## Annotation
 ############################################################
 
+@cython.cfunc
+def html_escape(text: str, _escapes: tuple = ('&amp;', '&lt;', '&gt;', '&quot;', '&#39;')) -> str:
+    # Not so slow compiled version of 'html.escape()'.
+    # Most of the time, we replace little to nothing, so use a fast decision what needs to be done.
+    ch: cython.Py_UCS4
+    replace: cython.char[5] = [False] * 5
+    for ch in text:
+        replace[0] |= ch == '&'
+        replace[1] |= ch == '<'
+        replace[2] |= ch == '>'
+        replace[3] |= ch == '"'
+        replace[4] |= ch == "'"
+
+    for i in range(5):
+        if replace[i]:
+            text = text.replace('&<>"\''[i], _escapes[i])
+
+    return text
+
+
+if not cython.compiled:
+    from html import escape as html_escape
+
+
 def default_markup(text, version):
     return '<span title="%s">%s</span>' % (
-        html_escape(_unicode(version), 1), text)
+        html_escape(version), text)
 
 def html_annotate(doclist, markup=default_markup):
     """
@@ -71,15 +110,15 @@ def html_annotate(doclist, markup=default_markup):
     result = markup_serialize_tokens(cur_tokens, markup)
     return ''.join(result).strip()
 
-def tokenize_annotated(doc, annotation):
+def tokenize_annotated(doc, annotation):
     """Tokenize a document and add an annotation attribute to each token
     """
     tokens = tokenize(doc, include_hrefs=False)
-    for tok in tokens:
+    for tok in tokens:
         tok.annotation = annotation
     return tokens
 
-def html_annotate_merge_annotations(tokens_old, tokens_new):
+def html_annotate_merge_annotations(tokens_old, tokens_new):
     """Merge the annotations from tokens_old into tokens_new, when the
     tokens in the new document already existed in the old document.
     """
@@ -87,52 +126,50 @@ def html_annotate_merge_annotations(tokens_old, tokens_new):
     commands = s.get_opcodes()
 
     for command, i1, i2, j1, j2 in commands:
-        if command == 'equal':
+        if command == 'equal':
             eq_old = tokens_old[i1:i2]
             eq_new = tokens_new[j1:j2]
             copy_annotations(eq_old, eq_new)
 
-def copy_annotations(src, dest):
+def copy_annotations(src, dest):
     """
     Copy annotations from the tokens listed in src to the tokens in dest
     """
     assert len(src) == len(dest)
-    for src_tok, dest_tok in zip(src, dest):
+    for src_tok, dest_tok in zip(src, dest):
         dest_tok.annotation = src_tok.annotation
 
 def compress_tokens(tokens):
     """
-    Combine adjacent tokens when there is no HTML between the tokens,
+    Combine adjacent tokens when there is no HTML between the tokens,
     and they share an annotation
     """
-    result = [tokens[0]]
-    for tok in tokens[1:]:
-        if (not result[-1].post_tags and
-            not tok.pre_tags and
-            result[-1].annotation == tok.annotation):
+    result = [tokens[0]]
+    for tok in tokens[1:]:
+        if (not tok.pre_tags and
+                not result[-1].post_tags and
+                result[-1].annotation == tok.annotation):
             compress_merge_back(result, tok)
-        else:
+        else:
             result.append(tok)
     return result
 
-def compress_merge_back(tokens, tok):
+@cython.cfunc
+def compress_merge_back(tokens: list, tok):
     """ Merge tok into the last element of tokens (modifying the list of
     tokens in-place). """
     last = tokens[-1]
-    if type(last) is not token or type(tok) is not token:
+    if type(last) is not token or type(tok) is not token:
         tokens.append(tok)
     else:
-        text = _unicode(last)
-        if last.trailing_whitespace:
-            text += last.trailing_whitespace
-        text += tok
+        text = last + last.trailing_whitespace + tok
         merged = token(text,
                        pre_tags=last.pre_tags,
                        post_tags=tok.post_tags,
                        trailing_whitespace=tok.trailing_whitespace)
         merged.annotation = last.annotation
         tokens[-1] = merged
-
+
 def markup_serialize_tokens(tokens, markup_func):
     """
     Serialize the list of tokens into a list of text chunks, calling
@@ -141,9 +178,7 @@ def markup_serialize_tokens(tokens, markup_func):
     for token in tokens:
         yield from token.pre_tags
         html = token.html()
-        html = markup_func(html, token.annotation)
-        if token.trailing_whitespace:
-            html += token.trailing_whitespace
+        html = markup_func(html, token.annotation) + token.trailing_whitespace
         yield html
         yield from token.post_tags
 
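
For orientation, the annotation machinery above is driven by the public html_annotate() entry point, which takes (document, version) pairs and marks each word with the version that introduced it. A typical call (a sketch; the default markup wraps each word in <span title="version">):

    from lxml.html.diff import html_annotate

    versions = [('<p>Hello world</p>', 'v1'),
                ('<p>Hello beautiful world</p>', 'v2')]
    annotated = html_annotate(versions)
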
@@ -160,7 +195,7 @@ def htmldiff(old_html, new_html):
     (i.e., no <html> tag).
 
     Returns HTML with <ins> and <del> tags added around the
-    appropriate text.
+    appropriate text.
 
     Markup is generally ignored, with the markup from new_html
     preserved, and possibly some markup from old_html (though it is
@@ -168,20 +203,25 @@ def htmldiff(old_html, new_html):
     words in the HTML are diffed. The exception is <img> tags, which
     are treated like words, and the href attribute of <a> tags, which
     are noted inside the tag itself when there are changes.
-    """
+    """
     old_html_tokens = tokenize(old_html)
     new_html_tokens = tokenize(new_html)
     result = htmldiff_tokens(old_html_tokens, new_html_tokens)
-    result = ''.join(result).strip()
+    try:
+        result = ''.join(result).strip()
+    except (ValueError, TypeError) as exc:
+        print(exc)
+        result = ''
     return fixup_ins_del_tags(result)
 
+
 def htmldiff_tokens(html1_tokens, html2_tokens):
     """ Does a diff on the tokens themselves, returning a list of text
     chunks (not tokens).
     """
     # There are several passes as we do the differences. The tokens
     # isolate the portion of the content we care to diff; difflib does
-    # all the actual hard work at that point.
+    # all the actual hard work at that point.
     #
     # Then we must create a valid document from pieces of both the old
     # document and the new document. We generally prefer to take
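
The public htmldiff() wrapper above tokenizes both documents and hands the tokens to htmldiff_tokens(). A minimal usage sketch; the exact output markup can differ between versions:

    from lxml.html.diff import htmldiff

    old = '<p>Coffee is great</p>'
    new = '<p>Tea is great</p>'
    # roughly: <p><ins>Tea</ins> <del>Coffee</del> is great</p>
    result = htmldiff(old, new)
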
@@ -205,14 +245,16 @@ def htmldiff_tokens(html1_tokens, html2_tokens):
         if command == 'delete' or command == 'replace':
             del_tokens = expand_tokens(html1_tokens[i1:i2])
             merge_delete(del_tokens, result)
+
     # If deletes were inserted directly as <del> then we'd have an
     # invalid document at this point. Instead we put in special
     # markers, and when the complete diffed document has been created
     # we try to move the deletes around and resolve any problems.
-    result = cleanup_delete(result)
+    cleanup_delete(result)
 
     return result
 
+
 def expand_tokens(tokens, equal=False):
     """Given a list of tokens, return a generator of the chunks of
     text for the data in the tokens.
@@ -220,31 +262,64 @@ def expand_tokens(tokens, equal=False):
     for token in tokens:
         yield from token.pre_tags
         if not equal or not token.hide_when_equal:
-            if token.trailing_whitespace:
-                yield token.html() + token.trailing_whitespace
-            else:
-                yield token.html()
+            yield token.html() + token.trailing_whitespace
         yield from token.post_tags
 
-def merge_insert(ins_chunks, doc):
+
+def merge_insert(ins_chunks, doc: list):
     """ doc is the already-handled document (as a list of text chunks);
     here we add <ins>ins_chunks</ins> to the end of that. """
-    # Though we don't throw away unbalanced_start or unbalanced_end
+    # Though we don't throw away unbalanced start/end tags
     # (we assume there is accompanying markup later or earlier in the
     # document), we only put <ins> around the balanced portion.
-    unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
-    doc.extend(unbalanced_start)
-    if doc and not doc[-1].endswith(' '):
-        # Fix up the case where the word before the insert didn't end with
-        # a space
-        doc[-1] += ' '
-    doc.append('<ins>')
-    if balanced and balanced[-1].endswith(' '):
-        # We move space outside of </ins>
-        balanced[-1] = balanced[-1][:-1]
-    doc.extend(balanced)
-    doc.append('</ins> ')
-    doc.extend(unbalanced_end)
+
+    # Legacy note: We make a choice here. Originally, we merged all sequences of
+    # unbalanced tags together into separate start and end tag groups. Now, we look at
+    # each sequence separately, leading to more fine-grained diffs but different
+    # tag structure than before.
+
+    item: tuple
+    for balanced, marked_chunks in group_by_first_item(mark_unbalanced(ins_chunks)):
+        chunks = [item[1] for item in marked_chunks]
+        if balanced == 'b':
+            if doc and not doc[-1].endswith(' '):
+                # Fix up the case where the word before the insert didn't end with a space.
+                doc[-1] += ' '
+            doc.append('<ins>')
+            doc.extend(chunks)
+            if doc[-1].endswith(' '):
+                # We move space outside of </ins>.
+                doc[-1] = doc[-1][:-1]
+            doc.append('</ins> ')
+        else:
+            # unmatched start or end
+            doc.extend(chunks)
+
+
+@cython.cfunc
+def tag_name_of_chunk(chunk: str) -> str:
+    i: cython.Py_ssize_t
+    ch: cython.Py_UCS4
+
+    if chunk[0] != '<':
+        return ""
+
+    start_pos = 1
+    for i, ch in enumerate(chunk):
+        if ch == '/':
+            start_pos = 2
+        elif ch == '>':
+            return chunk[start_pos:i]
+        elif ch.isspace():
+            return chunk[start_pos:i]
+
+    return chunk[start_pos:]
+
+if not cython.compiled:
+    # Avoid performance regression in Python due to string iteration.
+    def tag_name_of_chunk(chunk: str) -> str:
+        return chunk.split(None, 1)[0].strip('<>/')
+
 
 # These are sentinels to represent the start and end of a <del>
 # segment, until we do the cleanup phase to turn them into proper
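
Both variants of tag_name_of_chunk() extract the bare tag name from a serialized chunk (the callers only pass chunks that start with '<'). Expected behaviour, as a sketch:

    tag_name_of_chunk('<div class="x">')   # -> 'div'
    tag_name_of_chunk('</div>')            # -> 'div'
    tag_name_of_chunk('<ins>')             # -> 'ins'
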
@@ -254,19 +329,18 @@ class DEL_START:
 class DEL_END:
     pass
 
-class NoDeletes(Exception):
-    """ Raised when the document no longer contains any pending deletes
-    (DEL_START/DEL_END) """
 
-def merge_delete(del_chunks, doc):
+def merge_delete(del_chunks, doc: list):
     """ Adds the text chunks in del_chunks to the document doc (another
     list of text chunks) with marker to show it is a delete.
     cleanup_delete later resolves these markers into <del> tags."""
+
     doc.append(DEL_START)
     doc.extend(del_chunks)
     doc.append(DEL_END)
 
-def cleanup_delete(chunks):
+
+def cleanup_delete(chunks: list):
     """ Cleans up any DEL_START/DEL_END markers in the document, replacing
     them with <del></del>. To do this while keeping the document
     valid, it may need to drop some tags (either start or end tags).
@@ -274,166 +348,192 @@ def cleanup_delete(chunks):
     It may also move the del into adjacent tags to try to move it to a
     similar location where it was originally located (e.g., moving a
     delete into preceding <div> tag, if the del looks like (DEL_START,
-    'Text</div>', DEL_END)"""
+    'Text</div>', DEL_END)
+    """
+    chunk_count = len(chunks)
+
+    i: cython.Py_ssize_t
+    del_start: cython.Py_ssize_t
+    del_end: cython.Py_ssize_t
+    shift_start_right: cython.Py_ssize_t
+    shift_end_left: cython.Py_ssize_t
+    unbalanced_start: cython.Py_ssize_t
+    unbalanced_end: cython.Py_ssize_t
+    pos: cython.Py_ssize_t
+    start_pos: cython.Py_ssize_t
+    chunk: str
+
+    start_pos = 0
     while 1:
         # Find a pending DEL_START/DEL_END, splitting the document
         # into stuff-preceding-DEL_START, stuff-inside, and
         # stuff-following-DEL_END
         try:
-            pre_delete, delete, post_delete = split_delete(chunks)
-        except NoDeletes:
+            del_start = chunks.index(DEL_START, start_pos)
+        except ValueError:
             # Nothing found, we've cleaned up the entire doc
             break
-        # The stuff-inside-DEL_START/END may not be well balanced
-        # markup. First we figure out what unbalanced portions there are:
-        unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
-        # Then we move the span forward and/or backward based on these
-        # unbalanced portions:
-        locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
-        locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
-        doc = pre_delete
-        if doc and not doc[-1].endswith(' '):
-            # Fix up case where the word before us didn't have a trailing space
-            doc[-1] += ' '
-        doc.append('<del>')
-        if balanced and balanced[-1].endswith(' '):
-            # We move space outside of </del>
-            balanced[-1] = balanced[-1][:-1]
-        doc.extend(balanced)
-        doc.append('</del> ')
-        doc.extend(post_delete)
-        chunks = doc
-    return chunks
-
-def split_unbalanced(chunks):
-    """Return (unbalanced_start, balanced, unbalanced_end), where each is
-    a list of text and tag chunks.
-
-    unbalanced_start is a list of all the tags that are opened, but
-    not closed in this span. Similarly, unbalanced_end is a list of
-    tags that are closed but were not opened. Extracting these might
-    mean some reordering of the chunks."""
-    start = []
-    end = []
+        else:
+            del_end = chunks.index(DEL_END, del_start + 1)
+
+        shift_end_left = shift_start_right = 0
+        unbalanced_start = unbalanced_end = 0
+        deleted_chunks = mark_unbalanced(chunks[del_start+1:del_end])
+
+        # For unbalanced start tags at the beginning, find matching (non-deleted)
+        # end tags after the current DEL_END and move the start tag outside.
+        for balanced, del_chunk in deleted_chunks:
+            if balanced != 'us':
+                break
+            unbalanced_start += 1
+            unbalanced_start_name = tag_name_of_chunk(del_chunk)
+            for i in range(del_end+1, chunk_count):
+                if chunks[i] is DEL_START:
+                    break
+                chunk = chunks[i]
+                if chunk[0] != '<' or chunk[1] == '/':
+                    # Reached a word or closing tag.
+                    break
+                name = tag_name_of_chunk(chunk)
+                if name == 'ins':
+                    # Cannot move into an insert.
+                    break
+                assert name != 'del', f"Unexpected delete tag: {chunk!r}"
+                if name != unbalanced_start_name:
+                    # Avoid mixing in other start tags.
+                    break
+                # Exclude start tag to balance the end tag.
+                shift_start_right += 1
+
+        # For unbalanced end tags at the end, find matching (non-deleted)
+        # start tags before the current DEL_START and move the end tag outside.
+        for balanced, del_chunk in reversed(deleted_chunks):
+            if balanced != 'ue':
+                break
+            unbalanced_end += 1
+            unbalanced_end_name = tag_name_of_chunk(del_chunk)
+            for i in range(del_start - 1, -1, -1):
+                if chunks[i] is DEL_END:
+                    break
+                chunk = chunks[i]
+                if chunk[0] == '<' and chunk[1] != '/':
+                    # Reached an opening tag, can we go further? Maybe not...
+                    break
+                name = tag_name_of_chunk(chunk)
+                if name == 'ins' or name == 'del':
+                    # Cannot move into an insert or delete.
+                    break
+                if name != unbalanced_end_name:
+                    # Avoid mixing in other start tags.
+                    break
+                # Exclude end tag to balance the start tag.
+                shift_end_left += 1
+
+        """
+        # This is what we do below in loops, spelled out using slicing and list copying:
+
+        chunks[del_start - shift_end_left : del_end + shift_start_right + 1] = [
+            *chunks[del_start + 1: del_start + shift_start_right + 1],
+            '<del>',
+            *chunks[del_start + unbalanced_start + 1 : del_end - unbalanced_end],
+            '</del> ',
+            *chunks[del_end - shift_end_left: del_end],
+        ]
+
+        new_del_end = del_end - 2 * shift_end_left
+        assert chunks[new_del_end] == '</del> '
+        del_end = new_del_end
+
+        if new_del_start > 0 and not chunks[new_del_start - 1].endswith(' '):
+            # Fix up case where the word before us didn't have a trailing space.
+            chunks[new_del_start - 1] += ' '
+        if new_del_end > 0 and chunks[new_del_end - 1].endswith(' '):
+            # Move space outside of </del>.
+            chunks[new_del_end - 1] = chunks[new_del_end - 1][:-1]
+        """
+        pos = del_start - shift_end_left
+        # Move re-balanced start tags before the '<del>'.
+        for i in range(del_start + 1, del_start + shift_start_right + 1):
+            chunks[pos] = chunks[i]
+            pos += 1
+        if pos and not chunks[pos - 1].endswith(' '):
+            # Fix up the case where the word before '<del>' didn't have a trailing space.
+            chunks[pos - 1] += ' '
+        chunks[pos] = '<del>'
+        pos += 1
+        # Copy only the balanced deleted content between '<del>' and '</del>'.
+        for i in range(del_start + unbalanced_start + 1, del_end - unbalanced_end):
+            chunks[pos] = chunks[i]
+            pos += 1
+        if chunks[pos - 1].endswith(' '):
+            # Move trailing space outside of </del>.
+            chunks[pos - 1] = chunks[pos - 1][:-1]
+        chunks[pos] = '</del> '
+        pos += 1
+        # Move re-balanced end tags after the '</del>'.
+        for i in range(del_end - shift_end_left, del_end):
+            chunks[pos] = chunks[i]
+            pos += 1
+        # Adjust the length of the processed part in 'chunks'.
+        del chunks[pos : del_end + shift_start_right + 1]
+        start_pos = pos
+
+
+@cython.cfunc
+def mark_unbalanced(chunks) -> list:
     tag_stack = []
-    balanced = []
+    marked = []
+
+    chunk: str
+    parents: list
+
     for chunk in chunks:
         if not chunk.startswith('<'):
-            balanced.append(chunk)
+            marked.append(('b', chunk))
             continue
-        endtag = chunk[1] == '/'
-        name = chunk.split()[0].strip('<>/')
+
+        name = tag_name_of_chunk(chunk)
         if name in empty_tags:
-            balanced.append(chunk)
+            marked.append(('b', chunk))
             continue
-        if endtag:
-            if tag_stack and tag_stack[-1][0] == name:
-                balanced.append(chunk)
-                name, pos, tag = tag_stack.pop()
-                balanced[pos] = tag
-            elif tag_stack:
-                start.extend([tag for name, pos, tag in tag_stack])
-                tag_stack = []
-                end.append(chunk)
-            else:
-                end.append(chunk)
-        else:
-            tag_stack.append((name, len(balanced), chunk))
-            balanced.append(None)
-    start.extend(
-        [chunk for name, pos, chunk in tag_stack])
-    balanced = [chunk for chunk in balanced if chunk is not None]
-    return start, balanced, end
-
-def split_delete(chunks):
-    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
-    stuff_after_DEL_END). Returns the first case found (there may be
-    more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
-    there's no DEL_START found. """
-    try:
-        pos = chunks.index(DEL_START)
-    except ValueError:
-        raise NoDeletes
-    pos2 = chunks.index(DEL_END)
-    return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
-
-def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
-    """ pre_delete and post_delete implicitly point to a place in the
-    document (where the two were split). This moves that point (by
-    popping items from one and pushing them onto the other). It moves
-    the point to try to find a place where unbalanced_start applies.
-
-    As an example::
-
-        >>> unbalanced_start = ['<div>']
-        >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
-        >>> pre, post = doc[:3], doc[3:]
-        >>> pre, post
-        (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
-        >>> locate_unbalanced_start(unbalanced_start, pre, post)
-        >>> pre, post
-        (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
-
-    As you can see, we moved the point so that the dangling <div> that
-    we found will be effectively replaced by the div in the original
-    document. If this doesn't work out, we just throw away
-    unbalanced_start without doing anything.
-    """
-    while 1:
-        if not unbalanced_start:
-            # We have totally succeeded in finding the position
-            break
-        finding = unbalanced_start[0]
-        finding_name = finding.split()[0].strip('<>')
-        if not post_delete:
-            break
-        next = post_delete[0]
-        if next is DEL_START or not next.startswith('<'):
-            # Reached a word, we can't move the delete text forward
-            break
-        if next[1] == '/':
-            # Reached a closing tag, can we go further? Maybe not...
-            break
-        name = next.split()[0].strip('<>')
-        if name == 'ins':
-            # Can't move into an insert
-            break
-        assert name != 'del', (
-            "Unexpected delete tag: %r" % next)
-        if name == finding_name:
-            unbalanced_start.pop(0)
-            pre_delete.append(post_delete.pop(0))
-        else:
-            # Found a tag that doesn't match
-            break
 
-def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
-    """ like locate_unbalanced_start, except handling end tags and
-    possibly moving the point earlier in the document. """
-    while 1:
-        if not unbalanced_end:
-            # Success
-            break
-        finding = unbalanced_end[-1]
-        finding_name = finding.split()[0].strip('<>/')
-        if not pre_delete:
-            break
-        next = pre_delete[-1]
-        if next is DEL_END or not next.startswith('</'):
-            # A word or a start tag
-            break
-        name = next.split()[0].strip('<>/')
-        if name == 'ins' or name == 'del':
-            # Can't move into an insert or delete
-            break
-        if name == finding_name:
-            unbalanced_end.pop()
-            post_delete.insert(0, pre_delete.pop())
+        if chunk[1] == '/':
+            # closing tag found, unwind tag stack
+            while tag_stack:
+                start_name, start_chunk, parents = tag_stack.pop()
+                if start_name == name:
+                    # balanced tag closing, keep rest of stack intact
+                    parents.append(('b', start_chunk))
+                    parents.extend(marked)
+                    parents.append(('b', chunk))
+                    marked = parents
+                    chunk = None
+                    break
+                else:
+                    # unmatched start tag
+                    parents.append(('us', start_chunk))
+                    parents.extend(marked)
+                    marked = parents
+
+            if chunk is not None:
+                # unmatched end tag left after clearing the stack
+                marked.append(('ue', chunk))
         else:
-            # Found a tag that doesn't match
-            break
+            # new start tag found
+            tag_stack.append((name, chunk, marked))
+            marked = []
 
-class token(_unicode):
+    # add any unbalanced start tags
+    while tag_stack:
+        _, start_chunk, parents = tag_stack.pop()
+        parents.append(('us', start_chunk))
+        parents.extend(marked)
+        marked = parents
+
+    return marked
+
+
+class token(str):
     """ Represents a diffable token, generally a word that is displayed to
     the user. Opening tags are attached to this token when they are
     adjacent (pre_tags) and closing tags that follow the word
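
mark_unbalanced() labels every chunk with 'b' (balanced), 'us' (unmatched start tag) or 'ue' (unmatched end tag); merge_insert() and cleanup_delete() above branch on these labels. A small hand-traced example:

    mark_unbalanced(['<b>', 'word', '</b>', '</i>'])
    # -> [('b', '<b>'), ('b', 'word'), ('b', '</b>'), ('ue', '</i>')]
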
@@ -451,28 +551,20 @@ class token(_unicode):
     hide_when_equal = False
 
     def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
-        obj = _unicode.__new__(cls, text)
-
-        if pre_tags is not None:
-            obj.pre_tags = pre_tags
-        else:
-            obj.pre_tags = []
-
-        if post_tags is not None:
-            obj.post_tags = post_tags
-        else:
-            obj.post_tags = []
+        obj = str.__new__(cls, text)
 
+        obj.pre_tags = pre_tags if pre_tags is not None else []
+        obj.post_tags = post_tags if post_tags is not None else []
         obj.trailing_whitespace = trailing_whitespace
 
         return obj
 
     def __repr__(self):
-        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
-                                          self.post_tags, self.trailing_whitespace)
+        return 'token(%s, %r, %r, %r)' % (
+            str.__repr__(self), self.pre_tags, self.post_tags, self.trailing_whitespace)
 
     def html(self):
-        return _unicode(self)
+        return str(self)
 
 class tag_token(token):
 
@@ -480,11 +572,11 @@ class tag_token(token):
     the <img> tag, which takes up visible space just like a word but
     is only represented in a document by a tag. """
 
-    def __new__(cls, tag, data, html_repr, pre_tags=None,
+    def __new__(cls, tag, data, html_repr, pre_tags=None,
                 post_tags=None, trailing_whitespace=""):
-        obj = token.__new__(cls, "%s: %s" % (type, data),
-                            pre_tags=pre_tags,
-                            post_tags=post_tags,
+        obj = token.__new__(cls, f"{type}: {data}",
+                            pre_tags=pre_tags,
+                            post_tags=post_tags,
                             trailing_whitespace=trailing_whitespace)
         obj.tag = tag
         obj.data = data
@@ -493,11 +585,11 @@ class tag_token(token):
 
     def __repr__(self):
         return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
-            self.tag,
-            self.data,
-            self.html_repr,
-            self.pre_tags,
-            self.post_tags,
+            self.tag,
+            self.data,
+            self.html_repr,
+            self.pre_tags,
+            self.post_tags,
             self.trailing_whitespace)
     def html(self):
         return self.html_repr
@@ -512,6 +604,7 @@ class href_token(token):
     def html(self):
         return ' Link: %s' % self
 
+
 def tokenize(html, include_hrefs=True):
     """
     Parse the given HTML and returns token objects (words with attached tags).
@@ -536,6 +629,7 @@ def tokenize(html, include_hrefs=True):
     # Finally re-joining them into token objects:
     return fixup_chunks(chunks)
 
+
 def parse_html(html, cleanup=True):
     """
     Parses an HTML fragment, returning an lxml element. Note that the HTML will be
@@ -549,25 +643,24 @@ def parse_html(html, cleanup=True):
         html = cleanup_html(html)
     return fragment_fromstring(html, create_parent=True)
 
-_body_re = re.compile(r'<body.*?>', re.I|re.S)
-_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
-_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
+
+_search_body = re.compile(r'<body.*?>', re.I|re.S).search
+_search_end_body = re.compile(r'</body.*?>', re.I|re.S).search
+_replace_ins_del = re.compile(r'</?(ins|del).*?>', re.I|re.S).sub
 
 def cleanup_html(html):
     """ This 'cleans' the HTML, meaning that any page structure is removed
     (only the contents of <body> are used, if there is any <body).
     Also <ins> and <del> tags are removed. """
-    match = _body_re.search(html)
+    match = _search_body(html)
     if match:
         html = html[match.end():]
-    match = _end_body_re.search(html)
+    match = _search_end_body(html)
     if match:
         html = html[:match.start()]
-    html = _ins_del_re.sub('', html)
+    html = _replace_ins_del('', html)
     return html
-
 
-end_whitespace_re = re.compile(r'[ \t\n\r]$')
 
 def split_trailing_whitespace(word):
     """
@@ -631,11 +724,9 @@ def fixup_chunks(chunks):
 
 
 # All the tags in HTML that don't require end tags:
-empty_tags = (
-    'param', 'img', 'area', 'br', 'basefont', 'input',
-    'base', 'meta', 'link', 'col')
+empty_tags = cython.declare(frozenset, defs.empty_tags)
 
-block_level_tags = (
+block_level_tags = cython.declare(frozenset, frozenset([
     'address',
     'blockquote',
     'center',
@@ -660,9 +751,9 @@ block_level_tags = (
     'pre',
     'table',
     'ul',
-    )
+    ]))
 
-block_level_container_tags = (
+block_level_container_tags = cython.declare(frozenset, frozenset([
     'dd',
     'dt',
     'frameset',
@@ -673,7 +764,11 @@ block_level_container_tags = (
     'th',
     'thead',
     'tr',
-    )
+    ]))
+
+any_block_level_tag = cython.declare(tuple, tuple(sorted(
+    block_level_tags | block_level_container_tags))
+)
 
 
 def flatten_el(el, include_hrefs, skip_tag=False):
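
cython.declare(frozenset, ...) types these module-level constants for the compiled build; under the fake_cython shim defined at the top of the file, declare() simply returns its second argument, so plain Python sees ordinary frozensets. A sketch of the fallback behaviour:

    class fake_cython:
        def declare(self, _, value): return value

    cython = fake_cython()
    tags = cython.declare(frozenset, frozenset({'br', 'img'}))
    assert tags == frozenset({'br', 'img'})  # unchanged in pure Python
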
@@ -703,7 +798,7 @@ def flatten_el(el, include_hrefs, skip_tag=False):
     for word in end_words:
         yield html_escape(word)
 
-split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
+_find_words = re.compile(r'\S+(?:\s+|$)', re.U).findall
 
 def split_words(text):
     """ Splits some text into words. Includes trailing whitespace
@@ -711,27 +806,27 @@ def split_words(text):
     if not text or not text.strip():
         return []
 
-    words = split_words_re.findall(text)
+    words = _find_words(text)
     return words
 
-start_whitespace_re = re.compile(r'^[ \t\n\r]')
+_has_start_whitespace = re.compile(r'^[ \t\n\r]').match
 
 def start_tag(el):
     """
     The text representation of the start tag for a tag.
     """
-    return '<%s%s>' % (
-        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
-                         for name, value in el.attrib.items()]))
+    attributes = ''.join([
+        f' {name}="{html_escape(value)}"'
+        for name, value in el.attrib.items()
+    ])
+    return f'<{el.tag}{attributes}>'
 
 def end_tag(el):
     """ The text representation of an end tag for a tag. Includes
     trailing whitespace when appropriate. """
-    if el.tail and start_whitespace_re.search(el.tail):
-        extra = ' '
-    else:
-        extra = ''
-    return '</%s>%s' % (el.tag, extra)
+    tail = el.tail
+    extra = ' ' if tail and _has_start_whitespace(tail) else ''
+    return f'</{el.tag}>{extra}'
 
 def is_word(tok):
     return not tok.startswith('<')
@@ -753,13 +848,13 @@ def fixup_ins_del_tags(html):
 
 def serialize_html_fragment(el, skip_outer=False):
     """ Serialize a single lxml element as HTML. The serialized form
-    includes the elements tail.
+    includes the elements tail.
 
     If skip_outer is true, then don't serialize the outermost tag
     """
-    assert not isinstance(el, basestring), (
-        "You should pass in an element, not a string like %r" % el)
-    html = etree.tostring(el, method="html", encoding=_unicode)
+    assert not isinstance(el, str), (
+        f"You should pass in an element, not a string like {el!r}")
+    html = etree.tostring(el, method="html", encoding='unicode')
     if skip_outer:
         # Get rid of the extra starting tag:
         html = html[html.find('>')+1:]
@@ -769,59 +864,64 @@ def serialize_html_fragment(el, skip_outer=False):
     else:
         return html
 
+
+@cython.cfunc
 def _fixup_ins_del_tags(doc):
     """fixup_ins_del_tags that works on an lxml document in-place
     """
-    for tag in ['ins', 'del']:
-        for el in doc.xpath('descendant-or-self::%s' % tag):
-            if not _contains_block_level_tag(el):
-                continue
-            _move_el_inside_block(el, tag=tag)
-            el.drop_tag()
-            #_merge_element_contents(el)
+    for el in list(doc.iter('ins', 'del')):
+        if not _contains_block_level_tag(el):
+            continue
+        _move_el_inside_block(el, tag=el.tag)
+        el.drop_tag()
+        #_merge_element_contents(el)
+
 
+@cython.cfunc
 def _contains_block_level_tag(el):
     """True if the element contains any block-level elements, like <p>, <td>, etc.
     """
-    if el.tag in block_level_tags or el.tag in block_level_container_tags:
+    for el in el.iter(*any_block_level_tag):
         return True
-    for child in el:
-        if _contains_block_level_tag(child):
-            return True
     return False
 
+
+@cython.cfunc
 def _move_el_inside_block(el, tag):
     """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
     and moves them inside any block-level tags. """
-    for child in el:
-        if _contains_block_level_tag(child):
+    makeelement = el.makeelement
+    for block_level_el in el.iter(*any_block_level_tag):
+        if block_level_el is not el:
             break
     else:
         # No block-level tags in any child
-        children_tag = etree.Element(tag)
+        children_tag = makeelement(tag)
         children_tag.text = el.text
         el.text = None
-        children_tag.extend(list(el))
+        children_tag.extend(iter(el))
         el[:] = [children_tag]
         return
+
     for child in list(el):
         if _contains_block_level_tag(child):
             _move_el_inside_block(child, tag)
             if child.tail:
-                tail_tag = etree.Element(tag)
+                tail_tag = makeelement(tag)
                 tail_tag.text = child.tail
                 child.tail = None
-                el.insert(el.index(child)+1, tail_tag)
+                child.addnext(tail_tag)
         else:
-            child_tag = etree.Element(tag)
+            child_tag = makeelement(tag)
             el.replace(child, child_tag)
             child_tag.append(child)
     if el.text:
-        text_tag = etree.Element(tag)
+        text_tag = makeelement(tag)
         text_tag.text = el.text
         el.text = None
         el.insert(0, text_tag)
-
+
+
 
 def _merge_element_contents(el):
     """
     Removes an element, but merges its contents into its place, e.g.,
@@ -829,50 +929,44 @@ def _merge_element_contents(el):
     <p>Hi there!</p>
     """
     parent = el.getparent()
-    text = el.text or ''
-    if el.tail:
+    text = el.text
+    tail = el.tail
+    if tail:
         if not len(el):
-            text += el.tail
+            text = (text or '') + tail
         else:
-            if el[-1].tail:
-                el[-1].tail += el.tail
-            else:
-                el[-1].tail = el.tail
+            el[-1].tail = (el[-1].tail or '') + tail
     index = parent.index(el)
     if text:
-        if index == 0:
-            previous = None
-        else:
-            previous = parent[index-1]
+        previous = el.getprevious()
         if previous is None:
-            if parent.text:
-                parent.text += text
-            else:
-                parent.text = text
+            parent.text = (parent.text or '') + text
         else:
-            if previous.tail:
-                previous.tail += text
-            else:
-                previous.tail = text
+            previous.tail = (previous.tail or '') + text
     parent[index:index+1] = el.getchildren()
 
-class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
+
+@cython.final
+@cython.cclass
+class InsensitiveSequenceMatcher(SequenceMatcher):
     """
     Acts like SequenceMatcher, but tries not to find very small equal
     blocks amidst large spans of changes
     """
 
     threshold = 2
-
-    def get_matching_blocks(self):
-        size = min(len(self.b), len(self.b))
-        threshold = min(self.threshold, size / 4)
-        actual = difflib.SequenceMatcher.get_matching_blocks(self)
+
+    @cython.cfunc
+    def get_matching_blocks(self) -> list:
+        size: cython.Py_ssize_t = min(len(self.b), len(self.b))
+        threshold: cython.Py_ssize_t = self.threshold
+        threshold = min(threshold, size // 4)
+        actual = SequenceMatcher.get_matching_blocks(self)
         return [item for item in actual
                 if item[2] > threshold
                 or not item[2]]
 
+
 if __name__ == '__main__':
     from lxml.html import _diffcommand
     _diffcommand.main()
-
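
For context, the threshold filter in get_matching_blocks() drops very small matching blocks so that a run of changed words is not fragmented by an accidental one-word match. The effect, sketched with the stdlib matcher (illustrative only):

    import difflib

    a = 'one two three four'.split()
    b = 'uno dos three quatro'.split()
    blocks = difflib.SequenceMatcher(None, a, b).get_matching_blocks()
    # keep only blocks longer than the threshold, plus the zero-length sentinel
    kept = [m for m in blocks if m.size > 2 or not m.size]
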