pdoc 14.5.1__py3-none-any.whl → 14.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  # fmt: off
2
2
  # flake8: noqa
3
3
  # type: ignore
4
- # Taken from here: https://github.com/trentm/python-markdown2/blob/bce3f18ed86a19b418c8114a712bb6fee790c4c2/lib/markdown2.py
4
+ # Taken from here: https://github.com/trentm/python-markdown2/blob/8d3a65bc7d4f8b64af89f668eb6c60841dc0578c/lib/markdown2.py
5
5
 
6
6
  #!/usr/bin/env python
7
7
  # Copyright (c) 2012 Trent Mick.
@@ -46,7 +46,11 @@ Supported extra syntax options (see -x|--extras option below and
46
46
  see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
47
47
 
48
48
  * admonitions: Enable parsing of RST admonitions.
49
- * break-on-newline: Replace single new line characters with <br> when True
49
+ * breaks: Control where hard breaks are inserted in the markdown.
50
+ Options include:
51
+ - on_newline: Replace single new line characters with <br> when True
52
+ - on_backslash: Replace backslashes at the end of a line with <br>
53
+ * break-on-newline: Alias for the on_newline option in the breaks extra.
50
54
  * code-friendly: Disable _ and __ for em and strong.
51
55
  * cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
52
56
  * fenced-code-blocks: Allows a code block to not have to be indented
@@ -71,6 +75,9 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
71
75
  some limitations.
72
76
  * metadata: Extract metadata from a leading '---'-fenced block.
73
77
  See <https://github.com/trentm/python-markdown2/issues/77> for details.
78
+ * middle-word-em: Allows or disallows emphasis syntax in the middle of words,
79
+ defaulting to allow. Disabling this means that `this_text_here` will not be
80
+ converted to `this<em>text</em>here`.
74
81
  * nofollow: Add `rel="nofollow"` to all `<a>` tags with an href. See
75
82
  <http://en.wikipedia.org/wiki/Nofollow>.
76
83
  * numbering: Support of generic counters. Non standard extension to
@@ -104,7 +111,7 @@ see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
104
111
  # not yet sure if there are implications with this. Compare 'pydoc sre'
105
112
  # and 'perldoc perlre'.
106
113
 
107
- __version_info__ = (2, 4, 9)
114
+ __version_info__ = (2, 5, 1)
108
115
  __version__ = '.'.join(map(str, __version_info__))
109
116
  __author__ = "Trent Mick"
110
117
 
@@ -113,9 +120,24 @@ import codecs
113
120
  import logging
114
121
  import re
115
122
  import sys
116
- from collections import defaultdict
123
+ from collections import defaultdict, OrderedDict
124
+ from abc import ABC, abstractmethod
125
+ import functools
117
126
  from hashlib import sha256
118
127
  from random import randint, random
128
+ from typing import Any, Callable, Collection, Dict, List, Literal, Optional, Tuple, Type, TypedDict, Union
129
+ from enum import IntEnum, auto
130
+
131
+ if sys.version_info[1] < 9:
132
+ from typing import Iterable
133
+ else:
134
+ from collections.abc import Iterable
135
+
136
+ # ---- type defs
137
+ _safe_mode = Literal['replace', 'escape']
138
+ _extras_dict = Dict[str, Any]
139
+ _extras_param = Union[List[str], _extras_dict]
140
+ _link_patterns = Iterable[Tuple[re.Pattern, Union[str, Callable[[re.Match], str]]]]
119
141
 
120
142
  # ---- globals
121
143
 
@@ -128,7 +150,7 @@ DEFAULT_TAB_WIDTH = 4
128
150
  SECRET_SALT = bytes(randint(0, 1000000))
129
151
  # MD5 function was previously used for this; the "md5" prefix was kept for
130
152
  # backwards compatibility.
131
- def _hash_text(s):
153
+ def _hash_text(s: str) -> str:
132
154
  return 'md5-' + sha256(SECRET_SALT + s.encode("utf-8")).hexdigest()[32:]
133
155
 
134
156
  # Table of hash values for escaped characters:
@@ -147,11 +169,18 @@ class MarkdownError(Exception):
147
169
 
148
170
  # ---- public api
149
171
 
150
- def markdown_path(path, encoding="utf-8",
151
- html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
152
- safe_mode=None, extras=None, link_patterns=None,
153
- footnote_title=None, footnote_return_symbol=None,
154
- use_file_vars=False):
172
+ def markdown_path(
173
+ path: str,
174
+ encoding: str = "utf-8",
175
+ html4tags: bool = False,
176
+ tab_width: int = DEFAULT_TAB_WIDTH,
177
+ safe_mode: Optional[_safe_mode] = None,
178
+ extras: Optional[_extras_param] = None,
179
+ link_patterns: Optional[_link_patterns] = None,
180
+ footnote_title: Optional[str] = None,
181
+ footnote_return_symbol: Optional[str] = None,
182
+ use_file_vars: bool = False
183
+ ) -> 'UnicodeWithAttrs':
155
184
  fp = codecs.open(path, 'r', encoding)
156
185
  text = fp.read()
157
186
  fp.close()
@@ -163,10 +192,18 @@ def markdown_path(path, encoding="utf-8",
163
192
  use_file_vars=use_file_vars).convert(text)
164
193
 
165
194
 
166
- def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
167
- safe_mode=None, extras=None, link_patterns=None,
168
- footnote_title=None, footnote_return_symbol=None,
169
- use_file_vars=False, cli=False):
195
+ def markdown(
196
+ text: str,
197
+ html4tags: bool = False,
198
+ tab_width: int = DEFAULT_TAB_WIDTH,
199
+ safe_mode: Optional[_safe_mode] = None,
200
+ extras: Optional[_extras_param] = None,
201
+ link_patterns: Optional[_link_patterns] = None,
202
+ footnote_title: Optional[str] = None,
203
+ footnote_return_symbol: Optional[str] =None,
204
+ use_file_vars: bool = False,
205
+ cli: bool = False
206
+ ) -> 'UnicodeWithAttrs':
170
207
  return Markdown(html4tags=html4tags, tab_width=tab_width,
171
208
  safe_mode=safe_mode, extras=extras,
172
209
  link_patterns=link_patterns,
@@ -175,6 +212,66 @@ def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
175
212
  use_file_vars=use_file_vars, cli=cli).convert(text)
176
213
 
177
214
 
215
+ class Stage(IntEnum):
216
+ PREPROCESS = auto()
217
+ HASH_HTML = auto()
218
+ LINK_DEFS = auto()
219
+
220
+ BLOCK_GAMUT = auto()
221
+ HEADERS = auto()
222
+ LISTS = auto()
223
+ CODE_BLOCKS = auto()
224
+ BLOCK_QUOTES = auto()
225
+ PARAGRAPHS = auto()
226
+
227
+ SPAN_GAMUT = auto()
228
+ CODE_SPANS = auto()
229
+ ESCAPE_SPECIAL = auto()
230
+ LINKS = auto() # and auto links
231
+ ITALIC_AND_BOLD = auto()
232
+
233
+ POSTPROCESS = auto()
234
+ UNHASH_HTML = auto()
235
+
236
+
237
+ def mark_stage(stage: Stage):
238
+ '''
239
+ Decorator that handles executing relevant `Extra`s before and after this `Stage` executes.
240
+ '''
241
+ def wrapper(func):
242
+ @functools.wraps(func)
243
+ def inner(md: 'Markdown', text, *args, **kwargs):
244
+ md.stage = stage
245
+ # set "order" prop so extras can tell if they're being invoked before/after the stage
246
+ md.order = stage - 0.5
247
+
248
+ if stage in Extra._exec_order:
249
+ for klass in Extra._exec_order[stage][0]:
250
+ if klass.name not in md.extra_classes:
251
+ continue
252
+ extra = md.extra_classes[klass.name]
253
+ if extra.test(text):
254
+ text = extra.run(text)
255
+
256
+ md.order = stage
257
+ text = func(md, text, *args, **kwargs)
258
+ md.order = stage + 0.5
259
+
260
+ if stage in Extra._exec_order:
261
+ for klass in Extra._exec_order[stage][1]:
262
+ if klass.name not in md.extra_classes:
263
+ continue
264
+ extra = md.extra_classes[klass.name]
265
+ if extra.test(text):
266
+ text = extra.run(text)
267
+
268
+ return text
269
+
270
+ return inner
271
+
272
+ return wrapper
273
+
274
+
178
275
  class Markdown(object):
179
276
  # The dict of "extras" to enable in processing -- a mapping of
180
277
  # extra name to argument for the extra. Most extras do not have an
@@ -182,27 +279,47 @@ class Markdown(object):
182
279
  #
183
280
  # This can be set via (a) subclassing and (b) the constructor
184
281
  # "extras" argument.
185
- extras = None
282
+ extras: _extras_dict
283
+ # dict of `Extra` names and associated class instances, populated during _setup_extras
284
+ extra_classes: Dict[str, 'Extra']
186
285
 
187
- urls = None
188
- titles = None
189
- html_blocks = None
190
- html_spans = None
191
- html_removed_text = "{(#HTML#)}" # placeholder removed text that does not trigger bold
192
- html_removed_text_compat = "[HTML_REMOVED]" # for compat with markdown.py
286
+ urls: Dict[str, str]
287
+ titles: Dict[str, str]
288
+ html_blocks: Dict[str, str]
289
+ html_spans: Dict[str, str]
290
+ html_removed_text: str = "{(#HTML#)}" # placeholder removed text that does not trigger bold
291
+ html_removed_text_compat: str = "[HTML_REMOVED]" # for compat with markdown.py
292
+ safe_mode: Optional[_safe_mode]
193
293
 
194
- _toc = None
294
+ _toc: List[Tuple[int, str, str]]
195
295
 
196
296
  # Used to track when we're inside an ordered or unordered list
197
297
  # (see _ProcessListItems() for details):
198
298
  list_level = 0
199
299
 
300
+ stage: Stage
301
+ '''Current "stage" of markdown conversion taking place'''
302
+ order: float
303
+ '''
304
+ Same as `Stage` but will be +/- 0.5 of the value of `Stage`.
305
+ This allows extras to check if they are running before or after a particular stage
306
+ with `if md.order < md.stage`.
307
+ '''
308
+
200
309
  _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
201
310
 
202
- def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
203
- extras=None, link_patterns=None,
204
- footnote_title=None, footnote_return_symbol=None,
205
- use_file_vars=False, cli=False):
311
+ def __init__(
312
+ self,
313
+ html4tags: bool = False,
314
+ tab_width: int = DEFAULT_TAB_WIDTH,
315
+ safe_mode: Optional[_safe_mode] = None,
316
+ extras: Optional[_extras_param] = None,
317
+ link_patterns: Optional[_link_patterns] = None,
318
+ footnote_title: Optional[str] = None,
319
+ footnote_return_symbol: Optional[str] = None,
320
+ use_file_vars: bool = False,
321
+ cli: bool = False
322
+ ):
206
323
  if html4tags:
207
324
  self.empty_element_suffix = ">"
208
325
  else:
@@ -219,10 +336,13 @@ class Markdown(object):
219
336
  self.safe_mode = safe_mode
220
337
 
221
338
  # Massaging and building the "extras" info.
222
- if self.extras is None:
339
+ if getattr(self, 'extras', None) is None:
223
340
  self.extras = {}
224
341
  elif not isinstance(self.extras, dict):
225
- self.extras = dict([(e, None) for e in self.extras])
342
+ # inheriting classes may set `self.extras` as List[str].
343
+ # we can't allow it through type hints but we can convert it
344
+ self.extras = dict([(e, None) for e in self.extras]) # type:ignore
345
+
226
346
  if extras:
227
347
  if not isinstance(extras, dict):
228
348
  extras = dict([(e, None) for e in extras])
@@ -237,14 +357,30 @@ class Markdown(object):
237
357
  self._toc_depth = 6
238
358
  else:
239
359
  self._toc_depth = self.extras["toc"].get("depth", 6)
240
- self._instance_extras = self.extras.copy()
360
+
361
+ if 'header-ids' in self.extras:
362
+ if not isinstance(self.extras['header-ids'], dict):
363
+ self.extras['header-ids'] = {
364
+ 'mixed': False,
365
+ 'prefix': self.extras['header-ids'],
366
+ 'reset-count': True
367
+ }
368
+
369
+ if 'break-on-newline' in self.extras:
370
+ self.extras.setdefault('breaks', {})
371
+ self.extras['breaks']['on_newline'] = True
241
372
 
242
373
  if 'link-patterns' in self.extras:
374
+ # allow link patterns via extras dict without kwarg explicitly set
375
+ link_patterns = link_patterns or self.extras['link-patterns']
243
376
  if link_patterns is None:
244
377
  # if you have specified that the link-patterns extra SHOULD
245
378
  # be used (via self.extras) but you haven't provided anything
246
379
  # via the link_patterns argument then an error is raised
247
380
  raise MarkdownError("If the 'link-patterns' extra is used, an argument for 'link_patterns' is required")
381
+ self.extras['link-patterns'] = link_patterns
382
+
383
+ self._instance_extras = self.extras.copy()
248
384
  self.link_patterns = link_patterns
249
385
  self.footnote_title = footnote_title
250
386
  self.footnote_return_symbol = footnote_return_symbol
@@ -266,16 +402,25 @@ class Markdown(object):
266
402
  self.list_level = 0
267
403
  self.extras = self._instance_extras.copy()
268
404
  self._setup_extras()
269
- self._toc = None
405
+ self._toc = []
270
406
 
271
407
  def _setup_extras(self):
272
408
  if "footnotes" in self.extras:
273
- self.footnotes = {}
409
+ # order of insertion matters for footnotes. Use ordered dict for Python < 3.7
410
+ # https://docs.python.org/3/whatsnew/3.7.html#summary-release-highlights
411
+ self.footnotes = OrderedDict()
274
412
  self.footnote_ids = []
275
413
  if "header-ids" in self.extras:
276
- self._count_from_header_id = defaultdict(int)
414
+ if not hasattr(self, '_count_from_header_id') or self.extras['header-ids'].get('reset-count', False):
415
+ self._count_from_header_id = defaultdict(int)
277
416
  if "metadata" in self.extras:
278
- self.metadata = {}
417
+ self.metadata: Dict[str, Any] = {}
418
+
419
+ self.extra_classes = {}
420
+ for name, klass in Extra._registry.items():
421
+ if name not in self.extras:
422
+ continue
423
+ self.extra_classes[name] = klass(self, (self.extras.get(name, {})))
279
424
 
280
425
  # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
281
426
  # should only be used in <a> tags with an "href" attribute.
@@ -295,7 +440,7 @@ class Markdown(object):
295
440
  re.IGNORECASE | re.VERBOSE
296
441
  )
297
442
 
298
- def convert(self, text):
443
+ def convert(self, text: str) -> 'UnicodeWithAttrs':
299
444
  """Convert the given text."""
300
445
  # Main function. The order in which other subs are called here is
301
446
  # essential. Link and image substitutions need to happen before
@@ -353,29 +498,12 @@ class Markdown(object):
353
498
 
354
499
  text = self.preprocess(text)
355
500
 
356
- if 'wavedrom' in self.extras:
357
- text = self._do_wavedrom_blocks(text)
358
-
359
- if "fenced-code-blocks" in self.extras and not self.safe_mode:
360
- text = self._do_fenced_code_blocks(text)
361
-
362
501
  if self.safe_mode:
363
502
  text = self._hash_html_spans(text)
364
503
 
365
504
  # Turn block-level HTML blocks into hash entries
366
505
  text = self._hash_html_blocks(text, raw=True)
367
506
 
368
- if "fenced-code-blocks" in self.extras and self.safe_mode:
369
- text = self._do_fenced_code_blocks(text)
370
-
371
- if 'admonitions' in self.extras:
372
- text = self._do_admonitions(text)
373
-
374
- # Because numbering references aren't links (yet?) then we can do everything associated with counters
375
- # before we get started
376
- if "numbering" in self.extras:
377
- text = self._do_numbering(text)
378
-
379
507
  # Strip link definitions, store in hashes.
380
508
  if "footnotes" in self.extras:
381
509
  # Must do footnotes first because an unlucky footnote defn
@@ -409,10 +537,22 @@ class Markdown(object):
409
537
  text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)
410
538
 
411
539
  if "toc" in self.extras and self._toc:
540
+ if self.extras['header-ids'].get('mixed'):
541
+ # TOC will only be out of order if mixed headers is enabled
542
+ def toc_sort(entry):
543
+ '''Sort the TOC by order of appearance in text'''
544
+ match = re.search(
545
+ # header tag, any attrs, the ID, any attrs, the text, close tag
546
+ r'^<(h%d).*?id=(["\'])%s\2.*>%s</\1>$' % (entry[0], entry[1], re.escape(entry[2])),
547
+ text, re.M
548
+ )
549
+ return match.start() if match else 0
550
+
551
+ self._toc.sort(key=toc_sort)
412
552
  self._toc_html = calculate_toc_html(self._toc)
413
553
 
414
554
  # Prepend toc html to output
415
- if self.cli:
555
+ if self.cli or (self.extras['toc'] is not None and self.extras['toc'].get('prepend', False)):
416
556
  text = '{}\n{}'.format(self._toc_html, text)
417
557
 
418
558
  text += "\n"
@@ -427,14 +567,16 @@ class Markdown(object):
427
567
  rv.metadata = self.metadata
428
568
  return rv
429
569
 
430
- def postprocess(self, text):
570
+ @mark_stage(Stage.POSTPROCESS)
571
+ def postprocess(self, text: str) -> str:
431
572
  """A hook for subclasses to do some postprocessing of the html, if
432
573
  desired. This is called before unescaping of special chars and
433
574
  unhashing of raw HTML spans.
434
575
  """
435
576
  return text
436
577
 
437
- def preprocess(self, text):
578
+ @mark_stage(Stage.PREPROCESS)
579
+ def preprocess(self, text: str) -> str:
438
580
  """A hook for subclasses to do some preprocessing of the Markdown, if
439
581
  desired. This is called after basic formatting of the text, but prior
440
582
  to any extras, safe mode, etc. processing.
@@ -477,29 +619,32 @@ class Markdown(object):
477
619
  _meta_data_fence_pattern = re.compile(r'^---[\ \t]*\n', re.MULTILINE)
478
620
  _meta_data_newline = re.compile("^\n", re.MULTILINE)
479
621
 
480
- def _extract_metadata(self, text):
622
+ def _extract_metadata(self, text: str) -> str:
481
623
  if text.startswith("---"):
482
624
  fence_splits = re.split(self._meta_data_fence_pattern, text, maxsplit=2)
483
625
  metadata_content = fence_splits[1]
484
- match = re.findall(self._meta_data_pattern, metadata_content)
485
- if not match:
486
- return text
487
626
  tail = fence_splits[2]
488
627
  else:
489
628
  metadata_split = re.split(self._meta_data_newline, text, maxsplit=1)
490
629
  metadata_content = metadata_split[0]
491
- match = re.findall(self._meta_data_pattern, metadata_content)
492
- if not match:
493
- return text
494
630
  tail = metadata_split[1]
495
631
 
496
- def parse_structured_value(value):
632
+ # _meta_data_pattern only has one capturing group, so we can assume
633
+ # the returned type to be list[str]
634
+ match: List[str] = re.findall(self._meta_data_pattern, metadata_content)
635
+ if not match:
636
+ return text
637
+
638
+ def parse_structured_value(value: str) -> Union[List[Any], Dict[str, Any]]:
497
639
  vs = value.lstrip()
498
640
  vs = value.replace(v[: len(value) - len(vs)], "\n")[1:]
499
641
 
500
642
  # List
501
643
  if vs.startswith("-"):
502
- r = []
644
+ r: List[Any] = []
645
+ # the regex used has multiple capturing groups, so
646
+ # returned type from findall will be List[Tuple[str, ...]] (one tuple per match)
647
+ match: List[str]
503
648
  for match in re.findall(self._key_val_list_pat, vs):
504
649
  if match[0] and not match[1] and not match[2]:
505
650
  r.append(match[0].strip())
@@ -564,7 +709,7 @@ class Markdown(object):
564
709
  (?P<content>.*?\1End:)
565
710
  """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
566
711
 
567
- def _emacs_vars_oneliner_sub(self, match):
712
+ def _emacs_vars_oneliner_sub(self, match: re.Match) -> str:
568
713
  if match.group(1).strip() == '-*-' and match.group(4).strip() == '-*-':
569
714
  lead_ws = re.findall(r'^\s*', match.group(1))[0]
570
715
  tail_ws = re.findall(r'\s*$', match.group(4))[0]
@@ -573,7 +718,7 @@ class Markdown(object):
573
718
  start, end = match.span()
574
719
  return match.string[start: end]
575
720
 
576
- def _get_emacs_vars(self, text):
721
+ def _get_emacs_vars(self, text: str) -> Dict[str, str]:
577
722
  """Return a dictionary of emacs-style local variables.
578
723
 
579
724
  Parsing is done loosely according to this spec (and according to
@@ -616,7 +761,7 @@ class Markdown(object):
616
761
  if match:
617
762
  prefix = match.group("prefix")
618
763
  suffix = match.group("suffix")
619
- lines = match.group("content").splitlines(0)
764
+ lines = match.group("content").splitlines(False)
620
765
  # print "prefix=%r, suffix=%r, content=%r, lines: %s"\
621
766
  # % (prefix, suffix, match.group("content"), lines)
622
767
 
@@ -639,8 +784,10 @@ class Markdown(object):
639
784
  # Parse out one emacs var per line.
640
785
  continued_for = None
641
786
  for line in lines[:-1]: # no var on the last line ("PREFIX End:")
642
- if prefix: line = line[len(prefix):] # strip prefix
643
- if suffix: line = line[:-len(suffix)] # strip suffix
787
+ if prefix:
788
+ line = line[len(prefix):] # strip prefix
789
+ if suffix:
790
+ line = line[:-len(suffix)] # strip suffix
644
791
  line = line.strip()
645
792
  if continued_for:
646
793
  variable = continued_for
@@ -674,7 +821,7 @@ class Markdown(object):
674
821
 
675
822
  return emacs_vars
676
823
 
677
- def _detab_line(self, line):
824
+ def _detab_line(self, line: str) -> str:
678
825
  r"""Recursively convert tabs to spaces in a single line.
679
826
 
680
827
  Called from _detab()."""
@@ -685,7 +832,7 @@ class Markdown(object):
685
832
  output = chunk1 + chunk2
686
833
  return self._detab_line(output)
687
834
 
688
- def _detab(self, text):
835
+ def _detab(self, text: str) -> str:
689
836
  r"""Iterate text line by line and convert tabs to spaces.
690
837
 
691
838
  >>> m = Markdown()
@@ -711,7 +858,7 @@ class Markdown(object):
711
858
  # _block_tags_b. This way html5 tags are easy to keep track of.
712
859
  _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'
713
860
 
714
- _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
861
+ _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del|style|html|head|body'
715
862
  _block_tags_a += _html5tags
716
863
 
717
864
  _strict_tag_block_re = re.compile(r"""
@@ -730,6 +877,11 @@ class Markdown(object):
730
877
  _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
731
878
  _block_tags_b += _html5tags
732
879
 
880
+ _span_tags = (
881
+ 'a|abbr|acronym|b|bdo|big|br|button|cite|code|dfn|em|i|img|input|kbd|label|map|object|output|q'
882
+ '|samp|script|select|small|span|strong|sub|sup|textarea|time|tt|var'
883
+ )
884
+
733
885
  _liberal_tag_block_re = re.compile(r"""
734
886
  ( # save in \1
735
887
  ^ # start of line (with re.M)
@@ -745,11 +897,26 @@ class Markdown(object):
745
897
 
746
898
  _html_markdown_attr_re = re.compile(
747
899
  r'''\s+markdown=("1"|'1')''')
748
- def _hash_html_block_sub(self, match, raw=False):
900
+ def _hash_html_block_sub(
901
+ self,
902
+ match: Union[re.Match, str],
903
+ raw: bool = False
904
+ ) -> str:
749
905
  if isinstance(match, str):
750
906
  html = match
907
+ tag = None
751
908
  else:
752
909
  html = match.group(1)
910
+ try:
911
+ tag = match.group(2)
912
+ except IndexError:
913
+ tag = None
914
+
915
+ if not tag:
916
+ m = re.match(r'.*?<(\S).*?\s*>', html)
917
+ # tag shouldn't be none but make the assertion for type checker
918
+ assert m is not None
919
+ tag = m.group(1)
753
920
 
754
921
  if raw and self.safe_mode:
755
922
  html = self._sanitize_html(html)
@@ -758,9 +925,17 @@ class Markdown(object):
758
925
  m = self._html_markdown_attr_re.search(first_line)
759
926
  if m:
760
927
  lines = html.split('\n')
928
+ # if MD is on same line as opening tag then split across two lines
929
+ lines = list(filter(None, (re.split(r'(.*?<%s.*markdown=.*?>)' % tag, lines[0])))) + lines[1:]
930
+ # if MD on same line as closing tag, split across two lines
931
+ lines = lines[:-1] + list(filter(None, re.split(r'(\s*?</%s>.*?$)' % tag, lines[-1])))
932
+ # extract key sections of the match
933
+ first_line = lines[0]
761
934
  middle = '\n'.join(lines[1:-1])
762
935
  last_line = lines[-1]
936
+ # remove `markdown="1"` attr from tag
763
937
  first_line = first_line[:m.start()] + first_line[m.end():]
938
+ # hash the HTML segments to protect them
764
939
  f_key = _hash_text(first_line)
765
940
  self.html_blocks[f_key] = first_line
766
941
  l_key = _hash_text(last_line)
@@ -768,11 +943,14 @@ class Markdown(object):
768
943
  return ''.join(["\n\n", f_key,
769
944
  "\n\n", middle, "\n\n",
770
945
  l_key, "\n\n"])
946
+ elif self.extras.get('header-ids', {}).get('mixed') and self._h_tag_re.match(html):
947
+ html = self._h_tag_re.sub(self._h_tag_sub, html)
771
948
  key = _hash_text(html)
772
949
  self.html_blocks[key] = html
773
950
  return "\n\n" + key + "\n\n"
774
951
 
775
- def _hash_html_blocks(self, text, raw=False):
952
+ @mark_stage(Stage.HASH_HTML)
953
+ def _hash_html_blocks(self, text: str, raw: bool = False) -> str:
776
954
  """Hashify HTML blocks
777
955
 
778
956
  We only want to do this for block-level HTML tags, such as headers,
@@ -806,6 +984,14 @@ class Markdown(object):
806
984
  # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
807
985
  text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)
808
986
 
987
+ # now do the same for spans that are acting like blocks
988
+ # eg: an anchor split over multiple lines for readability
989
+ text = self._strict_tag_block_sub(
990
+ text, self._span_tags,
991
+ # inline elements can't contain block level elements, so only span gamut is required
992
+ lambda t: hash_html_block_sub(self._run_span_gamut(t))
993
+ )
994
+
809
995
  # Special case just for <hr />. It was easier to make a special
810
996
  # case than to make the other regex more complicated.
811
997
  if "<hr" in text:
@@ -880,27 +1066,45 @@ class Markdown(object):
880
1066
 
881
1067
  return text
882
1068
 
883
- def _strict_tag_block_sub(self, text, html_tags_re, callback):
1069
+ def _strict_tag_block_sub(
1070
+ self,
1071
+ text: str,
1072
+ html_tags_re: str,
1073
+ callback: Callable[[str], str],
1074
+ allow_indent: bool = False
1075
+ ) -> str:
1076
+ '''
1077
+ Finds and substitutes HTML blocks within blocks of text
1078
+
1079
+ Args:
1080
+ text: the text to search
1081
+ html_tags_re: a regex pattern of HTML block tags to match against.
1082
+ For example, `Markdown._block_tags_a`
1083
+ callback: callback function that receives the found HTML text block and returns a new str
1084
+ allow_indent: allow matching HTML blocks that are not completely outdented
1085
+ '''
884
1086
  tag_count = 0
885
1087
  current_tag = html_tags_re
886
1088
  block = ''
887
1089
  result = ''
888
1090
 
889
1091
  for chunk in text.splitlines(True):
890
- is_markup = re.match(r'^(?:</code>(?=</pre>))?(</?(%s)\b>?)' % current_tag, chunk)
1092
+ is_markup = re.match(
1093
+ r'^(\s{0,%s})(?:</code>(?=</pre>))?(</?(%s)\b>?)' % ('' if allow_indent else '0', current_tag), chunk
1094
+ )
891
1095
  block += chunk
892
1096
 
893
1097
  if is_markup:
894
- if chunk.startswith('</'):
1098
+ if chunk.startswith('%s</' % is_markup.group(1)):
895
1099
  tag_count -= 1
896
1100
  else:
897
1101
  # if close tag is in same line
898
- if '</%s>' % is_markup.group(2) in chunk[is_markup.end():]:
1102
+ if self._tag_is_closed(is_markup.group(3), chunk):
899
1103
  # we must ignore these
900
1104
  is_markup = None
901
1105
  else:
902
1106
  tag_count += 1
903
- current_tag = is_markup.group(2)
1107
+ current_tag = is_markup.group(3)
904
1108
 
905
1109
  if tag_count == 0:
906
1110
  if is_markup:
@@ -913,7 +1117,12 @@ class Markdown(object):
913
1117
 
914
1118
  return result
915
1119
 
916
- def _strip_link_definitions(self, text):
1120
+ def _tag_is_closed(self, tag_name: str, text: str) -> bool:
1121
+ # super basic check if number of open tags == number of closing tags
1122
+ return len(re.findall('<%s(?:.*?)>' % tag_name, text)) == len(re.findall('</%s>' % tag_name, text))
1123
+
1124
+ @mark_stage(Stage.LINK_DEFS)
1125
+ def _strip_link_definitions(self, text: str) -> str:
917
1126
  # Strips link definitions from text, stores the URLs and titles in
918
1127
  # hash references.
919
1128
  less_than_tab = self.tab_width - 1
@@ -940,7 +1149,7 @@ class Markdown(object):
940
1149
  """ % less_than_tab, re.X | re.M | re.U)
941
1150
  return _link_def_re.sub(self._extract_link_def_sub, text)
942
1151
 
943
- def _extract_link_def_sub(self, match):
1152
+ def _extract_link_def_sub(self, match: re.Match) -> str:
944
1153
  id, url, title = match.groups()
945
1154
  key = id.lower() # Link IDs are case-insensitive
946
1155
  self.urls[key] = self._encode_amps_and_angles(url)
@@ -948,65 +1157,7 @@ class Markdown(object):
948
1157
  self.titles[key] = title
949
1158
  return ""
950
1159
 
951
- def _do_numbering(self, text):
952
- ''' We handle the special extension for generic numbering for
953
- tables, figures etc.
954
- '''
955
- # First pass to define all the references
956
- self.regex_defns = re.compile(r'''
957
- \[\#(\w+) # the counter. Open square plus hash plus a word \1
958
- ([^@]*) # Some optional characters, that aren't an @. \2
959
- @(\w+) # the id. Should this be normed? \3
960
- ([^\]]*)\] # The rest of the text up to the terminating ] \4
961
- ''', re.VERBOSE)
962
- self.regex_subs = re.compile(r"\[@(\w+)\s*\]") # [@ref_id]
963
- counters = {}
964
- references = {}
965
- replacements = []
966
- definition_html = '<figcaption class="{}" id="counter-ref-{}">{}{}{}</figcaption>'
967
- reference_html = '<a class="{}" href="#counter-ref-{}">{}</a>'
968
- for match in self.regex_defns.finditer(text):
969
- # We must have four match groups otherwise this isn't a numbering reference
970
- if len(match.groups()) != 4:
971
- continue
972
- counter = match.group(1)
973
- text_before = match.group(2).strip()
974
- ref_id = match.group(3)
975
- text_after = match.group(4)
976
- number = counters.get(counter, 1)
977
- references[ref_id] = (number, counter)
978
- replacements.append((match.start(0),
979
- definition_html.format(counter,
980
- ref_id,
981
- text_before,
982
- number,
983
- text_after),
984
- match.end(0)))
985
- counters[counter] = number + 1
986
- for repl in reversed(replacements):
987
- text = text[:repl[0]] + repl[1] + text[repl[2]:]
988
-
989
- # Second pass to replace the references with the right
990
- # value of the counter
991
- # Fwiw, it's vaguely annoying to have to turn the iterator into
992
- # a list and then reverse it but I can't think of a better thing to do.
993
- for match in reversed(list(self.regex_subs.finditer(text))):
994
- number, counter = references.get(match.group(1), (None, None))
995
- if number is not None:
996
- repl = reference_html.format(counter,
997
- match.group(1),
998
- number)
999
- else:
1000
- repl = reference_html.format(match.group(1),
1001
- 'countererror',
1002
- '?' + match.group(1) + '?')
1003
- if "smarty-pants" in self.extras:
1004
- repl = repl.replace('"', self._escape_table['"'])
1005
-
1006
- text = text[:match.start()] + repl + text[match.end():]
1007
- return text
1008
-
1009
- def _extract_footnote_def_sub(self, match):
1160
+ def _extract_footnote_def_sub(self, match: re.Match) -> str:
1010
1161
  id, text = match.groups()
1011
1162
  text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
1012
1163
  normed_id = re.sub(r'\W', '-', id)
@@ -1015,7 +1166,7 @@ class Markdown(object):
1015
1166
  self.footnotes[normed_id] = text + "\n\n"
1016
1167
  return ""
1017
1168
 
1018
- def _strip_footnote_definitions(self, text):
1169
+ def _strip_footnote_definitions(self, text: str) -> str:
1019
1170
  """A footnote definition looks like this:
1020
1171
 
1021
1172
  [^note-id]: Text of the note.
@@ -1050,19 +1201,11 @@ class Markdown(object):
1050
1201
 
1051
1202
  _hr_re = re.compile(r'^[ ]{0,3}([-_*])[ ]{0,2}(\1[ ]{0,2}){2,}$', re.M)
1052
1203
 
1053
- def _run_block_gamut(self, text):
1204
+ @mark_stage(Stage.BLOCK_GAMUT)
1205
+ def _run_block_gamut(self, text: str) -> str:
1054
1206
  # These are all the transformations that form block-level
1055
1207
  # tags like paragraphs, headers, and list items.
1056
1208
 
1057
- if 'admonitions' in self.extras:
1058
- text = self._do_admonitions(text)
1059
-
1060
- if 'wavedrom' in self.extras:
1061
- text = self._do_wavedrom_blocks(text)
1062
-
1063
- if "fenced-code-blocks" in self.extras:
1064
- text = self._do_fenced_code_blocks(text)
1065
-
1066
1209
  text = self._do_headers(text)
1067
1210
 
1068
1211
  # Do Horizontal Rules:
@@ -1075,13 +1218,6 @@ class Markdown(object):
1075
1218
 
1076
1219
  text = self._do_lists(text)
1077
1220
 
1078
- if "pyshell" in self.extras:
1079
- text = self._prepare_pyshell_blocks(text)
1080
- if "wiki-tables" in self.extras:
1081
- text = self._do_wiki_tables(text)
1082
- if "tables" in self.extras:
1083
- text = self._do_tables(text)
1084
-
1085
1221
  text = self._do_code_blocks(text)
1086
1222
 
1087
1223
  text = self._do_block_quotes(text)
@@ -1096,164 +1232,8 @@ class Markdown(object):
1096
1232
 
1097
1233
  return text
1098
1234
 
1099
- def _pyshell_block_sub(self, match):
1100
- if "fenced-code-blocks" in self.extras:
1101
- dedented = _dedent(match.group(0))
1102
- return self._do_fenced_code_blocks("```pycon\n" + dedented + "```\n")
1103
- lines = match.group(0).splitlines(0)
1104
- _dedentlines(lines)
1105
- indent = ' ' * self.tab_width
1106
- s = ('\n' # separate from possible cuddled paragraph
1107
- + indent + ('\n'+indent).join(lines)
1108
- + '\n')
1109
- return s
1110
-
1111
- def _prepare_pyshell_blocks(self, text):
1112
- """Ensure that Python interactive shell sessions are put in
1113
- code blocks -- even if not properly indented.
1114
- """
1115
- if ">>>" not in text:
1116
- return text
1117
-
1118
- less_than_tab = self.tab_width - 1
1119
- _pyshell_block_re = re.compile(r"""
1120
- ^([ ]{0,%d})>>>[ ].*\n # first line
1121
- ^(\1[^\S\n]*\S.*\n)* # any number of subsequent lines with at least one character
1122
- (?=^\1?\n|\Z) # ends with a blank line or end of document
1123
- """ % less_than_tab, re.M | re.X)
1124
-
1125
- return _pyshell_block_re.sub(self._pyshell_block_sub, text)
1126
-
1127
- def _table_sub(self, match):
1128
- trim_space_re = '^[ \t\n]+|[ \t\n]+$'
1129
- trim_bar_re = r'^\||\|$'
1130
- split_bar_re = r'^\||(?<![\`\\])\|'
1131
- escape_bar_re = r'\\\|'
1132
-
1133
- head, underline, body = match.groups()
1134
-
1135
- # Determine aligns for columns.
1136
- cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)))]
1137
- align_from_col_idx = {}
1138
- for col_idx, col in enumerate(cols):
1139
- if col[0] == ':' and col[-1] == ':':
1140
- align_from_col_idx[col_idx] = ' style="text-align:center;"'
1141
- elif col[0] == ':':
1142
- align_from_col_idx[col_idx] = ' style="text-align:left;"'
1143
- elif col[-1] == ':':
1144
- align_from_col_idx[col_idx] = ' style="text-align:right;"'
1145
-
1146
- # thead
1147
- hlines = ['<table%s>' % self._html_class_str_from_tag('table'), '<thead%s>' % self._html_class_str_from_tag('thead'), '<tr>']
1148
- cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))]
1149
- for col_idx, col in enumerate(cols):
1150
- hlines.append(' <th%s>%s</th>' % (
1151
- align_from_col_idx.get(col_idx, ''),
1152
- self._run_span_gamut(col)
1153
- ))
1154
- hlines.append('</tr>')
1155
- hlines.append('</thead>')
1156
-
1157
- # tbody
1158
- hlines.append('<tbody>')
1159
- for line in body.strip('\n').split('\n'):
1160
- hlines.append('<tr>')
1161
- cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))]
1162
- for col_idx, col in enumerate(cols):
1163
- hlines.append(' <td%s>%s</td>' % (
1164
- align_from_col_idx.get(col_idx, ''),
1165
- self._run_span_gamut(col)
1166
- ))
1167
- hlines.append('</tr>')
1168
- hlines.append('</tbody>')
1169
- hlines.append('</table>')
1170
-
1171
- return '\n'.join(hlines) + '\n'
1172
-
1173
- def _do_tables(self, text):
1174
- """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
1175
- https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
1176
- """
1177
- less_than_tab = self.tab_width - 1
1178
- table_re = re.compile(r'''
1179
- (?:(?<=\n\n)|\A\n?) # leading blank line
1180
-
1181
- ^[ ]{0,%d} # allowed whitespace
1182
- (.*[|].*) \n # $1: header row (at least one pipe)
1183
-
1184
- ^[ ]{0,%d} # allowed whitespace
1185
- ( # $2: underline row
1186
- # underline row with leading bar
1187
- (?: \|\ *:?-+:?\ * )+ \|? \s? \n
1188
- |
1189
- # or, underline row without leading bar
1190
- (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \s? \n
1191
- )
1192
-
1193
- ( # $3: data rows
1194
- (?:
1195
- ^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
1196
- .*\|.* \n
1197
- )+
1198
- )
1199
- ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
1200
- return table_re.sub(self._table_sub, text)
1201
-
1202
- def _wiki_table_sub(self, match):
1203
- ttext = match.group(0).strip()
1204
- # print('wiki table: %r' % match.group(0))
1205
- rows = []
1206
- for line in ttext.splitlines(0):
1207
- line = line.strip()[2:-2].strip()
1208
- row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
1209
- rows.append(row)
1210
- # from pprint import pprint
1211
- # pprint(rows)
1212
- hlines = []
1213
-
1214
- def add_hline(line, indents=0):
1215
- hlines.append((self.tab * indents) + line)
1216
-
1217
- def format_cell(text):
1218
- return self._run_span_gamut(re.sub(r"^\s*~", "", cell).strip(" "))
1219
-
1220
- add_hline('<table%s>' % self._html_class_str_from_tag('table'))
1221
- # Check if first cell of first row is a header cell. If so, assume the whole row is a header row.
1222
- if rows and rows[0] and re.match(r"^\s*~", rows[0][0]):
1223
- add_hline('<thead%s>' % self._html_class_str_from_tag('thead'), 1)
1224
- add_hline('<tr>', 2)
1225
- for cell in rows[0]:
1226
- add_hline("<th>{}</th>".format(format_cell(cell)), 3)
1227
- add_hline('</tr>', 2)
1228
- add_hline('</thead>', 1)
1229
- # Only one header row allowed.
1230
- rows = rows[1:]
1231
- # If no more rows, don't create a tbody.
1232
- if rows:
1233
- add_hline('<tbody>', 1)
1234
- for row in rows:
1235
- add_hline('<tr>', 2)
1236
- for cell in row:
1237
- add_hline('<td>{}</td>'.format(format_cell(cell)), 3)
1238
- add_hline('</tr>', 2)
1239
- add_hline('</tbody>', 1)
1240
- add_hline('</table>')
1241
- return '\n'.join(hlines) + '\n'
1242
-
1243
- def _do_wiki_tables(self, text):
1244
- # Optimization.
1245
- if "||" not in text:
1246
- return text
1247
-
1248
- less_than_tab = self.tab_width - 1
1249
- wiki_table_re = re.compile(r'''
1250
- (?:(?<=\n\n)|\A\n?) # leading blank line
1251
- ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line
1252
- (^\1\|\|.+?\|\|\n)* # any number of subsequent lines
1253
- ''' % less_than_tab, re.M | re.X)
1254
- return wiki_table_re.sub(self._wiki_table_sub, text)
1255
-
1256
- def _run_span_gamut(self, text):
1235
+ @mark_stage(Stage.SPAN_GAMUT)
1236
+ def _run_span_gamut(self, text: str) -> str:
1257
1237
  # These are all the transformations that occur *within* block-level
1258
1238
  # tags like paragraphs, headers, and list items.
1259
1239
 
@@ -1262,9 +1242,6 @@ class Markdown(object):
1262
1242
  text = self._escape_special_chars(text)
1263
1243
 
1264
1244
  # Process anchor and image tags.
1265
- if "link-patterns" in self.extras:
1266
- text = self._do_link_patterns(text)
1267
-
1268
1245
  text = self._do_links(text)
1269
1246
 
1270
1247
  # Make links out of things like `<http://example.com/>`
@@ -1274,25 +1251,10 @@ class Markdown(object):
1274
1251
 
1275
1252
  text = self._encode_amps_and_angles(text)
1276
1253
 
1277
- if "strike" in self.extras:
1278
- text = self._do_strike(text)
1279
-
1280
- if "underline" in self.extras:
1281
- text = self._do_underline(text)
1282
-
1283
1254
  text = self._do_italics_and_bold(text)
1284
1255
 
1285
- if "tg-spoiler" in self.extras:
1286
- text = self._do_tg_spoiler(text)
1287
-
1288
- if "smarty-pants" in self.extras:
1289
- text = self._do_smart_punctuation(text)
1290
-
1291
- # Do hard breaks:
1292
- if "break-on-newline" in self.extras:
1293
- text = re.sub(r" *\n(?!\<(?:\/?(ul|ol|li))\>)", "<br%s\n" % self.empty_element_suffix, text)
1294
- else:
1295
- text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
1256
+ # Do hard breaks
1257
+ text = re.sub(r" {2,}\n(?!\<(?:\/?(ul|ol|li))\>)", "<br%s\n" % self.empty_element_suffix, text)
1296
1258
 
1297
1259
  return text
1298
1260
 
@@ -1317,7 +1279,8 @@ class Markdown(object):
1317
1279
  )
1318
1280
  """, re.X)
1319
1281
 
1320
- def _escape_special_chars(self, text):
1282
+ @mark_stage(Stage.ESCAPE_SPECIAL)
1283
+ def _escape_special_chars(self, text: str) -> str:
1321
1284
  # Python markdown note: the HTML tokenization here differs from
1322
1285
  # that in Markdown.pl, hence the behaviour for subtle cases can
1323
1286
  # differ (I believe the tokenizer here does a better job because
@@ -1348,7 +1311,8 @@ class Markdown(object):
1348
1311
  is_html_markup = not is_html_markup
1349
1312
  return ''.join(escaped)
1350
1313
 
1351
- def _hash_html_spans(self, text):
1314
+ @mark_stage(Stage.HASH_HTML)
1315
+ def _hash_html_spans(self, text: str) -> str:
1352
1316
  # Used for safe_mode.
1353
1317
 
1354
1318
  def _is_auto_link(s):
@@ -1371,26 +1335,41 @@ class Markdown(object):
1371
1335
 
1372
1336
  return re.match(r'<code>md5-[A-Fa-f0-9]{32}</code>', ''.join(peek_tokens))
1373
1337
 
1338
+ def _is_comment(token):
1339
+ if self.safe_mode == 'replace':
1340
+ # don't bother processing each section of comment in replace mode. Just do the whole thing
1341
+ return
1342
+ return re.match(r'(<!--)(.*)(-->)', token)
1343
+
1344
+ def _hash(token):
1345
+ key = _hash_text(token)
1346
+ self.html_spans[key] = token
1347
+ return key
1348
+
1374
1349
  tokens = []
1375
1350
  split_tokens = self._sorta_html_tokenize_re.split(text)
1376
1351
  is_html_markup = False
1377
1352
  for index, token in enumerate(split_tokens):
1378
1353
  if is_html_markup and not _is_auto_link(token) and not _is_code_span(index, token):
1379
- sanitized = self._sanitize_html(token)
1380
- key = _hash_text(sanitized)
1381
- self.html_spans[key] = sanitized
1382
- tokens.append(key)
1354
+ is_comment = _is_comment(token)
1355
+ if is_comment:
1356
+ tokens.append(_hash(self._sanitize_html(is_comment.group(1))))
1357
+ # sanitise but leave comment body intact for further markdown processing
1358
+ tokens.append(self._sanitize_html(is_comment.group(2)))
1359
+ tokens.append(_hash(self._sanitize_html(is_comment.group(3))))
1360
+ else:
1361
+ tokens.append(_hash(self._sanitize_html(token)))
1383
1362
  else:
1384
1363
  tokens.append(self._encode_incomplete_tags(token))
1385
1364
  is_html_markup = not is_html_markup
1386
1365
  return ''.join(tokens)
1387
1366
 
1388
- def _unhash_html_spans(self, text):
1367
+ def _unhash_html_spans(self, text: str) -> str:
1389
1368
  for key, sanitized in list(self.html_spans.items()):
1390
1369
  text = text.replace(key, sanitized)
1391
1370
  return text
1392
1371
 
1393
- def _sanitize_html(self, s):
1372
+ def _sanitize_html(self, s: str) -> str:
1394
1373
  if self.safe_mode == "replace":
1395
1374
  return self.html_removed_text
1396
1375
  elif self.safe_mode == "escape":
@@ -1428,14 +1407,14 @@ class Markdown(object):
1428
1407
 
1429
1408
  _strip_anglebrackets = re.compile(r'<(.*)>.*')
1430
1409
 
1431
- def _find_non_whitespace(self, text, start):
1410
+ def _find_non_whitespace(self, text: str, start: int) -> int:
1432
1411
  """Returns the index of the first non-whitespace character in text
1433
1412
  after (and including) start
1434
1413
  """
1435
1414
  match = self._whitespace.match(text, start)
1436
- return match.end()
1415
+ return match.end() if match else len(text)
1437
1416
 
1438
- def _find_balanced(self, text, start, open_c, close_c):
1417
+ def _find_balanced(self, text: str, start: int, open_c: str, close_c: str) -> int:
1439
1418
  """Returns the index where the open_c and close_c characters balance
1440
1419
  out - the same number of open_c and close_c are encountered - or the
1441
1420
  end of string if it's reached before the balance point is found.
@@ -1451,7 +1430,7 @@ class Markdown(object):
1451
1430
  i += 1
1452
1431
  return i
1453
1432
 
1454
- def _extract_url_and_title(self, text, start):
1433
+ def _extract_url_and_title(self, text: str, start: int) -> Union[Tuple[str, str, int], Tuple[None, None, None]]:
1455
1434
  """Extracts the url and (optional) title from the tail of a link"""
1456
1435
  # text[start] equals the opening parenthesis
1457
1436
  idx = self._find_non_whitespace(text, start+1)
@@ -1470,19 +1449,56 @@ class Markdown(object):
1470
1449
  url = self._strip_anglebrackets.sub(r'\1', url)
1471
1450
  return url, title, end_idx
1472
1451
 
1473
- def _protect_url(self, url):
1452
+ # https://developer.mozilla.org/en-US/docs/web/http/basics_of_http/data_urls
1453
+ # https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types
1454
+ _data_url_re = re.compile(r'''
1455
+ data:
1456
+ # in format type/subtype;parameter=optional
1457
+ (?P<mime>\w+/[\w+\.-]+(?:;\w+=[\w+\.-]+)?)?
1458
+ # optional base64 token
1459
+ (?P<token>;base64)?
1460
+ ,(?P<data>.*)
1461
+ ''', re.X)
1462
+
1463
+ def _protect_url(self, url: str) -> str:
1474
1464
  '''
1475
1465
  Function that passes a URL through `_html_escape_url` to remove any nasty characters,
1476
1466
  and then hashes the now "safe" URL to prevent other safety mechanisms from tampering
1477
1467
  with it (eg: escaping "&" in URL parameters)
1478
1468
  '''
1479
- url = _html_escape_url(url, safe_mode=self.safe_mode)
1469
+ data_url = self._data_url_re.match(url)
1470
+ charset = None
1471
+ if data_url is not None:
1472
+ mime = data_url.group('mime') or ''
1473
+ if mime.startswith('image/') and data_url.group('token') == ';base64':
1474
+ charset='base64'
1475
+ url = _html_escape_url(url, safe_mode=self.safe_mode, charset=charset)
1480
1476
  key = _hash_text(url)
1481
1477
  self._escape_table[url] = key
1482
1478
  return key
1483
1479
 
1484
- _safe_protocols = re.compile(r'(https?|ftp):', re.I)
1485
- def _do_links(self, text):
1480
+ _safe_protocols = r'(?:https?|ftp):\/\/|(?:mailto|tel):'
1481
+
1482
+ @property
1483
+ def _safe_href(self):
1484
+ '''
1485
+ _safe_href is adapted from pagedown's Markdown.Sanitizer.js
1486
+ From: https://github.com/StackExchange/pagedown/blob/master/LICENSE.txt
1487
+ Original Showdown code copyright (c) 2007 John Fraser
1488
+ Modifications and bugfixes (c) 2009 Dana Robinson
1489
+ Modifications and bugfixes (c) 2009-2014 Stack Exchange Inc.
1490
+ '''
1491
+ safe = r'-\w'
1492
+ # omitted ['"<>] for XSS reasons
1493
+ less_safe = r'#/\.!#$%&\(\)\+,/:;=\?@\[\]^`\{\}\|~'
1494
+ # dot seperated hostname, optional port number, not followed by protocol seperator
1495
+ domain = r'(?:[%s]+(?:\.[%s]+)*)(?:(?<!tel):\d+/?)?(?![^:/]*:/*)' % (safe, safe)
1496
+ fragment = r'[%s]*' % (safe + less_safe)
1497
+
1498
+ return re.compile(r'^(?:(%s)?(%s)(%s)|(#|\.{,2}/)(%s))$' % (self._safe_protocols, domain, fragment, fragment), re.I)
1499
+
1500
+ @mark_stage(Stage.LINKS)
1501
+ def _do_links(self, text: str) -> str:
1486
1502
  """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
1487
1503
 
1488
1504
  This is a combination of Markdown.pl's _DoAnchors() and
@@ -1599,7 +1615,7 @@ class Markdown(object):
1599
1615
  anchor_allowed_pos = start_idx + len(result)
1600
1616
  text = text[:start_idx] + result + text[url_end_idx:]
1601
1617
  elif start_idx >= anchor_allowed_pos:
1602
- safe_link = self._safe_protocols.match(url) or url.startswith('#')
1618
+ safe_link = self._safe_href.match(url)
1603
1619
  if self.safe_mode and not safe_link:
1604
1620
  result_head = '<a href="#"%s>' % (title_str)
1605
1621
  else:
@@ -1655,7 +1671,7 @@ class Markdown(object):
1655
1671
  curr_pos = start_idx + len(result)
1656
1672
  text = text[:start_idx] + result + text[match.end():]
1657
1673
  elif start_idx >= anchor_allowed_pos:
1658
- if self.safe_mode and not self._safe_protocols.match(url):
1674
+ if self.safe_mode and not self._safe_href.match(url):
1659
1675
  result_head = '<a href="#"%s>' % (title_str)
1660
1676
  else:
1661
1677
  result_head = '<a href="%s"%s>' % (self._protect_url(url), title_str)
@@ -1672,7 +1688,8 @@ class Markdown(object):
1672
1688
  curr_pos = start_idx + 1
1673
1689
  else:
1674
1690
  # This id isn't defined, leave the markup alone.
1675
- curr_pos = match.end()
1691
+ # set current pos to end of link title and continue from there
1692
+ curr_pos = p
1676
1693
  continue
1677
1694
 
1678
1695
  # Otherwise, it isn't markup.
@@ -1680,7 +1697,11 @@ class Markdown(object):
1680
1697
 
1681
1698
  return text
1682
1699
 
1683
- def header_id_from_text(self, text, prefix, n):
1700
+ def header_id_from_text(self,
1701
+ text: str,
1702
+ prefix: str,
1703
+ n: Optional[int] = None
1704
+ ) -> str:
1684
1705
  """Generate a header id attribute value from the given header
1685
1706
  HTML content.
1686
1707
 
@@ -1690,7 +1711,7 @@ class Markdown(object):
1690
1711
  @param text {str} The text of the header tag
1691
1712
  @param prefix {str} The requested prefix for header ids. This is the
1692
1713
  value of the "header-ids" extra key, if any. Otherwise, None.
1693
- @param n {int} The <hN> tag number, i.e. `1` for an <h1> tag.
1714
+ @param n {int} (unused) The <hN> tag number, i.e. `1` for an <h1> tag.
1694
1715
  @returns {str} The value for the header tag's "id" attribute. Return
1695
1716
  None to not have an id attribute and to exclude this header from
1696
1717
  the TOC (if the "toc" extra is specified).
@@ -1705,7 +1726,14 @@ class Markdown(object):
1705
1726
 
1706
1727
  return header_id
1707
1728
 
1708
- def _toc_add_entry(self, level, id, name):
1729
+ def _header_id_exists(self, text: str) -> bool:
1730
+ header_id = _slugify(text)
1731
+ prefix = self.extras['header-ids'].get('prefix')
1732
+ if prefix and isinstance(prefix, str):
1733
+ header_id = prefix + '-' + header_id
1734
+ return header_id in self._count_from_header_id or header_id in map(lambda x: x[1], self._toc)
1735
+
1736
+ def _toc_add_entry(self, level: int, id: str, name: str) -> None:
1709
1737
  if level > self._toc_depth:
1710
1738
  return
1711
1739
  if self._toc is None:
@@ -1728,7 +1756,8 @@ class Markdown(object):
1728
1756
  _h_re = re.compile(_h_re_base % '*', re.X | re.M)
1729
1757
  _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
1730
1758
 
1731
- def _h_sub(self, match):
1759
+ def _h_sub(self, match: re.Match) -> str:
1760
+ '''Handles processing markdown headers'''
1732
1761
  if match.group(1) is not None and match.group(3) == "-":
1733
1762
  return match.group(1)
1734
1763
  elif match.group(1) is not None:
@@ -1746,7 +1775,7 @@ class Markdown(object):
1746
1775
  header_id_attr = ""
1747
1776
  if "header-ids" in self.extras:
1748
1777
  header_id = self.header_id_from_text(header_group,
1749
- self.extras["header-ids"], n)
1778
+ self.extras["header-ids"].get('prefix'), n)
1750
1779
  if header_id:
1751
1780
  header_id_attr = ' id="%s"' % header_id
1752
1781
  html = self._run_span_gamut(header_group)
@@ -1754,7 +1783,39 @@ class Markdown(object):
1754
1783
  self._toc_add_entry(n, header_id, html)
1755
1784
  return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1756
1785
 
1757
- def _do_headers(self, text):
1786
+ _h_tag_re = re.compile(r'''
1787
+ ^<h([1-6])(.*)> # \1 tag num, \2 attrs
1788
+ (.*) # \3 text
1789
+ </h\1>
1790
+ ''', re.X | re.M)
1791
+
1792
+ def _h_tag_sub(self, match: re.Match) -> str:
1793
+ '''Different to `_h_sub` in that this function handles existing HTML headers'''
1794
+ text = match.string[match.start(): match.end()]
1795
+ h_level = int(match.group(1))
1796
+ # extract id= attr from tag, trying to account for regex "misses"
1797
+ id_attr = (re.match(r'.*?id=(\S+)?.*', match.group(2) or '') or '')
1798
+ if id_attr:
1799
+ # if id attr exists, extract that
1800
+ id_attr = id_attr.group(1) or ''
1801
+ id_attr = id_attr.strip('\'" ')
1802
+ h_text = match.group(3)
1803
+
1804
+ # check if header was already processed (ie: was a markdown header rather than HTML)
1805
+ if id_attr and self._header_id_exists(id_attr):
1806
+ return text
1807
+
1808
+ # generate new header id if none existed
1809
+ header_id = id_attr or self.header_id_from_text(h_text, self.extras['header-ids'].get('prefix'), h_level)
1810
+ if "toc" in self.extras:
1811
+ self._toc_add_entry(h_level, header_id, h_text)
1812
+ if header_id and not id_attr:
1813
+ # '<h[digit]' + new ID + '...'
1814
+ return text[:3] + ' id="%s"' % header_id + text[3:]
1815
+ return text
1816
+
1817
+ @mark_stage(Stage.HEADERS)
1818
+ def _do_headers(self, text: str) -> str:
1758
1819
  # Setext-style headers:
1759
1820
  # Header 1
1760
1821
  # ========
@@ -1778,7 +1839,7 @@ class Markdown(object):
1778
1839
  _marker_ul = '(?:[%s])' % _marker_ul_chars
1779
1840
  _marker_ol = r'(?:\d+\.)'
1780
1841
 
1781
- def _list_sub(self, match):
1842
+ def _list_sub(self, match: re.Match) -> str:
1782
1843
  lst = match.group(1)
1783
1844
  lst_type = match.group(4) in self._marker_ul_chars and "ul" or "ol"
1784
1845
 
@@ -1796,7 +1857,8 @@ class Markdown(object):
1796
1857
  else:
1797
1858
  return "<%s%s>\n%s</%s>\n\n" % (lst_type, lst_opts, result, lst_type)
1798
1859
 
1799
- def _do_lists(self, text):
1860
+ @mark_stage(Stage.LISTS)
1861
+ def _do_lists(self, text: str) -> str:
1800
1862
  # Form HTML ordered (numbered) and unordered (bulleted) lists.
1801
1863
 
1802
1864
  # Iterate over each *non-overlapping* list match.
@@ -1872,20 +1934,24 @@ class Markdown(object):
1872
1934
 
1873
1935
  _task_list_warpper_str = r'<input type="checkbox" class="task-list-item-checkbox" %sdisabled> %s'
1874
1936
 
1875
- def _task_list_item_sub(self, match):
1937
+ def _task_list_item_sub(self, match: re.Match) -> str:
1876
1938
  marker = match.group(1)
1877
1939
  item_text = match.group(2)
1878
1940
  if marker in ['[x]','[X]']:
1879
- return self._task_list_warpper_str % ('checked ', item_text)
1941
+ return self._task_list_warpper_str % ('checked ', item_text)
1880
1942
  elif marker == '[ ]':
1881
- return self._task_list_warpper_str % ('', item_text)
1943
+ return self._task_list_warpper_str % ('', item_text)
1944
+ # returning None has same effect as returning empty str, but only
1945
+ # one makes the type checker happy
1946
+ return ''
1882
1947
 
1883
1948
  _last_li_endswith_two_eols = False
1884
- def _list_item_sub(self, match):
1949
+ def _list_item_sub(self, match: re.Match) -> str:
1885
1950
  item = match.group(4)
1886
1951
  leading_line = match.group(1)
1887
1952
  if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1888
- item = self._run_block_gamut(self._outdent(item))
1953
+ item = self._uniform_outdent(item, min_outdent=' ', max_outdent=self.tab)[1]
1954
+ item = self._run_block_gamut(item)
1889
1955
  else:
1890
1956
  # Recursion for sub-lists:
1891
1957
  item = self._do_lists(self._uniform_outdent(item, min_outdent=' ')[1])
@@ -1899,7 +1965,7 @@ class Markdown(object):
1899
1965
 
1900
1966
  return "<li>%s</li>\n" % item
1901
1967
 
1902
- def _process_list_items(self, list_str):
1968
+ def _process_list_items(self, list_str: str) -> str:
1903
1969
  # Process the contents of a single ordered or unordered list,
1904
1970
  # splitting it into individual list items.
1905
1971
 
@@ -1930,7 +1996,12 @@ class Markdown(object):
1930
1996
  self.list_level -= 1
1931
1997
  return list_str
1932
1998
 
1933
- def _get_pygments_lexer(self, lexer_name):
1999
+ def _get_pygments_lexer(self, lexer_name: str):
2000
+ '''
2001
+ Returns:
2002
+ `pygments.Lexer` or None if a lexer matching `lexer_name` is
2003
+ not found
2004
+ '''
1934
2005
  try:
1935
2006
  from pygments import lexers, util
1936
2007
  except ImportError:
@@ -1940,7 +2011,21 @@ class Markdown(object):
1940
2011
  except util.ClassNotFound:
1941
2012
  return None
1942
2013
 
1943
- def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
2014
+ def _color_with_pygments(
2015
+ self,
2016
+ codeblock: str,
2017
+ lexer,
2018
+ **formatter_opts
2019
+ ) -> str:
2020
+ '''
2021
+ TODO: this function is only referenced by the `FencedCodeBlocks`
2022
+ extra. May be worth moving over there
2023
+
2024
+ Args:
2025
+ codeblock: the codeblock to highlight
2026
+ lexer (pygments.Lexer): lexer to use
2027
+ formatter_opts: pygments HtmlFormatter options
2028
+ '''
1944
2029
  import pygments
1945
2030
  import pygments.formatters
1946
2031
 
@@ -1973,82 +2058,22 @@ class Markdown(object):
1973
2058
  formatter = HtmlCodeFormatter(**formatter_opts)
1974
2059
  return pygments.highlight(codeblock, lexer, formatter)
1975
2060
 
1976
- def _code_block_sub(self, match, is_fenced_code_block=False):
1977
- lexer_name = None
1978
- if is_fenced_code_block:
1979
- lexer_name = match.group(2)
1980
- codeblock = match.group(3)
1981
- codeblock = codeblock[:-1] # drop one trailing newline
1982
- else:
1983
- codeblock = match.group(1)
1984
- codeblock = self._outdent(codeblock)
1985
- codeblock = self._detab(codeblock)
1986
- codeblock = codeblock.lstrip('\n') # trim leading newlines
1987
- codeblock = codeblock.rstrip() # trim trailing whitespace
1988
-
1989
- # Use pygments only if not using the highlightjs-lang extra
1990
- if lexer_name and "highlightjs-lang" not in self.extras:
1991
- lexer = self._get_pygments_lexer(lexer_name)
1992
- if lexer:
1993
- leading_indent = ' '*(len(match.group(1)) - len(match.group(1).lstrip()))
1994
- return self._code_block_with_lexer_sub(codeblock, leading_indent, lexer, is_fenced_code_block)
2061
+ def _code_block_sub(self, match: re.Match) -> str:
2062
+ codeblock = match.group(1)
2063
+ codeblock = self._outdent(codeblock)
2064
+ codeblock = self._detab(codeblock)
2065
+ codeblock = codeblock.lstrip('\n') # trim leading newlines
2066
+ codeblock = codeblock.rstrip() # trim trailing whitespace
1995
2067
 
1996
2068
  pre_class_str = self._html_class_str_from_tag("pre")
2069
+ code_class_str = self._html_class_str_from_tag("code")
1997
2070
 
1998
- if "highlightjs-lang" in self.extras and lexer_name:
1999
- code_class_str = ' class="%s language-%s"' % (lexer_name, lexer_name)
2000
- else:
2001
- code_class_str = self._html_class_str_from_tag("code")
2002
-
2003
- if is_fenced_code_block:
2004
- # Fenced code blocks need to be outdented before encoding, and then reapplied
2005
- leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip()))
2006
- if codeblock:
2007
- # only run the codeblock through the outdenter if not empty
2008
- leading_indent, codeblock = self._uniform_outdent(codeblock, max_outdent=leading_indent)
2009
-
2010
- codeblock = self._encode_code(codeblock)
2011
-
2012
- if lexer_name == 'mermaid' and 'mermaid' in self.extras:
2013
- return '\n%s<pre class="mermaid-pre"><div class="mermaid">%s\n</div></pre>\n' % (
2014
- leading_indent, codeblock)
2015
-
2016
- return "\n%s<pre%s><code%s>%s\n</code></pre>\n" % (
2017
- leading_indent, pre_class_str, code_class_str, codeblock)
2018
- else:
2019
- codeblock = self._encode_code(codeblock)
2020
-
2021
- return "\n<pre%s><code%s>%s\n</code></pre>\n" % (
2022
- pre_class_str, code_class_str, codeblock)
2023
-
2024
- def _code_block_with_lexer_sub(self, codeblock, leading_indent, lexer, is_fenced_code_block):
2025
- if is_fenced_code_block:
2026
- formatter_opts = self.extras['fenced-code-blocks'] or {}
2027
- else:
2028
- formatter_opts = {}
2029
-
2030
- def unhash_code(codeblock):
2031
- for key, sanitized in list(self.html_spans.items()):
2032
- codeblock = codeblock.replace(key, sanitized)
2033
- replacements = [
2034
- ("&amp;", "&"),
2035
- ("&lt;", "<"),
2036
- ("&gt;", ">")
2037
- ]
2038
- for old, new in replacements:
2039
- codeblock = codeblock.replace(old, new)
2040
- return codeblock
2041
- # remove leading indent from code block
2042
- _, codeblock = self._uniform_outdent(codeblock, max_outdent=leading_indent)
2043
-
2044
- codeblock = unhash_code(codeblock)
2045
- colored = self._color_with_pygments(codeblock, lexer,
2046
- **formatter_opts)
2071
+ codeblock = self._encode_code(codeblock)
2047
2072
 
2048
- # add back the indent to all lines
2049
- return "\n%s\n" % self._uniform_indent(colored, leading_indent, True)
2073
+ return "\n<pre%s><code%s>%s\n</code></pre>\n" % (
2074
+ pre_class_str, code_class_str, codeblock)
2050
2075
 
2051
- def _html_class_str_from_tag(self, tag):
2076
+ def _html_class_str_from_tag(self, tag: str) -> str:
2052
2077
  """Get the appropriate ' class="..."' string (note the leading
2053
2078
  space), if any, for the given tag.
2054
2079
  """
@@ -2064,7 +2089,8 @@ class Markdown(object):
2064
2089
  return ' class="%s"' % html_classes_from_tag[tag]
2065
2090
  return ""
2066
2091
 
2067
- def _do_code_blocks(self, text):
2092
+ @mark_stage(Stage.CODE_BLOCKS)
2093
+ def _do_code_blocks(self, text: str) -> str:
2068
2094
  """Process Markdown `<pre><code>` blocks."""
2069
2095
  code_block_re = re.compile(r'''
2070
2096
  (?:\n\n|\A\n?)
@@ -2082,20 +2108,6 @@ class Markdown(object):
2082
2108
  re.M | re.X)
2083
2109
  return code_block_re.sub(self._code_block_sub, text)
2084
2110
 
2085
- _fenced_code_block_re = re.compile(r'''
2086
- (?:\n+|\A\n?|(?<=\n))
2087
- (^[ \t]*`{3,})\s{0,99}?([\w+-]+)?\s{0,99}?\n # $1 = opening fence (captured for back-referencing), $2 = optional lang
2088
- (.*?) # $3 = code block content
2089
- \1[ \t]*\n # closing fence
2090
- ''', re.M | re.X | re.S)
2091
-
2092
- def _fenced_code_block_sub(self, match):
2093
- return self._code_block_sub(match, is_fenced_code_block=True)
2094
-
2095
- def _do_fenced_code_blocks(self, text):
2096
- """Process ```-fenced unindented code blocks ('fenced-code-blocks' extra)."""
2097
- return self._fenced_code_block_re.sub(self._fenced_code_block_sub, text)
2098
-
2099
2111
  # Rules for a code span:
2100
2112
  # - backslash escapes are not interpreted in a code span
2101
2113
  # - to include one or or a run of more backticks the delimiters must
@@ -2114,12 +2126,13 @@ class Markdown(object):
2114
2126
  (?!`)
2115
2127
  ''', re.X | re.S)
2116
2128
 
2117
- def _code_span_sub(self, match):
2129
+ def _code_span_sub(self, match: re.Match) -> str:
2118
2130
  c = match.group(2).strip(" \t")
2119
2131
  c = self._encode_code(c)
2120
2132
  return "<code%s>%s</code>" % (self._html_class_str_from_tag("code"), c)
2121
2133
 
2122
- def _do_code_spans(self, text):
2134
+ @mark_stage(Stage.CODE_SPANS)
2135
+ def _do_code_spans(self, text: str) -> str:
2123
2136
  # * Backtick quotes are used for <code></code> spans.
2124
2137
  #
2125
2138
  # * You can use multiple backticks as the delimiters if you want to
@@ -2144,7 +2157,7 @@ class Markdown(object):
2144
2157
  # ... type <code>`bar`</code> ...
2145
2158
  return self._code_span_re.sub(self._code_span_sub, text)
2146
2159
 
2147
- def _encode_code(self, text):
2160
+ def _encode_code(self, text: str) -> str:
2148
2161
  """Encode/escape certain characters inside Markdown code runs.
2149
2162
  The point is that in code, these characters are literals,
2150
2163
  and lose their special Markdown meanings.
@@ -2163,160 +2176,14 @@ class Markdown(object):
2163
2176
  self._code_table[text] = hashed
2164
2177
  return hashed
2165
2178
 
2166
- def _wavedrom_block_sub(self, match):
2167
- # if this isn't a wavedrom diagram block, exit now
2168
- if match.group(2) != 'wavedrom':
2169
- return match.string[match.start():match.end()]
2170
-
2171
- # dedent the block for processing
2172
- lead_indent, waves = self._uniform_outdent(match.group(3))
2173
- # default tags to wrap the wavedrom block in
2174
- open_tag, close_tag = '<script type="WaveDrom">\n', '</script>'
2175
-
2176
- # check if the user would prefer to have the SVG embedded directly
2177
- if not isinstance(self.extras['wavedrom'], dict):
2178
- embed_svg = True
2179
- else:
2180
- # default behaviour is to embed SVGs
2181
- embed_svg = self.extras['wavedrom'].get('prefer_embed_svg', True)
2182
-
2183
- if embed_svg:
2184
- try:
2185
- import wavedrom
2186
- waves = wavedrom.render(waves).tostring()
2187
- open_tag, close_tag = '<div>', '\n</div>'
2188
- except ImportError:
2189
- pass
2190
-
2191
- # hash SVG to prevent <> chars being messed with
2192
- self._escape_table[waves] = _hash_text(waves)
2193
-
2194
- return self._uniform_indent(
2195
- '\n%s%s%s\n' % (open_tag, self._escape_table[waves], close_tag),
2196
- lead_indent, include_empty_lines=True
2197
- )
2198
-
2199
- def _do_wavedrom_blocks(self, text):
2200
- return self._fenced_code_block_re.sub(self._wavedrom_block_sub, text)
2201
-
2202
- _admonitions = r'admonition|attention|caution|danger|error|hint|important|note|tip|warning'
2203
- _admonitions_re = re.compile(r'''
2204
- ^(\ *)\.\.\ (%s)::\ * # $1 leading indent, $2 the admonition
2205
- (.*)? # $3 admonition title
2206
- ((?:\s*\n\1\ {3,}.*)+?) # $4 admonition body (required)
2207
- (?=\s*(?:\Z|\n{4,}|\n\1?\ {0,2}\S)) # until EOF, 3 blank lines or something less indented
2208
- ''' % _admonitions,
2209
- re.IGNORECASE | re.MULTILINE | re.VERBOSE
2210
- )
2211
-
2212
- def _do_admonitions_sub(self, match):
2213
- lead_indent, admonition_name, title, body = match.groups()
2214
-
2215
- admonition_type = '<strong>%s</strong>' % admonition_name
2216
-
2217
- # figure out the class names to assign the block
2218
- if admonition_name.lower() == 'admonition':
2219
- admonition_class = 'admonition'
2220
- else:
2221
- admonition_class = 'admonition %s' % admonition_name.lower()
2222
-
2223
- # titles are generally optional
2224
- if title:
2225
- title = '<em>%s</em>' % title
2226
-
2227
- # process the admonition body like regular markdown
2228
- body = self._run_block_gamut("\n%s\n" % self._uniform_outdent(body)[1])
2229
-
2230
- # indent the body before placing inside the aside block
2231
- admonition = self._uniform_indent('%s\n%s\n\n%s\n' % (admonition_type, title, body), self.tab, False)
2232
- # wrap it in an aside
2233
- admonition = '<aside class="%s">\n%s</aside>' % (admonition_class, admonition)
2234
- # now indent the whole admonition back to where it started
2235
- return self._uniform_indent(admonition, lead_indent, False)
2236
-
2237
- def _do_admonitions(self, text):
2238
- return self._admonitions_re.sub(self._do_admonitions_sub, text)
2239
-
2240
- _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
2241
- def _do_strike(self, text):
2242
- text = self._strike_re.sub(r"<s>\1</s>", text)
2243
- return text
2244
-
2245
- _underline_re = re.compile(r"(?<!<!)--(?!>)(?=\S)(.+?)(?<=\S)(?<!<!)--(?!>)", re.S)
2246
- def _do_underline(self, text):
2247
- text = self._underline_re.sub(r"<u>\1</u>", text)
2248
- return text
2249
-
2250
- _tg_spoiler_re = re.compile(r"\|\|\s?(.+?)\s?\|\|", re.S)
2251
- def _do_tg_spoiler(self, text):
2252
- text = self._tg_spoiler_re.sub(r"<tg-spoiler>\1</tg-spoiler>", text)
2253
- return text
2179
+ _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]?)(?<=\S)\1", re.S)
2180
+ _em_re = re.compile(r"(\*|_)(?=\S)(.*?\S)\1", re.S)
2254
2181
 
2255
- _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
2256
- _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
2257
- _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
2258
- _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
2259
- def _do_italics_and_bold(self, text):
2182
+ @mark_stage(Stage.ITALIC_AND_BOLD)
2183
+ def _do_italics_and_bold(self, text: str) -> str:
2260
2184
  # <strong> must go first:
2261
- if "code-friendly" in self.extras:
2262
- text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
2263
- text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
2264
- else:
2265
- text = self._strong_re.sub(r"<strong>\2</strong>", text)
2266
- text = self._em_re.sub(r"<em>\2</em>", text)
2267
- return text
2268
-
2269
- # "smarty-pants" extra: Very liberal in interpreting a single prime as an
2270
- # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
2271
- # "twixt" can be written without an initial apostrophe. This is fine because
2272
- # using scare quotes (single quotation marks) is rare.
2273
- _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
2274
- _contractions = ["tis", "twas", "twer", "neath", "o", "n",
2275
- "round", "bout", "twixt", "nuff", "fraid", "sup"]
2276
- def _do_smart_contractions(self, text):
2277
- text = self._apostrophe_year_re.sub(r"&#8217;\1", text)
2278
- for c in self._contractions:
2279
- text = text.replace("'%s" % c, "&#8217;%s" % c)
2280
- text = text.replace("'%s" % c.capitalize(),
2281
- "&#8217;%s" % c.capitalize())
2282
- return text
2283
-
2284
- # Substitute double-quotes before single-quotes.
2285
- _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
2286
- _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
2287
- _closing_single_quote_re = re.compile(r"(?<=\S)'")
2288
- _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
2289
- def _do_smart_punctuation(self, text):
2290
- """Fancifies 'single quotes', "double quotes", and apostrophes.
2291
- Converts --, ---, and ... into en dashes, em dashes, and ellipses.
2292
-
2293
- Inspiration is: <http://daringfireball.net/projects/smartypants/>
2294
- See "test/tm-cases/smarty_pants.text" for a full discussion of the
2295
- support here and
2296
- <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
2297
- discussion of some diversion from the original SmartyPants.
2298
- """
2299
- if "'" in text: # guard for perf
2300
- text = self._do_smart_contractions(text)
2301
- text = self._opening_single_quote_re.sub("&#8216;", text)
2302
- text = self._closing_single_quote_re.sub("&#8217;", text)
2303
-
2304
- if '"' in text: # guard for perf
2305
- text = self._opening_double_quote_re.sub("&#8220;", text)
2306
- text = self._closing_double_quote_re.sub("&#8221;", text)
2307
-
2308
- text = text.replace("---", "&#8212;")
2309
- text = text.replace("--", "&#8211;")
2310
- text = text.replace("...", "&#8230;")
2311
- text = text.replace(" . . . ", "&#8230;")
2312
- text = text.replace(". . .", "&#8230;")
2313
-
2314
- # TODO: Temporary hack to fix https://github.com/trentm/python-markdown2/issues/150
2315
- if "footnotes" in self.extras and "footnote-ref" in text:
2316
- # Quotes in the footnote back ref get converted to "smart" quotes
2317
- # Change them back here to ensure they work.
2318
- text = text.replace('class="footnote-ref&#8221;', 'class="footnote-ref"')
2319
-
2185
+ text = self._strong_re.sub(r"<strong>\2</strong>", text)
2186
+ text = self._em_re.sub(r"<em>\2</em>", text)
2320
2187
  return text
2321
2188
 
2322
2189
  _block_quote_base = r'''
@@ -2334,10 +2201,10 @@ class Markdown(object):
2334
2201
  _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
2335
2202
  _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
2336
2203
  _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
2337
- def _dedent_two_spaces_sub(self, match):
2204
+ def _dedent_two_spaces_sub(self, match: re.Match) -> str:
2338
2205
  return re.sub(r'(?m)^ ', '', match.group(1))
2339
2206
 
2340
- def _block_quote_sub(self, match):
2207
+ def _block_quote_sub(self, match: re.Match) -> str:
2341
2208
  bq = match.group(1)
2342
2209
  is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
2343
2210
  # trim one level of quoting
@@ -2358,7 +2225,8 @@ class Markdown(object):
2358
2225
  else:
2359
2226
  return '<blockquote>\n%s\n</blockquote>\n\n' % bq
2360
2227
 
2361
- def _do_block_quotes(self, text):
2228
+ @mark_stage(Stage.BLOCK_QUOTES)
2229
+ def _do_block_quotes(self, text: str) -> str:
2362
2230
  if '>' not in text:
2363
2231
  return text
2364
2232
  if 'spoiler' in self.extras:
@@ -2366,7 +2234,8 @@ class Markdown(object):
2366
2234
  else:
2367
2235
  return self._block_quote_re.sub(self._block_quote_sub, text)
2368
2236
 
2369
- def _form_paragraphs(self, text):
2237
+ @mark_stage(Stage.PARAGRAPHS)
2238
+ def _form_paragraphs(self, text: str) -> str:
2370
2239
  # Strip leading and trailing lines:
2371
2240
  text = text.strip('\n')
2372
2241
 
@@ -2396,8 +2265,13 @@ class Markdown(object):
2396
2265
  ):
2397
2266
  start = li.start()
2398
2267
  cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
2399
- assert re.match(r'^<(?:ul|ol).*?>', cuddled_list)
2400
- graf = graf[:start]
2268
+ if re.match(r'^<(?:ul|ol).*?>', cuddled_list):
2269
+ graf = graf[:start]
2270
+ else:
2271
+ # Not quite a cuddled list. (See not_quite_a_list_cuddled_lists test case)
2272
+ # Store as a simple paragraph.
2273
+ graf = cuddled_list
2274
+ cuddled_list = None
2401
2275
 
2402
2276
  # Wrap <p> tags.
2403
2277
  graf = self._run_span_gamut(graf)
@@ -2408,7 +2282,7 @@ class Markdown(object):
2408
2282
 
2409
2283
  return "\n\n".join(grafs)
2410
2284
 
2411
- def _add_footnotes(self, text):
2285
+ def _add_footnotes(self, text: str) -> str:
2412
2286
  if self.footnotes:
2413
2287
  footer = [
2414
2288
  '<div class="footnotes">',
@@ -2421,6 +2295,10 @@ class Markdown(object):
2421
2295
  if not self.footnote_return_symbol:
2422
2296
  self.footnote_return_symbol = "&#8617;"
2423
2297
 
2298
+ # self.footnotes is generated in _strip_footnote_definitions, which runs re.sub on the whole
2299
+ # text. This means that the dict keys are inserted in order of appearance. Use the dict to
2300
+ # sort footnote ids by that same order
2301
+ self.footnote_ids.sort(key=lambda a: list(self.footnotes.keys()).index(a))
2424
2302
  for i, id in enumerate(self.footnote_ids):
2425
2303
  if i != 0:
2426
2304
  footer.append('')
@@ -2455,7 +2333,7 @@ class Markdown(object):
2455
2333
  _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
2456
2334
  _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)
2457
2335
 
2458
- def _encode_amps_and_angles(self, text):
2336
+ def _encode_amps_and_angles(self, text: str) -> str:
2459
2337
  # Smart processing for ampersands and angle brackets that need
2460
2338
  # to be encoded.
2461
2339
  text = _AMPERSAND_RE.sub('&amp;', text)
@@ -2469,9 +2347,9 @@ class Markdown(object):
2469
2347
  text = self._naked_gt_re.sub('&gt;', text)
2470
2348
  return text
2471
2349
 
2472
- _incomplete_tags_re = re.compile(r"<(/?\w+?(?!\w)\s*?.+?[\s/]+?)")
2350
+ _incomplete_tags_re = re.compile(r"<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))")
2473
2351
 
2474
- def _encode_incomplete_tags(self, text):
2352
+ def _encode_incomplete_tags(self, text: str) -> str:
2475
2353
  if self.safe_mode not in ("replace", "escape"):
2476
2354
  return text
2477
2355
 
@@ -2483,13 +2361,13 @@ class Markdown(object):
2483
2361
 
2484
2362
  return self._incomplete_tags_re.sub(incomplete_tags_sub, text)
2485
2363
 
2486
- def _encode_backslash_escapes(self, text):
2364
+ def _encode_backslash_escapes(self, text: str) -> str:
2487
2365
  for ch, escape in list(self._escape_table.items()):
2488
2366
  text = text.replace("\\"+ch, escape)
2489
2367
  return text
2490
2368
 
2491
2369
  _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
2492
- def _auto_link_sub(self, match):
2370
+ def _auto_link_sub(self, match: re.Match) -> str:
2493
2371
  g1 = match.group(1)
2494
2372
  return '<a href="%s">%s</a>' % (self._protect_url(g1), g1)
2495
2373
 
@@ -2503,16 +2381,16 @@ class Markdown(object):
2503
2381
  )
2504
2382
  >
2505
2383
  """, re.I | re.X | re.U)
2506
- def _auto_email_link_sub(self, match):
2384
+ def _auto_email_link_sub(self, match: re.Match) -> str:
2507
2385
  return self._encode_email_address(
2508
2386
  self._unescape_special_chars(match.group(1)))
2509
2387
 
2510
- def _do_auto_links(self, text):
2388
+ def _do_auto_links(self, text: str) -> str:
2511
2389
  text = self._auto_link_re.sub(self._auto_link_sub, text)
2512
2390
  text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
2513
2391
  return text
2514
2392
 
2515
- def _encode_email_address(self, addr):
2393
+ def _encode_email_address(self, addr: str) -> str:
2516
2394
  # Input: an email address, e.g. "foo@example.com"
2517
2395
  #
2518
2396
  # Output: the email address as a mailto link, with each character
@@ -2532,88 +2410,40 @@ class Markdown(object):
2532
2410
  % (''.join(chars), ''.join(chars[7:]))
2533
2411
  return addr
2534
2412
 
2535
- _basic_link_re = re.compile(r'!?\[.*?\]\(.*?\)')
2536
- def _do_link_patterns(self, text):
2537
- link_from_hash = {}
2538
- for regex, repl in self.link_patterns:
2539
- replacements = []
2540
- for match in regex.finditer(text):
2541
- if any(self._match_overlaps_substr(text, match, h) for h in link_from_hash):
2542
- continue
2543
-
2544
- if hasattr(repl, "__call__"):
2545
- href = repl(match)
2546
- else:
2547
- href = match.expand(repl)
2548
- replacements.append((match.span(), href))
2549
- for (start, end), href in reversed(replacements):
2550
-
2551
- # Do not match against links inside brackets.
2552
- if text[start - 1:start] == '[' and text[end:end + 1] == ']':
2553
- continue
2554
-
2555
- # Do not match against links in the standard markdown syntax.
2556
- if text[start - 2:start] == '](' or text[end:end + 2] == '")':
2557
- continue
2558
-
2559
- # Do not match against links which are escaped.
2560
- if text[start - 3:start] == '"""' and text[end:end + 3] == '"""':
2561
- text = text[:start - 3] + text[start:end] + text[end + 3:]
2562
- continue
2563
-
2564
- # search the text for anything that looks like a link
2565
- is_inside_link = False
2566
- for link_re in (self._auto_link_re, self._basic_link_re):
2567
- for match in link_re.finditer(text):
2568
- if any((r[0] <= start and end <= r[1]) for r in match.regs):
2569
- # if the link pattern start and end pos is within the bounds of
2570
- # something that looks like a link, then don't process it
2571
- is_inside_link = True
2572
- break
2573
- else:
2574
- continue
2575
- break
2576
-
2577
- if is_inside_link:
2578
- continue
2579
-
2580
- escaped_href = (
2581
- href.replace('"', '&quot;') # b/c of attr quote
2582
- # To avoid markdown <em> and <strong>:
2583
- .replace('*', self._escape_table['*'])
2584
- .replace('_', self._escape_table['_']))
2585
- link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
2586
- hash = _hash_text(link)
2587
- link_from_hash[hash] = link
2588
- text = text[:start] + hash + text[end:]
2589
- for hash, link in list(link_from_hash.items()):
2590
- text = text.replace(hash, link)
2591
- return text
2592
-
2593
- def _unescape_special_chars(self, text):
2413
+ def _unescape_special_chars(self, text: str) -> str:
2594
2414
  # Swap back in all the special characters we've hidden.
2415
+ hashmap = tuple(self._escape_table.items()) + tuple(self._code_table.items())
2416
+ # html_blocks table is in format {hash: item} compared to usual {item: hash}
2417
+ hashmap += tuple(tuple(reversed(i)) for i in self.html_blocks.items())
2595
2418
  while True:
2596
2419
  orig_text = text
2597
- for ch, hash in list(self._escape_table.items()) + list(self._code_table.items()):
2420
+ for ch, hash in hashmap:
2598
2421
  text = text.replace(hash, ch)
2599
2422
  if text == orig_text:
2600
2423
  break
2601
2424
  return text
2602
2425
 
2603
- def _outdent(self, text):
2426
+ def _outdent(self, text: str) -> str:
2604
2427
  # Remove one level of line-leading tabs or spaces
2605
2428
  return self._outdent_re.sub('', text)
2606
2429
 
2607
- def _uniform_outdent(self, text, min_outdent=None, max_outdent=None):
2608
- # Removes the smallest common leading indentation from each (non empty)
2609
- # line of `text` and returns said indent along with the outdented text.
2610
- # The `min_outdent` kwarg makes sure the smallest common whitespace
2611
- # must be at least this size
2612
- # The `max_outdent` sets the maximum amount a line can be
2613
- # outdented by
2430
+ @staticmethod
2431
+ def _uniform_outdent(
2432
+ text: str,
2433
+ min_outdent: Optional[str] = None,
2434
+ max_outdent: Optional[str] = None
2435
+ ) -> Tuple[str, str]:
2436
+ '''
2437
+ Removes the smallest common leading indentation from each (non empty)
2438
+ line of `text` and returns said indent along with the outdented text.
2439
+
2440
+ Args:
2441
+ min_outdent: make sure the smallest common whitespace is at least this size
2442
+ max_outdent: the maximum amount a line can be outdented by
2443
+ '''
2614
2444
 
2615
2445
  # find the leading whitespace for every line
2616
- whitespace = [
2446
+ whitespace: List[Union[str, None]] = [
2617
2447
  re.findall(r'^[ \t]*', line)[0] if line else None
2618
2448
  for line in text.splitlines()
2619
2449
  ]
@@ -2644,14 +2474,34 @@ class Markdown(object):
2644
2474
 
2645
2475
  return outdent, ''.join(outdented)
2646
2476
 
2647
- def _uniform_indent(self, text, indent, include_empty_lines=False):
2648
- return ''.join(
2649
- (indent + line if line.strip() or include_empty_lines else '')
2650
- for line in text.splitlines(True)
2651
- )
2477
+ @staticmethod
2478
+ def _uniform_indent(
2479
+ text: str,
2480
+ indent: str,
2481
+ include_empty_lines: bool = False,
2482
+ indent_empty_lines: bool = False
2483
+ ) -> str:
2484
+ '''
2485
+ Uniformly indent a block of text by a fixed amount
2486
+
2487
+ Args:
2488
+ text: the text to indent
2489
+ indent: a string containing the indent to apply
2490
+ include_empty_lines: don't remove whitespace only lines
2491
+ indent_empty_lines: indent whitespace only lines with the rest of the text
2492
+ '''
2493
+ blocks = []
2494
+ for line in text.splitlines(True):
2495
+ if line.strip() or indent_empty_lines:
2496
+ blocks.append(indent + line)
2497
+ elif include_empty_lines:
2498
+ blocks.append(line)
2499
+ else:
2500
+ blocks.append('')
2501
+ return ''.join(blocks)
2652
2502
 
2653
2503
  @staticmethod
2654
- def _match_overlaps_substr(text, match, substr):
2504
+ def _match_overlaps_substr(text, match: re.Match, substr: str) -> bool:
2655
2505
  '''
2656
2506
  Checks if a regex match overlaps with a substring in the given text.
2657
2507
  '''
@@ -2676,58 +2526,1093 @@ class MarkdownWithExtras(Markdown):
2676
2526
  - link-patterns (because you need to specify some actual
2677
2527
  link-patterns anyway)
2678
2528
  """
2679
- extras = ["footnotes", "fenced-code-blocks"]
2529
+ extras = ["footnotes", "fenced-code-blocks"] # type: ignore
2680
2530
 
2681
2531
 
2682
- # ---- internal support functions
2532
+ # ----------------------------------------------------------
2533
+ # Extras
2534
+ # ----------------------------------------------------------
2683
2535
 
2536
+ # Base classes
2537
+ # ----------------------------------------------------------
2684
2538
 
2685
- def calculate_toc_html(toc):
2686
- """Return the HTML for the current TOC.
2539
+ class Extra(ABC):
2540
+ _registry: Dict[str, Type['Extra']] = {}
2541
+ _exec_order: Dict[Stage, Tuple[List[Type['Extra']], List[Type['Extra']]]] = {}
2687
2542
 
2688
- This expects the `_toc` attribute to have been set on this instance.
2689
- """
2690
- if toc is None:
2691
- return None
2543
+ name: str
2544
+ '''
2545
+ An identifiable name that users can use to invoke the extra
2546
+ in the Markdown class
2547
+ '''
2548
+ order: Tuple[Collection[Union[Stage, Type['Extra']]], Collection[Union[Stage, Type['Extra']]]]
2549
+ '''
2550
+ Tuple of two iterables containing the stages/extras this extra will run before and
2551
+ after, respectively
2552
+ '''
2692
2553
 
2693
- def indent():
2694
- return ' ' * (len(h_stack) - 1)
2695
- lines = []
2696
- h_stack = [0] # stack of header-level numbers
2697
- for level, id, name in toc:
2698
- if level > h_stack[-1]:
2699
- lines.append("%s<ul>" % indent())
2700
- h_stack.append(level)
2701
- elif level == h_stack[-1]:
2702
- lines[-1] += "</li>"
2703
- else:
2704
- while level < h_stack[-1]:
2705
- h_stack.pop()
2706
- if not lines[-1].endswith("</li>"):
2707
- lines[-1] += "</li>"
2708
- lines.append("%s</ul></li>" % indent())
2709
- lines.append('%s<li><a href="#%s">%s</a>' % (
2710
- indent(), id, name))
2711
- while len(h_stack) > 1:
2712
- h_stack.pop()
2713
- if not lines[-1].endswith("</li>"):
2714
- lines[-1] += "</li>"
2715
- lines.append("%s</ul>" % indent())
2716
- return '\n'.join(lines) + '\n'
2554
+ def __init__(self, md: Markdown, options: Optional[dict]):
2555
+ '''
2556
+ Args:
2557
+ md: An instance of `Markdown`
2558
+ options: a dict of settings to alter the extra's behaviour
2559
+ '''
2560
+ self.md = md
2561
+ self.options = options if options is not None else {}
2717
2562
 
2563
+ @classmethod
2564
+ def deregister(cls):
2565
+ '''
2566
+ Removes the class from the extras registry and unsets its execution order.
2567
+ '''
2568
+ if cls.name in cls._registry:
2569
+ del cls._registry[cls.name]
2718
2570
 
2719
- class UnicodeWithAttrs(str):
2720
- """A subclass of unicode used for the return value of conversion to
2721
- possibly attach some attributes. E.g. the "toc_html" attribute when
2722
- the "toc" extra is used.
2723
- """
2724
- metadata = None
2725
- toc_html = None
2571
+ for exec_order in Extra._exec_order.values():
2572
+ # find everywhere this extra is mentioned and remove it
2573
+ for section in exec_order:
2574
+ while cls in section:
2575
+ section.remove(cls)
2726
2576
 
2727
- ## {{{ http://code.activestate.com/recipes/577257/ (r1)
2728
- _slugify_strip_re = re.compile(r'[^\w\s-]')
2729
- _slugify_hyphenate_re = re.compile(r'[-\s]+')
2730
- def _slugify(value):
2577
+ @classmethod
2578
+ def register(cls):
2579
+ '''
2580
+ Registers the class for use with `Markdown` and calculates its execution order based on
2581
+ the `order` class attribute.
2582
+ '''
2583
+ cls._registry[cls.name] = cls
2584
+
2585
+ for index, item in enumerate((*cls.order[0], *cls.order[1])):
2586
+ before = index < len(cls.order[0])
2587
+ if not isinstance(item, Stage) and issubclass(item, Extra):
2588
+ # eg: FencedCodeBlocks
2589
+ for exec_orders in Extra._exec_order.values():
2590
+ # insert this extra everywhere the other one is mentioned
2591
+ for section in exec_orders:
2592
+ if item in section:
2593
+ to_index = section.index(item)
2594
+ if not before:
2595
+ to_index += 1
2596
+ section.insert(to_index, cls)
2597
+ else:
2598
+ # eg: Stage.PREPROCESS
2599
+ Extra._exec_order.setdefault(item, ([], []))
2600
+ if cls in Extra._exec_order[item][0 if before else 1]:
2601
+ # extra is already runnig after this stage. Don't duplicate that effort
2602
+ continue
2603
+ if before:
2604
+ Extra._exec_order[item][0].insert(0, cls)
2605
+ else:
2606
+ Extra._exec_order[item][1].append(cls)
2607
+
2608
+ @abstractmethod
2609
+ def run(self, text: str) -> str:
2610
+ '''
2611
+ Run the extra against the given text.
2612
+
2613
+ Returns:
2614
+ The new text after being modified by the extra
2615
+ '''
2616
+ ...
2617
+
2618
+ def test(self, text: str) -> bool:
2619
+ '''
2620
+ Check a section of markdown to see if this extra should be run upon it.
2621
+ The default implementation will always return True but it's recommended to override
2622
+ this behaviour to improve performance.
2623
+ '''
2624
+ return True
2625
+
2626
+
2627
+ class ItalicAndBoldProcessor(Extra):
2628
+ '''
2629
+ An ABC that provides hooks for dealing with italics and bold syntax.
2630
+ This class is set to trigger both before AND after the italics and bold stage.
2631
+ This allows any child classes to intercept instances of bold or italic syntax and
2632
+ change the output or hash it to prevent it from being processed.
2633
+
2634
+ After the I&B stage any hashes in the `hash_tables` instance variable are replaced.
2635
+ '''
2636
+ name = 'italic-and-bold-processor'
2637
+ order = (Stage.ITALIC_AND_BOLD,), (Stage.ITALIC_AND_BOLD,)
2638
+
2639
+ strong_re = Markdown._strong_re
2640
+ em_re = Markdown._em_re
2641
+
2642
+ def __init__(self, md: Markdown, options: dict):
2643
+ super().__init__(md, options)
2644
+ self.hash_table = {}
2645
+
2646
+ def run(self, text):
2647
+ if self.md.order < Stage.ITALIC_AND_BOLD:
2648
+ text = self.strong_re.sub(self.sub, text)
2649
+ text = self.em_re.sub(self.sub, text)
2650
+ else:
2651
+ # push any hashed values back, using a while loop to deal with recursive hashes
2652
+ orig_text = ''
2653
+ while orig_text != text:
2654
+ orig_text = text
2655
+ for key, substr in self.hash_table.items():
2656
+ text = text.replace(key, substr)
2657
+ return text
2658
+
2659
+ @abstractmethod
2660
+ def sub(self, match: re.Match) -> str:
2661
+ # do nothing. Let `Markdown._do_italics_and_bold` do its thing later
2662
+ return match.string[match.start(): match.end()]
2663
+
2664
+ def sub_hash(self, match: re.Match) -> str:
2665
+ substr = match.string[match.start(): match.end()]
2666
+ key = _hash_text(substr)
2667
+ self.hash_table[key] = substr
2668
+ return key
2669
+
2670
+ def test(self, text):
2671
+ if self.md.order < Stage.ITALIC_AND_BOLD:
2672
+ return '*' in text or '_' in text
2673
+ return self.hash_table and re.search(r'md5-[0-9a-z]{32}', text)
2674
+
2675
+ # User facing extras
2676
+ # ----------------------------------------------------------
2677
+
2678
+
2679
+ class Admonitions(Extra):
2680
+ '''
2681
+ Enable parsing of RST admonitions
2682
+ '''
2683
+
2684
+ name = 'admonitions'
2685
+ order = (Stage.BLOCK_GAMUT, Stage.LINK_DEFS), ()
2686
+
2687
+ admonitions = r'admonition|attention|caution|danger|error|hint|important|note|tip|warning'
2688
+
2689
+ admonitions_re = re.compile(r'''
2690
+ ^(\ *)\.\.\ (%s)::\ * # $1 leading indent, $2 the admonition
2691
+ (.*)? # $3 admonition title
2692
+ ((?:\s*\n\1\ {3,}.*)+?) # $4 admonition body (required)
2693
+ (?=\s*(?:\Z|\n{4,}|\n\1?\ {0,2}\S)) # until EOF, 3 blank lines or something less indented
2694
+ ''' % admonitions,
2695
+ re.IGNORECASE | re.MULTILINE | re.VERBOSE
2696
+ )
2697
+
2698
+ def test(self, text):
2699
+ return self.admonitions_re.search(text) is not None
2700
+
2701
+ def sub(self, match: re.Match) -> str:
2702
+ lead_indent, admonition_name, title, body = match.groups()
2703
+
2704
+ admonition_type = '<strong>%s</strong>' % admonition_name
2705
+
2706
+ # figure out the class names to assign the block
2707
+ if admonition_name.lower() == 'admonition':
2708
+ admonition_class = 'admonition'
2709
+ else:
2710
+ admonition_class = 'admonition %s' % admonition_name.lower()
2711
+
2712
+ # titles are generally optional
2713
+ if title:
2714
+ title = '<em>%s</em>' % title
2715
+
2716
+ # process the admonition body like regular markdown
2717
+ body = self.md._run_block_gamut("\n%s\n" % self.md._uniform_outdent(body)[1])
2718
+
2719
+ # indent the body before placing inside the aside block
2720
+ admonition = self.md._uniform_indent(
2721
+ '%s\n%s\n\n%s\n' % (admonition_type, title, body),
2722
+ self.md.tab, False
2723
+ )
2724
+ # wrap it in an aside
2725
+ admonition = '<aside class="%s">\n%s</aside>' % (admonition_class, admonition)
2726
+ # now indent the whole admonition back to where it started
2727
+ return self.md._uniform_indent(admonition, lead_indent, False)
2728
+
2729
+ def run(self, text):
2730
+ return self.admonitions_re.sub(self.sub, text)
2731
+
2732
+
2733
+ class Alerts(Extra):
2734
+ '''
2735
+ Markdown Alerts as per
2736
+ https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts
2737
+ '''
2738
+
2739
+ name = 'alerts'
2740
+ order = (), (Stage.BLOCK_QUOTES, )
2741
+
2742
+ alert_re = re.compile(r'''
2743
+ <blockquote>\s*
2744
+ <p>
2745
+ \[!(?P<type>NOTE|TIP|IMPORTANT|WARNING|CAUTION)\]
2746
+ (?P<closing_tag></p>[ \t]*\n?)?
2747
+ (?P<contents>[\s\S]+?)
2748
+ </blockquote>
2749
+ ''', re.X
2750
+ )
2751
+
2752
+ def test(self, text):
2753
+ return "<blockquote>" in text
2754
+
2755
+ def sub(self, match: re.Match) -> str:
2756
+ typ = match["type"].lower()
2757
+ heading = f"<em>{match['type'].title()}</em>"
2758
+ contents = match["contents"].strip()
2759
+ if match["closing_tag"]:
2760
+ return f'<div class="alert {typ}">\n{heading}\n{contents}\n</div>'
2761
+ else:
2762
+ return f'<div class="alert {typ}">\n{heading}\n<p>{contents}\n</div>'
2763
+
2764
+ def run(self, text):
2765
+ return self.alert_re.sub(self.sub, text)
2766
+
2767
+
2768
+ class _BreaksExtraOpts(TypedDict, total=False):
2769
+ '''Options for the `Breaks` extra'''
2770
+ on_backslash: bool
2771
+ '''Replace backslashes at the end of a line with <br>'''
2772
+ on_newline: bool
2773
+ '''Replace single new line characters with <br> when True'''
2774
+
2775
+
2776
+ class Breaks(Extra):
2777
+ name = 'breaks'
2778
+ order = (), (Stage.ITALIC_AND_BOLD,)
2779
+ options: _BreaksExtraOpts
2780
+
2781
+ def run(self, text):
2782
+ on_backslash = self.options.get('on_backslash', False)
2783
+ on_newline = self.options.get('on_newline', False)
2784
+
2785
+ if on_backslash and on_newline:
2786
+ pattern = r' *\\?'
2787
+ elif on_backslash:
2788
+ pattern = r'(?: *\\| {2,})'
2789
+ elif on_newline:
2790
+ pattern = r' *'
2791
+ else:
2792
+ pattern = r' {2,}'
2793
+
2794
+ break_tag = "<br%s\n" % self.md.empty_element_suffix
2795
+ text = re.sub(pattern + r"\n(?!\<(?:\/?(ul|ol|li))\>)", break_tag, text)
2796
+
2797
+ return text
2798
+
2799
+
2800
+ class CodeFriendly(ItalicAndBoldProcessor):
2801
+ '''
2802
+ Disable _ and __ for em and strong.
2803
+ '''
2804
+ name = 'code-friendly'
2805
+
2806
+ def sub(self, match: re.Match) -> str:
2807
+ syntax = match.group(1)
2808
+ text: str = match.string[match.start(): match.end()]
2809
+ if '_' in syntax:
2810
+ # if using _this_ syntax, hash the whole thing so that it doesn't get processed
2811
+ key = _hash_text(text)
2812
+ self.hash_table[key] = text
2813
+ return key
2814
+ elif '_' in text:
2815
+ # if the text within the bold/em markers contains '_' then hash those contents to protect them from em_re
2816
+ text = text[len(syntax): -len(syntax)]
2817
+ key = _hash_text(text)
2818
+ self.hash_table[key] = text
2819
+ return syntax + key + syntax
2820
+ # if no underscores are present, the text is fine and we can just leave it alone
2821
+ return super().sub(match)
2822
+
2823
+
2824
class FencedCodeBlocks(Extra):
    '''
    Allows a code block to not have to be indented
    by fencing it with '```' on a line before and after. Based on
    <http://github.github.com/github-flavored-markdown/> with support for
    syntax highlighting.
    '''

    name = 'fenced-code-blocks'
    order = (Stage.LINK_DEFS, Stage.BLOCK_GAMUT), (Stage.PREPROCESS,)

    fenced_code_block_re = re.compile(r'''
        (?:\n+|\A\n?|(?<=\n))
        (^[ \t]*`{3,})\s{0,99}?([\w+-]+)?\s{0,99}?\n # $1 = opening fence (captured for back-referencing), $2 = optional lang
        (.*?) # $3 = code block content
        \1[ \t]*\n # closing fence
        ''', re.M | re.X | re.S)

    def test(self, text):
        '''
        Decide whether the extra should run on `text` at the current stage.

        Without a '```' in the text the answer is always no. Otherwise the
        extra runs at PREPROCESS when safe mode is off, at LINK_DEFS when
        safe mode is on, and at BLOCK_GAMUT in either case.
        '''
        if '```' not in text:
            return False

        stage = self.md.stage
        safe_mode = self.md.safe_mode
        unsafe_preprocess = stage == Stage.PREPROCESS and not safe_mode
        safe_link_defs = stage == Stage.LINK_DEFS and safe_mode
        if unsafe_preprocess or safe_link_defs:
            return True
        return stage == Stage.BLOCK_GAMUT

    def _code_block_with_lexer_sub(
        self,
        codeblock: str,
        leading_indent: str,
        lexer
    ) -> str:
        '''
        Highlight a code block with pygments.

        Args:
            codeblock: the codeblock to format
            leading_indent: the indentation to prefix the block with
            lexer (pygments.Lexer): the lexer to use

        Returns:
            The highlighted block, re-indented and surrounded by newlines
        '''
        formatter_opts = self.md.extras['fenced-code-blocks'] or {}

        # remove leading indent from code block
        _, source = self.md._uniform_outdent(codeblock, max_outdent=leading_indent)

        # put back any hashed html spans, then undo the entity encoding so
        # that the highlighter receives the raw characters
        for key, sanitized in list(self.md.html_spans.items()):
            source = source.replace(key, sanitized)
        for entity, char in (("&amp;", "&"), ("&lt;", "<"), ("&gt;", ">")):
            source = source.replace(entity, char)

        colored = self.md._color_with_pygments(source, lexer, **formatter_opts)

        # add back the indent to all lines
        return "\n%s\n" % self.md._uniform_indent(colored, leading_indent, True)

    def tags(self, lexer_name: str) -> Tuple[str, str]:
        '''
        Returns the tags that the encoded code block will be wrapped in, based
        upon the lexer name.

        This function can be overridden by subclasses to piggy-back off of the
        fenced code blocks syntax (see `Mermaid` extra).

        Returns:
            The opening and closing tags, as strings within a tuple
        '''
        if "highlightjs-lang" in self.md.extras and lexer_name:
            code_class = ' class="%s language-%s"' % (lexer_name, lexer_name)
        else:
            code_class = self.md._html_class_str_from_tag('code')
        pre_class = self.md._html_class_str_from_tag('pre')
        opening = '<pre%s><code%s>' % (pre_class, code_class)
        return (opening, '</code></pre>')

    def sub(self, match: re.Match) -> str:
        '''
        Convert one fenced block match into HTML.

        Pygments is used when a language is given, the `highlightjs-lang`
        extra is not enabled and a lexer is found; otherwise the block is
        encoded and wrapped in the tags from `tags()`.
        '''
        fence = match.group(1)
        lexer_name = match.group(2)
        # drop one trailing newline
        codeblock = match.group(3)[:-1]

        # indentation of the opening fence, expressed in spaces
        leading_indent = ' ' * (len(fence) - len(fence.lstrip()))

        # Use pygments only if not using the highlightjs-lang extra
        if lexer_name and "highlightjs-lang" not in self.md.extras:
            lexer = self.md._get_pygments_lexer(lexer_name)
            if lexer:
                return self._code_block_with_lexer_sub(codeblock, leading_indent, lexer)

        # Fenced code blocks need to be outdented before encoding, and then reapplied
        if codeblock:
            # only run the codeblock through the outdenter if not empty
            leading_indent, codeblock = self.md._uniform_outdent(codeblock, max_outdent=leading_indent)

            codeblock = self.md._encode_code(codeblock)

        open_tag, close_tag = self.tags(lexer_name)

        return "\n%s%s%s\n%s%s\n" % (leading_indent, open_tag, codeblock, leading_indent, close_tag)

    def run(self, text):
        '''Replace every fenced code block in `text` using `sub`.'''
        return self.fenced_code_block_re.sub(self.sub, text)
2930
+
2931
+
2932
class Latex(Extra):
    '''
    Convert $ and $$ to <math> and </math> tags for inline and block math.
    '''
    name = 'latex'
    order = (Stage.CODE_BLOCKS, FencedCodeBlocks), ()

    _single_dollar_re = re.compile(r'(?<!\$)\$(?!\$)(.*?)\$')
    _double_dollar_re = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)

    # Ways to escape
    _pre_code_block_re = re.compile(r"<pre>(.*?)</pre>", re.DOTALL)  # Wrapped in <pre>
    _triple_re = re.compile(r'```(.*?)```', re.DOTALL)  # Wrapped in a code block ```
    _single_re = re.compile(r'(?<!`)(`)(.*?)(?<!`)\1(?!`)')  # Wrapped in a single `

    # set to the `latex2mathml.converter` module by `run`
    converter = None
    # placeholder -> original code; `run` rebinds this to a fresh dict on the
    # instance so that state is not shared between instances or conversions
    code_blocks = {}

    def _convert_single_match(self, match):
        '''Convert the contents of an inline `$...$` span.'''
        return self.converter.convert(match.group(1))

    def _convert_double_match(self, match):
        '''Convert the contents of a `$$...$$` block in display mode.'''
        # The LaTeX is handed over untouched. Stripping the two-character
        # sequence backslash+n (as `.replace(r"\n", '')` did) mangles every
        # command starting with it: \nu, \neq, \nabla, \not, ...
        return self.converter.convert(match.group(1), display="block")

    def code_placeholder(self, match):
        '''
        Swap a matched code span for an HTML-comment placeholder and remember
        the original text so that it can be restored after math conversion.
        '''
        placeholder = f"<!--CODE_BLOCK_{len(self.code_blocks)}-->"
        self.code_blocks[placeholder] = match.group(0)
        return placeholder

    def run(self, text):
        '''
        Convert all math in `text`, leaving code spans alone.

        Raises:
            ImportError: if the `latex2mathml` package is not installed
        '''
        try:
            import latex2mathml.converter
            self.converter = latex2mathml.converter
        except ImportError as e:
            raise ImportError('The "latex" extra requires the "latex2mathml" package to be installed.') from e

        # start every conversion with an empty, instance-local mapping rather
        # than accumulating entries in the dict shared through the class
        self.code_blocks = {}

        # Escape by replacing with a code block
        text = self._pre_code_block_re.sub(self.code_placeholder, text)
        text = self._single_re.sub(self.code_placeholder, text)
        text = self._triple_re.sub(self.code_placeholder, text)

        text = self._single_dollar_re.sub(self._convert_single_match, text)
        text = self._double_dollar_re.sub(self._convert_double_match, text)

        # Convert placeholder tag back to original code. A later placeholder
        # may wrap an earlier one (e.g. a ``` block that contains an inline
        # code span), so restore newest-first to unwrap from the outside in.
        for placeholder, code_block in reversed(list(self.code_blocks.items())):
            text = text.replace(placeholder, code_block)

        return text
2981
+
2982
+
2983
class LinkPatterns(Extra):
    '''
    Auto-link given regex patterns in text (e.g. bug number
    references, revision number references).
    '''
    name = 'link-patterns'
    order = (Stage.LINKS,), ()
    options: _link_patterns

    _basic_link_re = re.compile(r'!?\[.*?\]\(.*?\)')

    def run(self, text):
        '''
        Wrap every occurrence of the configured patterns in an `<a>` tag.

        Each entry of `self.options` is a `(regex, repl)` pair where `repl`
        is either a callable taking the match or a template for
        `match.expand`. Generated links are hashed while the remaining
        patterns are processed and swapped back in at the end.
        '''
        link_from_hash = {}
        for regex, repl in self.options:
            # collect the spans first; the text is only edited afterwards
            candidates = [
                (found.span(), repl(found) if callable(repl) else found.expand(repl))
                for found in regex.finditer(text)
                if not any(self.md._match_overlaps_substr(text, found, known)
                           for known in link_from_hash)
            ]

            # walk backwards so that edits do not shift the pending spans
            for (start, end), href in reversed(candidates):

                # Do not match against links inside brackets.
                if text[start - 1:start] == '[' and text[end:end + 1] == ']':
                    continue

                # Do not match against links in the standard markdown syntax.
                if text[start - 2:start] == '](' or text[end:end + 2] == '")':
                    continue

                # Do not match against links which are escaped.
                if text[start - 3:start] == '"""' and text[end:end + 3] == '"""':
                    text = text[:start - 3] + text[start:end] + text[end + 3:]
                    continue

                # search the text for anything that looks like a link; if the
                # pattern lies within the bounds of one, don't process it
                is_inside_link = any(
                    any(lo <= start and end <= hi for lo, hi in existing.regs)
                    for link_re in (self.md._auto_link_re, self._basic_link_re)
                    for existing in link_re.finditer(text)
                )
                if is_inside_link:
                    continue

                escaped_href = href.replace('"', '&quot;')  # b/c of attr quote
                # To avoid markdown <em> and <strong>:
                escaped_href = escaped_href.replace('*', self.md._escape_table['*'])
                escaped_href = escaped_href.replace('_', self.md._escape_table['_'])

                link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
                key = _hash_text(link)
                link_from_hash[key] = link
                text = text[:start] + key + text[end:]

        for key, link in list(link_from_hash.items()):
            text = text.replace(key, link)
        return text

    def test(self, text):
        '''Always run.'''
        return True
3053
+
3054
+
3055
class MarkdownInHTML(Extra):
    '''
    Allow the use of `markdown="1"` in a block HTML tag to
    have markdown processing be done on its contents. Similar to
    <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
    some limitations.
    '''
    name = 'markdown-in-html'
    order = (), (Stage.HASH_HTML,)

    def run(self, text):
        '''Hash the HTML blocks found by the strict tag-block scanner.'''
        md = self.md

        def callback(block):
            # outdent, hash, then restore the indent that was taken off
            indent, outdented = md._uniform_outdent(block)
            hashed = md._hash_html_block_sub(outdented)
            return md._uniform_indent(hashed, indent, include_empty_lines=True, indent_empty_lines=False)

        return md._strict_tag_block_sub(text, md._block_tags_a, callback, True)

    def test(self, text):
        '''Always run.'''
        return True
3076
+
3077
+
3078
class Mermaid(FencedCodeBlocks):
    '''
    Fenced code blocks whose language is `mermaid` are wrapped in mermaid
    container tags; every other block is handled by `FencedCodeBlocks`.
    '''
    name = 'mermaid'
    order = (FencedCodeBlocks,), ()

    def tags(self, lexer_name):
        '''Return the (opening, closing) tags for the given language.'''
        if lexer_name != 'mermaid':
            return super().tags(lexer_name)
        return ('<pre class="mermaid-pre"><div class="mermaid">', '</div></pre>')
3086
+
3087
+
3088
class MiddleWordEm(ItalicAndBoldProcessor):
    '''
    Allows or disallows emphasis syntax in the middle of words,
    defaulting to allow. Disabling this means that `this_text_here` will not be
    converted to `this<em>text</em>here`.
    '''
    name = 'middle-word-em'
    order = (CodeFriendly,), (Stage.ITALIC_AND_BOLD,)

    def __init__(self, md: Markdown, options: Union[dict, bool]):
        '''
        Args:
            md: the markdown instance
            options: can be bool for backwards compatibility but will be converted to a dict
                in the constructor. All options are:
                - allowed (bool): whether to allow emphasis in the middle of a word.
                If `options` is a bool it will be placed under this key.
        '''
        opts = {'allowed': options} if isinstance(options, bool) else options
        opts.setdefault('allowed', True)
        super().__init__(md, opts)

        # keep the unrestricted pattern around; `run` uses it to hash leftovers
        liberal = self.em_re
        self.liberal_em_re = liberal
        if not opts['allowed']:
            # same pattern, but anchored to word boundaries on both sides
            self.em_re = re.compile(r'(?<=\b)%s(?=\b)' % liberal.pattern, liberal.flags)

    def run(self, text):
        '''
        Process strong and strict em syntax via the parent class, then hash
        whatever the liberal em pattern still matches when
        `self.md.order < self.md.stage`.
        '''
        # run strong and whatnot first
        # this also will process all strict ems
        processed = super().run(text)
        if self.md.order < self.md.stage:
            # hash all non-valid ems
            return self.liberal_em_re.sub(self.sub_hash, processed)
        return processed

    def sub(self, match: re.Match) -> str:
        '''Emit `<em>` for single-character markers; defer the rest to the parent.'''
        if len(match.group(1)) == 1:
            return '<em>%s</em>' % match.group(2)
        # strong syntax
        return super().sub(match)
3130
+
3131
+
3132
class Numbering(Extra):
    '''
    Support of generic counters. Non standard extension to
    allow sequential numbering of figures, tables, equations, exhibits etc.
    '''

    name = 'numbering'
    order = (Stage.LINK_DEFS,), ()

    def run(self, text):
        '''
        Number the counter definitions in `text` and resolve references.

        `[#counter text @id text]` becomes a `<figcaption>` carrying the next
        number of `counter`; `[@id]` becomes a link showing that number, or
        `?id?` when no such definition exists.
        '''
        regex_defns = re.compile(r'''
            \[\#(\w+) # the counter. Open square plus hash plus a word \1
            ([^@]*) # Some optional characters, that aren't an @. \2
            @(\w+) # the id. Should this be normed? \3
            ([^\]]*)\] # The rest of the text up to the terminating ] \4
            ''', re.VERBOSE)
        regex_subs = re.compile(r"\[@(\w+)\s*\]")  # [@ref_id]
        definition_html = '<figcaption class="{}" id="counter-ref-{}">{}{}{}</figcaption>'
        reference_html = '<a class="{}" href="#counter-ref-{}">{}</a>'

        counters = {}    # counter name -> next number to hand out
        references = {}  # ref id -> (number, counter name)

        def define(match):
            counter, text_before, ref_id, text_after = match.groups()
            number = counters.get(counter, 1)
            counters[counter] = number + 1
            references[ref_id] = (number, counter)
            return definition_html.format(counter, ref_id, text_before.strip(), number, text_after)

        def refer(match):
            ref_id = match.group(1)
            number, counter = references.get(ref_id, (None, None))
            if number is None:
                repl = reference_html.format(ref_id, 'countererror', '?' + ref_id + '?')
            else:
                repl = reference_html.format(counter, ref_id, number)
            if "smarty-pants" in self.md.extras:
                repl = repl.replace('"', self.md._escape_table['"'])
            return repl

        # First pass to define all the references
        text = regex_defns.sub(define, text)
        # Second pass to replace the references with the right
        # value of the counter
        return regex_subs.sub(refer, text)
3195
+
3196
+
3197
class PyShell(Extra):
    '''
    Treats unindented Python interactive shell sessions as <code>
    blocks.
    '''

    name = 'pyshell'
    order = (), (Stage.LISTS,)

    def test(self, text):
        '''Only run when the text contains a `>>>` prompt.'''
        return ">>>" in text

    def sub(self, match: re.Match) -> str:
        '''
        Turn one matched session into a code block.

        With the `fenced-code-blocks` extra enabled the session is wrapped in
        a ```pycon fence and handed to that extra; otherwise every line is
        indented by one tab width.
        '''
        session = match.group(0)

        if "fenced-code-blocks" in self.md.extras:
            fenced = "```pycon\n" + _dedent(session) + "```\n"
            return self.md.extra_classes['fenced-code-blocks'].run(fenced)

        lines = session.splitlines()
        _dedentlines(lines)
        indent = ' ' * self.md.tab_width
        # the leading newline separates from a possible cuddled paragraph
        return '\n%s%s\n' % (indent, ('\n' + indent).join(lines))

    def run(self, text):
        '''Find every shell session in `text` and convert it with `sub`.'''
        less_than_tab = self.md.tab_width - 1
        _pyshell_block_re = re.compile(r"""
            ^([ ]{0,%d})>>>[ ].*\n # first line
            ^(\1[^\S\n]*\S.*\n)* # any number of subsequent lines with at least one character
            (?=^\1?\n|\Z) # ends with a blank line or end of document
            """ % less_than_tab, re.M | re.X)

        return _pyshell_block_re.sub(self.sub, text)
3231
+
3232
+
3233
class SmartyPants(Extra):
    '''
    Replaces ' and " with curly quotation marks or curly
    apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
    and ellipses.
    '''
    name = 'smarty-pants'
    order = (), (Stage.SPAN_GAMUT,)

    _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
    _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
    _closing_single_quote_re = re.compile(r"(?<=\S)'")
    _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
    # "smarty-pants" extra: Very liberal in interpreting a single prime as an
    # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
    # "twixt" can be written without an initial apostrophe. This is fine because
    # using scare quotes (single quotation marks) is rare.
    _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
    _contractions = ["tis", "twas", "twer", "neath", "o", "n",
                     "round", "bout", "twixt", "nuff", "fraid", "sup"]

    def contractions(self, text: str) -> str:
        '''
        Replace the apostrophe in two-digit years ('99) and in front of the
        known contractions, in lower-case and capitalized form, with &#8217;.
        '''
        text = self._apostrophe_year_re.sub(r"&#8217;\1", text)
        for word in self._contractions:
            for form in (word, word.capitalize()):
                text = text.replace("'%s" % form, "&#8217;%s" % form)
        return text

    def run(self, text):
        """Fancifies 'single quotes', "double quotes", and apostrophes.
        Converts --, ---, and ... into en dashes, em dashes, and ellipses.

        Inspiration is: <http://daringfireball.net/projects/smartypants/>
        See "test/tm-cases/smarty_pants.text" for a full discussion of the
        support here and
        <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
        discussion of some diversion from the original SmartyPants.
        """
        if "'" in text:  # guard for perf
            text = self.contractions(text)
            text = self._opening_single_quote_re.sub("&#8216;", text)
            text = self._closing_single_quote_re.sub("&#8217;", text)

        if '"' in text:  # guard for perf
            text = self._opening_double_quote_re.sub("&#8220;", text)
            text = self._closing_double_quote_re.sub("&#8221;", text)

        # order matters: the longer dash run has to be consumed first
        punctuation = (
            ("---", "&#8212;"),
            ("--", "&#8211;"),
            ("...", "&#8230;"),
            (" . . . ", "&#8230;"),
            (". . .", "&#8230;"),
        )
        for plain, entity in punctuation:
            text = text.replace(plain, entity)

        # TODO: Temporary hack to fix https://github.com/trentm/python-markdown2/issues/150
        if "footnotes" in self.md.extras and "footnote-ref" in text:
            # Quotes in the footnote back ref get converted to "smart" quotes
            # Change them back here to ensure they work.
            text = text.replace('class="footnote-ref&#8221;', 'class="footnote-ref"')

        return text

    def test(self, text):
        '''Only run when the text contains a quote character.'''
        return "'" in text or '"' in text
3298
+
3299
+
3300
class Strike(Extra):
    '''
    Text inside of double tilde is ~~strikethrough~~
    '''
    name = 'strike'
    order = (Stage.ITALIC_AND_BOLD,), ()

    _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)

    def test(self, text):
        '''Only run when the text contains a double tilde.'''
        return '~~' in text

    def run(self, text):
        '''Wrap the contents of every `~~...~~` span in `<s>` tags.'''
        def strike(match):
            return "<s>%s</s>" % match.group(1)

        return self._strike_re.sub(strike, text)
3314
+
3315
+
3316
class Tables(Extra):
    '''
    Tables using the same format as GFM
    <https://help.github.com/articles/github-flavored-markdown#tables> and
    PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
    '''
    name = 'tables'
    order = (), (Stage.LISTS,)

    def run(self, text):
        """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
        https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
        """
        less_than_tab = self.md.tab_width - 1
        table_re = re.compile(r'''
            (?:(?<=\n)|\A\n?) # leading blank line

            ^[ ]{0,%d} # allowed whitespace
            (.*[|].*)[ ]*\n # $1: header row (at least one pipe)

            ^[ ]{0,%d} # allowed whitespace
            ( # $2: underline row
            # underline row with leading bar
            (?: \|\ *:?-+:?\ * )+ \|? \s?[ ]*\n
            |
            # or, underline row without leading bar
            (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \s?[ ]*\n
            )

            ( # $3: data rows
            (?:
            ^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
            .*\|.*[ ]*\n
            )+
            )
            ''' % ((less_than_tab,) * 3), re.M | re.X)
        return table_re.sub(self.sub, text)

    def sub(self, match: re.Match) -> str:
        '''
        Build the `<table>` HTML for one matched table.

        Group 1 of the match is the header row, group 2 the underline row
        (which determines column alignment) and group 3 the data rows.
        '''
        trim_space_re = '^[ \t\n]+|[ \t\n]+$'
        trim_bar_re = r'^\||\|$'
        split_bar_re = r'^\||(?<![\`\\])\|'
        escape_bar_re = r'\\\|'

        def split_cells(row):
            # strip outer whitespace and outer bars, split on the unescaped
            # bars that remain, then turn escaped bars into literal ones
            trimmed = re.sub(trim_bar_re, "", re.sub(trim_space_re, "", row))
            return [re.sub(escape_bar_re, '|', cell.strip())
                    for cell in re.split(split_bar_re, trimmed)]

        head, underline, body = match.groups()

        # Determine aligns for columns.
        align_from_col_idx = {}
        for col_idx, col in enumerate(split_cells(underline)):
            colon_left = col[0] == ':'
            colon_right = col[-1] == ':'
            if colon_left and colon_right:
                align_from_col_idx[col_idx] = ' style="text-align:center;"'
            elif colon_left:
                align_from_col_idx[col_idx] = ' style="text-align:left;"'
            elif colon_right:
                align_from_col_idx[col_idx] = ' style="text-align:right;"'

        def render_cells(tag, row):
            return [
                ' <%s%s>%s</%s>' % (tag, align_from_col_idx.get(col_idx, ''), self.md._run_span_gamut(col), tag)
                for col_idx, col in enumerate(split_cells(row))
            ]

        # thead
        hlines = [
            '<table%s>' % self.md._html_class_str_from_tag('table'),
            '<thead%s>' % self.md._html_class_str_from_tag('thead'),
            '<tr>',
        ]
        hlines.extend(render_cells('th', head))
        hlines.append('</tr>')
        hlines.append('</thead>')

        # tbody
        hlines.append('<tbody>')
        for line in body.strip('\n').split('\n'):
            hlines.append('<tr>')
            hlines.extend(render_cells('td', line))
            hlines.append('</tr>')
        hlines.append('</tbody>')
        hlines.append('</table>')

        return '\n'.join(hlines) + '\n'
3399
+
3400
+
3401
class TelegramSpoiler(Extra):
    '''
    Text inside of double pipes is wrapped in `<tg-spoiler>` tags.
    '''
    name = 'tg-spoiler'
    order = (), (Stage.ITALIC_AND_BOLD,)

    _tg_spoiler_re = re.compile(r"\|\|\s?(.+?)\s?\|\|", re.S)

    def test(self, text):
        '''Only run when the text contains a double pipe.'''
        return '||' in text

    def run(self, text):
        '''Replace every `||...||` span with a `<tg-spoiler>` element.'''
        def spoiler(match):
            return "<tg-spoiler>%s</tg-spoiler>" % match.group(1)

        return self._tg_spoiler_re.sub(spoiler, text)
3412
+
3413
+
3414
class Underline(Extra):
    '''
    Text inside of double dash is --underlined--.
    '''
    name = 'underline'
    order = (Stage.ITALIC_AND_BOLD,), ()

    # the lookarounds keep the `<!--` and `-->` of HTML comments from matching
    _underline_re = re.compile(r"(?<!<!)--(?!>)(?=\S)(.+?)(?<=\S)(?<!<!)--(?!>)", re.S)

    def test(self, text):
        '''Only run when the text contains a double dash.'''
        return '--' in text

    def run(self, text):
        '''Wrap the contents of every `--...--` span in `<u>` tags.'''
        def underline(match):
            return "<u>%s</u>" % match.group(1)

        return self._underline_re.sub(underline, text)
3428
+
3429
+
3430
+ class _WavedromExtraOpts(TypedDict, total=False):
3431
+ '''Options for the `Wavedrom` extra'''
3432
+ prefer_embed_svg: bool
3433
+ '''
3434
+ Use the `wavedrom` library to convert diagrams to SVGs and embed them directly.
3435
+ This will only work if the `wavedrom` library has been installed.
3436
+
3437
+ Defaults to `True`
3438
+ '''
3439
+
3440
+
3441
class Wavedrom(Extra):
    '''
    Support for generating Wavedrom digital timing diagrams
    '''
    name = 'wavedrom'
    order = (Stage.CODE_BLOCKS, FencedCodeBlocks), ()
    options: _WavedromExtraOpts

    def test(self, text):
        '''
        Run only when some fenced code block in `text` is tagged `wavedrom`.

        Every fenced block is inspected, not just the first one, so that a
        wavedrom block following a block in another language is still found.
        '''
        return any(
            block.group(2) == 'wavedrom'
            for block in FencedCodeBlocks.fenced_code_block_re.finditer(text)
        )

    def sub(self, match: re.Match) -> str:
        '''
        Convert one fenced block match.

        Blocks in any language other than `wavedrom` are returned unchanged.
        Wavedrom blocks are rendered to an embedded SVG when the
        `prefer_embed_svg` option allows it and the `wavedrom` package can be
        imported, and wrapped in a `<script type="WaveDrom">` tag otherwise.
        '''
        # `run` applies the generic fenced-block regex, which matches every
        # language; leave the blocks that are not ours exactly as they were
        if match.group(2) != 'wavedrom':
            return match.group(0)

        # dedent the block for processing
        lead_indent, waves = self.md._uniform_outdent(match.group(3))
        # default tags to wrap the wavedrom block in
        open_tag, close_tag = '<script type="WaveDrom">\n', '</script>'

        # check if the user would prefer to have the SVG embedded directly
        embed_svg = self.options.get('prefer_embed_svg', True)

        if embed_svg:
            try:
                import wavedrom
                waves = wavedrom.render(waves).tostring()
                open_tag, close_tag = '<div>', '\n</div>'
            except ImportError:
                # deliberate best effort: without the library, keep the
                # <script> fallback set up above
                pass

        # hash SVG to prevent <> chars being messed with
        self.md._escape_table[waves] = _hash_text(waves)

        return self.md._uniform_indent(
            '\n%s%s%s\n' % (open_tag, self.md._escape_table[waves], close_tag),
            lead_indent, include_empty_lines=True
        )

    def run(self, text):
        '''Apply `sub` to every fenced code block in `text`.'''
        return FencedCodeBlocks.fenced_code_block_re.sub(self.sub, text)
3480
+
3481
+
3482
class WikiTables(Extra):
    '''
    Google Code Wiki-style tables. See
    <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
    '''
    name = 'wiki-tables'
    order = (Tables,), ()

    def run(self, text):
        '''Find every wiki-style table in `text` and convert it with `sub`.'''
        less_than_tab = self.md.tab_width - 1
        wiki_table_re = re.compile(r'''
            (?:(?<=\n\n)|\A\n?) # leading blank line
            ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n # first line
            (^\1\|\|.+?\|\|\n)* # any number of subsequent lines
            ''' % less_than_tab, re.M | re.X)
        return wiki_table_re.sub(self.sub, text)

    def sub(self, match: re.Match) -> str:
        '''
        Build the `<table>` HTML for one matched wiki table.

        A first row whose first cell starts with `~` is emitted as the
        header row; all remaining rows go into `<tbody>`.
        '''
        ttext = match.group(0).strip()
        rows = []
        for line in ttext.splitlines():
            # drop the outer `||` pair, then split on unescaped `||`
            line = line.strip()[2:-2].strip()
            rows.append([c.strip() for c in re.split(r'(?<!\\)\|\|', line)])

        hlines = []

        def add_hline(line, indents=0):
            hlines.append((self.md.tab * indents) + line)

        def format_cell(text):
            # Strip the header marker and render the cell's inline markdown.
            # This works on its own argument; reading the enclosing loop
            # variable instead only worked while every caller happened to
            # pass a variable named `cell`.
            return self.md._run_span_gamut(re.sub(r"^\s*~", "", text).strip(" "))

        add_hline('<table%s>' % self.md._html_class_str_from_tag('table'))
        # Check if first cell of first row is a header cell. If so, assume the whole row is a header row.
        if rows and rows[0] and re.match(r"^\s*~", rows[0][0]):
            add_hline('<thead%s>' % self.md._html_class_str_from_tag('thead'), 1)
            add_hline('<tr>', 2)
            for cell in rows[0]:
                add_hline("<th>{}</th>".format(format_cell(cell)), 3)
            add_hline('</tr>', 2)
            add_hline('</thead>', 1)
            # Only one header row allowed.
            rows = rows[1:]
        # If no more rows, don't create a tbody.
        if rows:
            add_hline('<tbody>', 1)
            for row in rows:
                add_hline('<tr>', 2)
                for cell in row:
                    add_hline('<td>{}</td>'.format(format_cell(cell)), 3)
                add_hline('</tr>', 2)
            add_hline('</tbody>', 1)
        add_hline('</table>')
        return '\n'.join(hlines) + '\n'

    def test(self, text):
        '''Only run when the text contains a double pipe.'''
        return '||' in text
3540
+
3541
+
3542
+ # Register extras
3543
# Registration happens in this exact sequence.
for _extra_cls in (
    Admonitions,
    Alerts,
    Breaks,
    CodeFriendly,
    FencedCodeBlocks,
    Latex,
    LinkPatterns,
    MarkdownInHTML,
    MiddleWordEm,
    Mermaid,
    Numbering,
    PyShell,
    SmartyPants,
    Strike,
    Tables,
    TelegramSpoiler,
    Underline,
    Wavedrom,
    WikiTables,
):
    _extra_cls.register()
del _extra_cls
3562
+
3563
+
3564
+ # ----------------------------------------------------------
3565
+
3566
+
3567
+ # ---- internal support functions
3568
+
3569
+
3570
def calculate_toc_html(toc: Union[List[Tuple[int, str, str]], None]) -> Optional[str]:
    """Return the HTML for the given TOC.

    Args:
        toc: `(level, id, name)` entries in document order, or None

    Returns:
        Nested `<ul>` markup with one `<li><a href="#id">name</a>` per
        entry, terminated by a newline; None when `toc` is None.
    """
    if toc is None:
        return None

    out = []
    open_levels = [0]  # stack of header-level numbers

    def pad():
        return ' ' * (len(open_levels) - 1)

    def close_last_item():
        if not out[-1].endswith("</li>"):
            out[-1] += "</li>"

    for level, anchor, title in toc:
        current = open_levels[-1]
        if level > current:
            # deeper header: open a nested list
            out.append("%s<ul>" % pad())
            open_levels.append(level)
        elif level == current:
            # sibling: finish the previous item
            out[-1] += "</li>"
        else:
            # shallower header: unwind every list deeper than this level
            while level < open_levels[-1]:
                open_levels.pop()
                close_last_item()
                out.append("%s</ul></li>" % pad())
        out.append('%s<li><a href="#%s">%s</a>' % (pad(), anchor, title))

    # close whatever is still open
    while len(open_levels) > 1:
        open_levels.pop()
        close_last_item()
        out.append("%s</ul>" % pad())
    return '\n'.join(out) + '\n'
3602
+
3603
+
3604
class UnicodeWithAttrs(str):
    """A `str` subclass used for the return value of conversion so that
    extra attributes can be attached to the result, e.g. the "toc_html"
    attribute when the "toc" extra is used.
    """
    # both attributes default to None on the class
    toc_html: Optional[str] = None
    metadata: Optional[Dict[str, str]] = None
3611
+
3612
+ ## {{{ http://code.activestate.com/recipes/577257/ (r1)
3613
+ _slugify_strip_re = re.compile(r'[^\w\s-]')
3614
+ _slugify_hyphenate_re = re.compile(r'[-\s]+')
3615
+ def _slugify(value: str) -> str:
2731
3616
  """
2732
3617
  Normalizes string, converts to lowercase, removes non-alpha characters,
2733
3618
  and converts spaces to hyphens.
@@ -2735,15 +3620,14 @@ def _slugify(value):
2735
3620
  From Django's "django/template/defaultfilters.py".
2736
3621
  """
2737
3622
  import unicodedata
2738
- value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
3623
+ value = unicodedata.normalize('NFKD', value).encode('utf-8', 'ignore').decode()
2739
3624
  value = _slugify_strip_re.sub('', value).strip().lower()
2740
3625
  return _slugify_hyphenate_re.sub('-', value)
2741
3626
  ## end of http://code.activestate.com/recipes/577257/ }}}
2742
3627
 
2743
3628
 
2744
3629
  # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
2745
- def _curry(*args, **kwargs):
2746
- function, args = args[0], args[1:]
3630
+ def _curry(function: Callable, *args, **kwargs) -> Callable:
2747
3631
  def result(*rest, **kwrest):
2748
3632
  combined = kwargs.copy()
2749
3633
  combined.update(kwrest)
@@ -2752,7 +3636,7 @@ def _curry(*args, **kwargs):
2752
3636
 
2753
3637
 
2754
3638
  # Recipe: regex_from_encoded_pattern (1.0)
2755
- def _regex_from_encoded_pattern(s):
3639
+ def _regex_from_encoded_pattern(s: str) -> re.Pattern:
2756
3640
  """'foo' -> re.compile(re.escape('foo'))
2757
3641
  '/foo/' -> re.compile('foo')
2758
3642
  '/foo/i' -> re.compile('foo', re.I)
@@ -2782,7 +3666,7 @@ def _regex_from_encoded_pattern(s):
2782
3666
 
2783
3667
 
2784
3668
  # Recipe: dedent (0.1.2)
2785
- def _dedentlines(lines, tabsize=8, skip_first_line=False):
3669
+ def _dedentlines(lines: List[str], tabsize: int = 8, skip_first_line: bool = False) -> List[str]:
2786
3670
  """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
2787
3671
 
2788
3672
  "lines" is a list of lines to dedent.
@@ -2800,7 +3684,8 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
2800
3684
  % (tabsize, skip_first_line))
2801
3685
  margin = None
2802
3686
  for i, line in enumerate(lines):
2803
- if i == 0 and skip_first_line: continue
3687
+ if i == 0 and skip_first_line:
3688
+ continue
2804
3689
  indent = 0
2805
3690
  for ch in line:
2806
3691
  if ch == ' ':
@@ -2813,16 +3698,19 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
2813
3698
  break
2814
3699
  else:
2815
3700
  continue # skip all-whitespace lines
2816
- if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
3701
+ if DEBUG:
3702
+ print("dedent: indent=%d: %r" % (indent, line))
2817
3703
  if margin is None:
2818
3704
  margin = indent
2819
3705
  else:
2820
3706
  margin = min(margin, indent)
2821
- if DEBUG: print("dedent: margin=%r" % margin)
3707
+ if DEBUG:
3708
+ print("dedent: margin=%r" % margin)
2822
3709
 
2823
3710
  if margin is not None and margin > 0:
2824
3711
  for i, line in enumerate(lines):
2825
- if i == 0 and skip_first_line: continue
3712
+ if i == 0 and skip_first_line:
3713
+ continue
2826
3714
  removed = 0
2827
3715
  for j, ch in enumerate(line):
2828
3716
  if ch == ' ':
@@ -2830,7 +3718,8 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
2830
3718
  elif ch == '\t':
2831
3719
  removed += tabsize - (removed % tabsize)
2832
3720
  elif ch in '\r\n':
2833
- if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
3721
+ if DEBUG:
3722
+ print("dedent: %r: EOL -> strip up to EOL" % line)
2834
3723
  lines[i] = lines[i][j:]
2835
3724
  break
2836
3725
  else:
@@ -2852,7 +3741,7 @@ def _dedentlines(lines, tabsize=8, skip_first_line=False):
2852
3741
  return lines
2853
3742
 
2854
3743
 
2855
- def _dedent(text, tabsize=8, skip_first_line=False):
3744
+ def _dedent(text: str, tabsize: int = 8, skip_first_line: bool = False) -> str:
2856
3745
  """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
2857
3746
 
2858
3747
  "text" is the text to dedent.
@@ -2863,7 +3752,7 @@ def _dedent(text, tabsize=8, skip_first_line=False):
2863
3752
 
2864
3753
  textwrap.dedent(s), but don't expand tabs to spaces
2865
3754
  """
2866
- lines = text.splitlines(1)
3755
+ lines = text.splitlines(True)
2867
3756
  _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
2868
3757
  return ''.join(lines)
2869
3758
 
@@ -2895,7 +3784,7 @@ class _memoized(object):
2895
3784
  return self.func.__doc__
2896
3785
 
2897
3786
 
2898
- def _xml_oneliner_re_from_tab_width(tab_width):
3787
+ def _xml_oneliner_re_from_tab_width(tab_width: int) -> re.Pattern:
2899
3788
  """Standalone XML processing instruction regex."""
2900
3789
  return re.compile(r"""
2901
3790
  (?:
@@ -2917,7 +3806,7 @@ def _xml_oneliner_re_from_tab_width(tab_width):
2917
3806
  _xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
2918
3807
 
2919
3808
 
2920
- def _hr_tag_re_from_tab_width(tab_width):
3809
+ def _hr_tag_re_from_tab_width(tab_width: int) -> re.Pattern:
2921
3810
  return re.compile(r"""
2922
3811
  (?:
2923
3812
  (?<=\n\n) # Starting after a blank line
@@ -2937,7 +3826,7 @@ def _hr_tag_re_from_tab_width(tab_width):
2937
3826
  _hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
2938
3827
 
2939
3828
 
2940
- def _xml_escape_attr(attr, skip_single_quote=True):
3829
+ def _xml_escape_attr(attr: str, skip_single_quote: bool = True) -> str:
2941
3830
  """Escape the given string for use in an HTML/XML tag attribute.
2942
3831
 
2943
3832
  By default this doesn't bother with escaping `'` to `&#39;`, presuming that
@@ -2954,7 +3843,7 @@ def _xml_escape_attr(attr, skip_single_quote=True):
2954
3843
  return escaped
2955
3844
 
2956
3845
 
2957
- def _xml_encode_email_char_at_random(ch):
3846
+ def _xml_encode_email_char_at_random(ch: str) -> str:
2958
3847
  r = random()
2959
3848
  # Roughly 10% raw, 45% hex, 45% dec.
2960
3849
  # '@' *must* be encoded. I [John Gruber] insist.
@@ -2968,14 +3857,25 @@ def _xml_encode_email_char_at_random(ch):
2968
3857
  return '&#%s;' % ord(ch)
2969
3858
 
2970
3859
 
2971
- def _html_escape_url(attr, safe_mode=False):
2972
- """Replace special characters that are potentially malicious in url string."""
3860
+ def _html_escape_url(
3861
+ attr: str,
3862
+ safe_mode: Union[_safe_mode, bool, None] = False,
3863
+ charset: Optional[str] = None
3864
+ ):
3865
+ """
3866
+ Replace special characters that are potentially malicious in url string.
3867
+
3868
+ Args:
3869
+ charset: don't escape characters from this charset. Currently the only
3870
+ exception is for '+' when charset=='base64'
3871
+ """
2973
3872
  escaped = (attr
2974
3873
  .replace('"', '&quot;')
2975
3874
  .replace('<', '&lt;')
2976
3875
  .replace('>', '&gt;'))
2977
3876
  if safe_mode:
2978
- escaped = escaped.replace('+', ' ')
3877
+ if charset != 'base64':
3878
+ escaped = escaped.replace('+', ' ')
2979
3879
  escaped = escaped.replace("'", "&#39;")
2980
3880
  return escaped
2981
3881
 
@@ -3065,8 +3965,10 @@ def main(argv=None):
3065
3965
  f = open(opts.link_patterns_file)
3066
3966
  try:
3067
3967
  for i, line in enumerate(f.readlines()):
3068
- if not line.strip(): continue
3069
- if line.lstrip().startswith("#"): continue
3968
+ if not line.strip():
3969
+ continue
3970
+ if line.lstrip().startswith("#"):
3971
+ continue
3070
3972
  try:
3071
3973
  pat, href = line.rstrip().rsplit(None, 1)
3072
3974
  except ValueError: