pdoc 15.0.3__py3-none-any.whl → 16.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4029 +0,0 @@
1
- # fmt: off
2
- # flake8: noqa
3
- # type: ignore
4
- # Taken from here: https://github.com/trentm/python-markdown2/blob/8d3a65bc7d4f8b64af89f668eb6c60841dc0578c/lib/markdown2.py
5
-
6
- #!/usr/bin/env python
7
- # Copyright (c) 2012 Trent Mick.
8
- # Copyright (c) 2007-2008 ActiveState Corp.
9
- # License: MIT (http://www.opensource.org/licenses/mit-license.php)
10
-
11
- r"""A fast and complete Python implementation of Markdown.
12
-
13
- [from http://daringfireball.net/projects/markdown/]
14
- > Markdown is a text-to-HTML filter; it translates an easy-to-read /
15
- > easy-to-write structured text format into HTML. Markdown's text
16
- > format is most similar to that of plain text email, and supports
17
- > features such as headers, *emphasis*, code blocks, blockquotes, and
18
- > links.
19
- >
20
- > Markdown's syntax is designed not as a generic markup language, but
21
- > specifically to serve as a front-end to (X)HTML. You can use span-level
22
- > HTML tags anywhere in a Markdown document, and you can use block level
23
- > HTML tags (like <div> and <table> as well).
24
-
25
- Module usage:
26
-
27
- >>> import markdown2
28
- >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)`
29
- u'<p><em>boo!</em></p>\n'
30
-
31
- >>> markdowner = Markdown()
32
- >>> markdowner.convert("*boo!*")
33
- u'<p><em>boo!</em></p>\n'
34
- >>> markdowner.convert("**boom!**")
35
- u'<p><strong>boom!</strong></p>\n'
36
-
37
- This implementation of Markdown implements the full "core" syntax plus a
38
- number of extras (e.g., code syntax coloring, footnotes) as described on
39
- <https://github.com/trentm/python-markdown2/wiki/Extras>.
40
- """
41
-
42
- cmdln_desc = """A fast and complete Python implementation of Markdown, a
43
- text-to-HTML conversion tool for web writers.
44
-
45
- Supported extra syntax options (see -x|--extras option below and
46
- see <https://github.com/trentm/python-markdown2/wiki/Extras> for details):
47
-
48
- * admonitions: Enable parsing of RST admonitions.
49
- * breaks: Control where hard breaks are inserted in the markdown.
50
- Options include:
51
- - on_newline: Replace single new line characters with <br> when True
52
- - on_backslash: Replace backslashes at the end of a line with <br>
53
- * break-on-newline: Alias for the on_newline option in the breaks extra.
54
- * code-friendly: Disable _ and __ for em and strong.
55
- * cuddled-lists: Allow lists to be cuddled to the preceding paragraph.
56
- * fenced-code-blocks: Allows a code block to not have to be indented
57
- by fencing it with '```' on a line before and after. Based on
58
- <http://github.github.com/github-flavored-markdown/> with support for
59
- syntax highlighting.
60
- * footnotes: Support footnotes as in use on daringfireball.net and
61
- implemented in other Markdown processors (tho not in Markdown.pl v1.0.1).
62
- * header-ids: Adds "id" attributes to headers. The id value is a slug of
63
- the header text.
64
- * highlightjs-lang: Allows specifying the language which used for syntax
65
- highlighting when using fenced-code-blocks and highlightjs.
66
- * html-classes: Takes a dict mapping html tag names (lowercase) to a
67
- string to use for a "class" tag attribute. Currently only supports "img",
68
- "table", "thead", "pre", "code", "ul" and "ol" tags. Add an issue if you require
69
- this for other tags.
70
- * link-patterns: Auto-link given regex patterns in text (e.g. bug number
71
- references, revision number references).
72
- * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to
73
- have markdown processing be done on its contents. Similar to
74
- <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
75
- some limitations.
76
- * metadata: Extract metadata from a leading '---'-fenced block.
77
- See <https://github.com/trentm/python-markdown2/issues/77> for details.
78
- * middle-word-em: Allows or disallows emphasis syntax in the middle of words,
79
- defaulting to allow. Disabling this means that `this_text_here` will not be
80
- converted to `this<em>text</em>here`.
81
- * nofollow: Add `rel="nofollow"` to add `<a>` tags with an href. See
82
- <http://en.wikipedia.org/wiki/Nofollow>.
83
- * numbering: Support of generic counters. Non standard extension to
84
- allow sequential numbering of figures, tables, equations, exhibits etc.
85
- * pyshell: Treats unindented Python interactive shell sessions as <code>
86
- blocks.
87
- * smarty-pants: Replaces ' and " with curly quotation marks or curly
88
- apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
89
- and ellipses.
90
- * spoiler: A special kind of blockquote commonly hidden behind a
91
- click on SO. Syntax per <http://meta.stackexchange.com/a/72878>.
92
- * strike: text inside of double tilde is ~~strikethrough~~
93
- * tag-friendly: Requires atx style headers to have a space between the # and
94
- the header text. Useful for applications that require twitter style tags to
95
- pass through the parser.
96
- * tables: Tables using the same format as GFM
97
- <https://help.github.com/articles/github-flavored-markdown#tables> and
98
- PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
99
- * toc: The returned HTML string gets a new "toc_html" attribute which is
100
- a Table of Contents for the document. (experimental)
101
- * use-file-vars: Look for an Emacs-style markdown-extras file variable to turn
102
- on Extras.
103
- * wiki-tables: Google Code Wiki-style tables. See
104
- <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
105
- * wavedrom: Support for generating Wavedrom digital timing diagrams
106
- * xml: Passes one-liner processing instructions and namespaced XML tags.
107
- """
108
-
109
- # Dev Notes:
110
- # - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
111
- # not yet sure if there implications with this. Compare 'pydoc sre'
112
- # and 'perldoc perlre'.
113
-
114
- __version_info__ = (2, 5, 1)
115
- __version__ = '.'.join(map(str, __version_info__))
116
- __author__ = "Trent Mick"
117
-
118
- import argparse
119
- import codecs
120
- import logging
121
- import re
122
- import sys
123
- from collections import defaultdict, OrderedDict
124
- from abc import ABC, abstractmethod
125
- import functools
126
- from hashlib import sha256
127
- from random import randint, random
128
- from typing import Any, Callable, Collection, Dict, List, Literal, Optional, Tuple, Type, TypedDict, Union
129
- from enum import IntEnum, auto
130
-
131
- if sys.version_info[1] < 9:
132
- from typing import Iterable
133
- else:
134
- from collections.abc import Iterable
135
-
136
- # ---- type defs
137
- _safe_mode = Literal['replace', 'escape']
138
- _extras_dict = Dict[str, Any]
139
- _extras_param = Union[List[str], _extras_dict]
140
- _link_patterns = Iterable[Tuple[re.Pattern, Union[str, Callable[[re.Match], str]]]]
141
-
142
- # ---- globals
143
-
144
- DEBUG = False
145
- log = logging.getLogger("markdown")
146
-
147
- DEFAULT_TAB_WIDTH = 4
148
-
149
-
150
- SECRET_SALT = bytes(randint(0, 1000000))
151
- # MD5 function was previously used for this; the "md5" prefix was kept for
152
- # backwards compatibility.
153
- def _hash_text(s: str) -> str:
154
- return 'md5-' + sha256(SECRET_SALT + s.encode("utf-8")).hexdigest()[32:]
155
-
156
- # Table of hash values for escaped characters:
157
- g_escape_table = dict([(ch, _hash_text(ch))
158
- for ch in '\\`*_{}[]()>#+-.!'])
159
-
160
- # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
161
- # http://bumppo.net/projects/amputator/
162
- _AMPERSAND_RE = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
163
-
164
-
165
- # ---- exceptions
166
- class MarkdownError(Exception):
167
- pass
168
-
169
-
170
- # ---- public api
171
-
172
- def markdown_path(
173
- path: str,
174
- encoding: str = "utf-8",
175
- html4tags: bool = False,
176
- tab_width: int = DEFAULT_TAB_WIDTH,
177
- safe_mode: Optional[_safe_mode] = None,
178
- extras: Optional[_extras_param] = None,
179
- link_patterns: Optional[_link_patterns] = None,
180
- footnote_title: Optional[str] = None,
181
- footnote_return_symbol: Optional[str] = None,
182
- use_file_vars: bool = False
183
- ) -> 'UnicodeWithAttrs':
184
- fp = codecs.open(path, 'r', encoding)
185
- text = fp.read()
186
- fp.close()
187
- return Markdown(html4tags=html4tags, tab_width=tab_width,
188
- safe_mode=safe_mode, extras=extras,
189
- link_patterns=link_patterns,
190
- footnote_title=footnote_title,
191
- footnote_return_symbol=footnote_return_symbol,
192
- use_file_vars=use_file_vars).convert(text)
193
-
194
-
195
- def markdown(
196
- text: str,
197
- html4tags: bool = False,
198
- tab_width: int = DEFAULT_TAB_WIDTH,
199
- safe_mode: Optional[_safe_mode] = None,
200
- extras: Optional[_extras_param] = None,
201
- link_patterns: Optional[_link_patterns] = None,
202
- footnote_title: Optional[str] = None,
203
- footnote_return_symbol: Optional[str] =None,
204
- use_file_vars: bool = False,
205
- cli: bool = False
206
- ) -> 'UnicodeWithAttrs':
207
- return Markdown(html4tags=html4tags, tab_width=tab_width,
208
- safe_mode=safe_mode, extras=extras,
209
- link_patterns=link_patterns,
210
- footnote_title=footnote_title,
211
- footnote_return_symbol=footnote_return_symbol,
212
- use_file_vars=use_file_vars, cli=cli).convert(text)
213
-
214
-
215
- class Stage(IntEnum):
216
- PREPROCESS = auto()
217
- HASH_HTML = auto()
218
- LINK_DEFS = auto()
219
-
220
- BLOCK_GAMUT = auto()
221
- HEADERS = auto()
222
- LISTS = auto()
223
- CODE_BLOCKS = auto()
224
- BLOCK_QUOTES = auto()
225
- PARAGRAPHS = auto()
226
-
227
- SPAN_GAMUT = auto()
228
- CODE_SPANS = auto()
229
- ESCAPE_SPECIAL = auto()
230
- LINKS = auto() # and auto links
231
- ITALIC_AND_BOLD = auto()
232
-
233
- POSTPROCESS = auto()
234
- UNHASH_HTML = auto()
235
-
236
-
237
- def mark_stage(stage: Stage):
238
- '''
239
- Decorator that handles executing relevant `Extra`s before and after this `Stage` executes.
240
- '''
241
- def wrapper(func):
242
- @functools.wraps(func)
243
- def inner(md: 'Markdown', text, *args, **kwargs):
244
- md.stage = stage
245
- # set "order" prop so extras can tell if they're being invoked before/after the stage
246
- md.order = stage - 0.5
247
-
248
- if stage in Extra._exec_order:
249
- for klass in Extra._exec_order[stage][0]:
250
- if klass.name not in md.extra_classes:
251
- continue
252
- extra = md.extra_classes[klass.name]
253
- if extra.test(text):
254
- text = extra.run(text)
255
-
256
- md.order = stage
257
- text = func(md, text, *args, **kwargs)
258
- md.order = stage + 0.5
259
-
260
- if stage in Extra._exec_order:
261
- for klass in Extra._exec_order[stage][1]:
262
- if klass.name not in md.extra_classes:
263
- continue
264
- extra = md.extra_classes[klass.name]
265
- if extra.test(text):
266
- text = extra.run(text)
267
-
268
- return text
269
-
270
- return inner
271
-
272
- return wrapper
273
-
274
-
275
- class Markdown(object):
276
- # The dict of "extras" to enable in processing -- a mapping of
277
- # extra name to argument for the extra. Most extras do not have an
278
- # argument, in which case the value is None.
279
- #
280
- # This can be set via (a) subclassing and (b) the constructor
281
- # "extras" argument.
282
- extras: _extras_dict
283
- # dict of `Extra` names and associated class instances, populated during _setup_extras
284
- extra_classes: Dict[str, 'Extra']
285
-
286
- urls: Dict[str, str]
287
- titles: Dict[str, str]
288
- html_blocks: Dict[str, str]
289
- html_spans: Dict[str, str]
290
- html_removed_text: str = "{(#HTML#)}" # placeholder removed text that does not trigger bold
291
- html_removed_text_compat: str = "[HTML_REMOVED]" # for compat with markdown.py
292
- safe_mode: Optional[_safe_mode]
293
-
294
- _toc: List[Tuple[int, str, str]]
295
-
296
- # Used to track when we're inside an ordered or unordered list
297
- # (see _ProcessListItems() for details):
298
- list_level = 0
299
-
300
- stage: Stage
301
- '''Current "stage" of markdown conversion taking place'''
302
- order: float
303
- '''
304
- Same as `Stage` but will be +/- 0.5 of the value of `Stage`.
305
- This allows extras to check if they are running before or after a particular stage
306
- with `if md.order < md.stage`.
307
- '''
308
-
309
- _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
310
-
311
- def __init__(
312
- self,
313
- html4tags: bool = False,
314
- tab_width: int = DEFAULT_TAB_WIDTH,
315
- safe_mode: Optional[_safe_mode] = None,
316
- extras: Optional[_extras_param] = None,
317
- link_patterns: Optional[_link_patterns] = None,
318
- footnote_title: Optional[str] = None,
319
- footnote_return_symbol: Optional[str] = None,
320
- use_file_vars: bool = False,
321
- cli: bool = False
322
- ):
323
- if html4tags:
324
- self.empty_element_suffix = ">"
325
- else:
326
- self.empty_element_suffix = " />"
327
- self.tab_width = tab_width
328
- self.tab = tab_width * " "
329
-
330
- # For compatibility with earlier markdown2.py and with
331
- # markdown.py's safe_mode being a boolean,
332
- # safe_mode == True -> "replace"
333
- if safe_mode is True:
334
- self.safe_mode = "replace"
335
- else:
336
- self.safe_mode = safe_mode
337
-
338
- # Massaging and building the "extras" info.
339
- if getattr(self, 'extras', None) is None:
340
- self.extras = {}
341
- elif not isinstance(self.extras, dict):
342
- # inheriting classes may set `self.extras` as List[str].
343
- # we can't allow it through type hints but we can convert it
344
- self.extras = dict([(e, None) for e in self.extras]) # type:ignore
345
-
346
- if extras:
347
- if not isinstance(extras, dict):
348
- extras = dict([(e, None) for e in extras])
349
- self.extras.update(extras)
350
- assert isinstance(self.extras, dict)
351
-
352
- if "toc" in self.extras:
353
- if "header-ids" not in self.extras:
354
- self.extras["header-ids"] = None # "toc" implies "header-ids"
355
-
356
- if self.extras["toc"] is None:
357
- self._toc_depth = 6
358
- else:
359
- self._toc_depth = self.extras["toc"].get("depth", 6)
360
-
361
- if 'header-ids' in self.extras:
362
- if not isinstance(self.extras['header-ids'], dict):
363
- self.extras['header-ids'] = {
364
- 'mixed': False,
365
- 'prefix': self.extras['header-ids'],
366
- 'reset-count': True
367
- }
368
-
369
- if 'break-on-newline' in self.extras:
370
- self.extras.setdefault('breaks', {})
371
- self.extras['breaks']['on_newline'] = True
372
-
373
- if 'link-patterns' in self.extras:
374
- # allow link patterns via extras dict without kwarg explicitly set
375
- link_patterns = link_patterns or self.extras['link-patterns']
376
- if link_patterns is None:
377
- # if you have specified that the link-patterns extra SHOULD
378
- # be used (via self.extras) but you haven't provided anything
379
- # via the link_patterns argument then an error is raised
380
- raise MarkdownError("If the 'link-patterns' extra is used, an argument for 'link_patterns' is required")
381
- self.extras['link-patterns'] = link_patterns
382
-
383
- self._instance_extras = self.extras.copy()
384
- self.link_patterns = link_patterns
385
- self.footnote_title = footnote_title
386
- self.footnote_return_symbol = footnote_return_symbol
387
- self.use_file_vars = use_file_vars
388
- self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)
389
- self.cli = cli
390
-
391
- self._escape_table = g_escape_table.copy()
392
- self._code_table = {}
393
- if "smarty-pants" in self.extras:
394
- self._escape_table['"'] = _hash_text('"')
395
- self._escape_table["'"] = _hash_text("'")
396
-
397
- def reset(self):
398
- self.urls = {}
399
- self.titles = {}
400
- self.html_blocks = {}
401
- self.html_spans = {}
402
- self.list_level = 0
403
- self.extras = self._instance_extras.copy()
404
- self._setup_extras()
405
- self._toc = []
406
-
407
- def _setup_extras(self):
408
- if "footnotes" in self.extras:
409
- # order of insertion matters for footnotes. Use ordered dict for Python < 3.7
410
- # https://docs.python.org/3/whatsnew/3.7.html#summary-release-highlights
411
- self.footnotes = OrderedDict()
412
- self.footnote_ids = []
413
- if "header-ids" in self.extras:
414
- if not hasattr(self, '_count_from_header_id') or self.extras['header-ids'].get('reset-count', False):
415
- self._count_from_header_id = defaultdict(int)
416
- if "metadata" in self.extras:
417
- self.metadata: Dict[str, Any] = {}
418
-
419
- self.extra_classes = {}
420
- for name, klass in Extra._registry.items():
421
- if name not in self.extras:
422
- continue
423
- self.extra_classes[name] = klass(self, (self.extras.get(name, {})))
424
-
425
- # Per <https://developer.mozilla.org/en-US/docs/HTML/Element/a> "rel"
426
- # should only be used in <a> tags with an "href" attribute.
427
-
428
- # Opens the linked document in a new window or tab
429
- # should only used in <a> tags with an "href" attribute.
430
- # same with _a_nofollow
431
- _a_nofollow_or_blank_links = re.compile(r"""
432
- <(a)
433
- (
434
- [^>]*
435
- href= # href is required
436
- ['"]? # HTML5 attribute values do not have to be quoted
437
- [^#'"] # We don't want to match href values that start with # (like footnotes)
438
- )
439
- """,
440
- re.IGNORECASE | re.VERBOSE
441
- )
442
-
443
- def convert(self, text: str) -> 'UnicodeWithAttrs':
444
- """Convert the given text."""
445
- # Main function. The order in which other subs are called here is
446
- # essential. Link and image substitutions need to happen before
447
- # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
448
- # and <img> tags get encoded.
449
-
450
- # Clear the global hashes. If we don't clear these, you get conflicts
451
- # from other articles when generating a page which contains more than
452
- # one article (e.g. an index page that shows the N most recent
453
- # articles):
454
- self.reset()
455
-
456
- if not isinstance(text, str):
457
- # TODO: perhaps shouldn't presume UTF-8 for string input?
458
- text = str(text, 'utf-8')
459
-
460
- if self.use_file_vars:
461
- # Look for emacs-style file variable hints.
462
- text = self._emacs_oneliner_vars_pat.sub(self._emacs_vars_oneliner_sub, text)
463
- emacs_vars = self._get_emacs_vars(text)
464
- if "markdown-extras" in emacs_vars:
465
- splitter = re.compile("[ ,]+")
466
- for e in splitter.split(emacs_vars["markdown-extras"]):
467
- if '=' in e:
468
- ename, earg = e.split('=', 1)
469
- try:
470
- earg = int(earg)
471
- except ValueError:
472
- pass
473
- else:
474
- ename, earg = e, None
475
- self.extras[ename] = earg
476
-
477
- self._setup_extras()
478
-
479
- # Standardize line endings:
480
- text = text.replace("\r\n", "\n")
481
- text = text.replace("\r", "\n")
482
-
483
- # Make sure $text ends with a couple of newlines:
484
- text += "\n\n"
485
-
486
- # Convert all tabs to spaces.
487
- text = self._detab(text)
488
-
489
- # Strip any lines consisting only of spaces and tabs.
490
- # This makes subsequent regexen easier to write, because we can
491
- # match consecutive blank lines with /\n+/ instead of something
492
- # contorted like /[ \t]*\n+/ .
493
- text = self._ws_only_line_re.sub("", text)
494
-
495
- # strip metadata from head and extract
496
- if "metadata" in self.extras:
497
- text = self._extract_metadata(text)
498
-
499
- text = self.preprocess(text)
500
-
501
- if self.safe_mode:
502
- text = self._hash_html_spans(text)
503
-
504
- # Turn block-level HTML blocks into hash entries
505
- text = self._hash_html_blocks(text, raw=True)
506
-
507
- # Strip link definitions, store in hashes.
508
- if "footnotes" in self.extras:
509
- # Must do footnotes first because an unlucky footnote defn
510
- # looks like a link defn:
511
- # [^4]: this "looks like a link defn"
512
- text = self._strip_footnote_definitions(text)
513
- text = self._strip_link_definitions(text)
514
-
515
- text = self._run_block_gamut(text)
516
-
517
- if "footnotes" in self.extras:
518
- text = self._add_footnotes(text)
519
-
520
- text = self.postprocess(text)
521
-
522
- text = self._unescape_special_chars(text)
523
-
524
- if self.safe_mode:
525
- text = self._unhash_html_spans(text)
526
- # return the removed text warning to its markdown.py compatible form
527
- text = text.replace(self.html_removed_text, self.html_removed_text_compat)
528
-
529
- do_target_blank_links = "target-blank-links" in self.extras
530
- do_nofollow_links = "nofollow" in self.extras
531
-
532
- if do_target_blank_links and do_nofollow_links:
533
- text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow noopener" target="_blank"\2', text)
534
- elif do_target_blank_links:
535
- text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="noopener" target="_blank"\2', text)
536
- elif do_nofollow_links:
537
- text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)
538
-
539
- if "toc" in self.extras and self._toc:
540
- if self.extras['header-ids'].get('mixed'):
541
- # TOC will only be out of order if mixed headers is enabled
542
- def toc_sort(entry):
543
- '''Sort the TOC by order of appearance in text'''
544
- match = re.search(
545
- # header tag, any attrs, the ID, any attrs, the text, close tag
546
- r'^<(h%d).*?id=(["\'])%s\2.*>%s</\1>$' % (entry[0], entry[1], re.escape(entry[2])),
547
- text, re.M
548
- )
549
- return match.start() if match else 0
550
-
551
- self._toc.sort(key=toc_sort)
552
- self._toc_html = calculate_toc_html(self._toc)
553
-
554
- # Prepend toc html to output
555
- if self.cli or (self.extras['toc'] is not None and self.extras['toc'].get('prepend', False)):
556
- text = '{}\n{}'.format(self._toc_html, text)
557
-
558
- text += "\n"
559
-
560
- # Attach attrs to output
561
- rv = UnicodeWithAttrs(text)
562
-
563
- if "toc" in self.extras and self._toc:
564
- rv.toc_html = self._toc_html
565
-
566
- if "metadata" in self.extras:
567
- rv.metadata = self.metadata
568
- return rv
569
-
570
- @mark_stage(Stage.POSTPROCESS)
571
- def postprocess(self, text: str) -> str:
572
- """A hook for subclasses to do some postprocessing of the html, if
573
- desired. This is called before unescaping of special chars and
574
- unhashing of raw HTML spans.
575
- """
576
- return text
577
-
578
- @mark_stage(Stage.PREPROCESS)
579
- def preprocess(self, text: str) -> str:
580
- """A hook for subclasses to do some preprocessing of the Markdown, if
581
- desired. This is called after basic formatting of the text, but prior
582
- to any extras, safe mode, etc. processing.
583
- """
584
- return text
585
-
586
- # Is metadata if the content starts with optional '---'-fenced `key: value`
587
- # pairs. E.g. (indented for presentation):
588
- # ---
589
- # foo: bar
590
- # another-var: blah blah
591
- # ---
592
- # # header
593
- # or:
594
- # foo: bar
595
- # another-var: blah blah
596
- #
597
- # # header
598
- _meta_data_pattern = re.compile(r'''
599
- ^{0}( # optional opening fence
600
- (?:
601
- {1}:(?:\n+[ \t]+.*)+ # indented lists
602
- )|(?:
603
- (?:{1}:\s+>(?:\n\s+.*)+?) # multiline long descriptions
604
- (?=\n{1}:\s*.*\n|\s*\Z) # match up until the start of the next key:value definition or the end of the input text
605
- )|(?:
606
- {1}:(?! >).*\n? # simple key:value pair, leading spaces allowed
607
- )
608
- ){0} # optional closing fence
609
- '''.format(r'(?:---[\ \t]*\n)?', r'[\S \t]*\w[\S \t]*\s*'), re.MULTILINE | re.VERBOSE
610
- )
611
-
612
- _key_val_list_pat = re.compile(
613
- r"^-(?:[ \t]*([^\n]*)(?:[ \t]*[:-][ \t]*(\S+))?)(?:\n((?:[ \t]+[^\n]+\n?)+))?",
614
- re.MULTILINE,
615
- )
616
- _key_val_dict_pat = re.compile(
617
- r"^([^:\n]+)[ \t]*:[ \t]*([^\n]*)(?:((?:\n[ \t]+[^\n]+)+))?", re.MULTILINE
618
- ) # grp0: key, grp1: value, grp2: multiline value
619
- _meta_data_fence_pattern = re.compile(r'^---[\ \t]*\n', re.MULTILINE)
620
- _meta_data_newline = re.compile("^\n", re.MULTILINE)
621
-
622
- def _extract_metadata(self, text: str) -> str:
623
- if text.startswith("---"):
624
- fence_splits = re.split(self._meta_data_fence_pattern, text, maxsplit=2)
625
- metadata_content = fence_splits[1]
626
- tail = fence_splits[2]
627
- else:
628
- metadata_split = re.split(self._meta_data_newline, text, maxsplit=1)
629
- metadata_content = metadata_split[0]
630
- tail = metadata_split[1]
631
-
632
- # _meta_data_pattern only has one capturing group, so we can assume
633
- # the returned type to be list[str]
634
- match: List[str] = re.findall(self._meta_data_pattern, metadata_content)
635
- if not match:
636
- return text
637
-
638
- def parse_structured_value(value: str) -> Union[List[Any], Dict[str, Any]]:
639
- vs = value.lstrip()
640
- vs = value.replace(v[: len(value) - len(vs)], "\n")[1:]
641
-
642
- # List
643
- if vs.startswith("-"):
644
- r: List[Any] = []
645
- # the regex used has multiple capturing groups, so
646
- # returned type from findall will be List[List[str]]
647
- match: List[str]
648
- for match in re.findall(self._key_val_list_pat, vs):
649
- if match[0] and not match[1] and not match[2]:
650
- r.append(match[0].strip())
651
- elif match[0] == ">" and not match[1] and match[2]:
652
- r.append(match[2].strip())
653
- elif match[0] and match[1]:
654
- r.append({match[0].strip(): match[1].strip()})
655
- elif not match[0] and not match[1] and match[2]:
656
- r.append(parse_structured_value(match[2]))
657
- else:
658
- # Broken case
659
- pass
660
-
661
- return r
662
-
663
- # Dict
664
- else:
665
- return {
666
- match[0].strip(): (
667
- match[1].strip()
668
- if match[1]
669
- else parse_structured_value(match[2])
670
- )
671
- for match in re.findall(self._key_val_dict_pat, vs)
672
- }
673
-
674
- for item in match:
675
-
676
- k, v = item.split(":", 1)
677
-
678
- # Multiline value
679
- if v[:3] == " >\n":
680
- self.metadata[k.strip()] = _dedent(v[3:]).strip()
681
-
682
- # Empty value
683
- elif v == "\n":
684
- self.metadata[k.strip()] = ""
685
-
686
- # Structured value
687
- elif v[0] == "\n":
688
- self.metadata[k.strip()] = parse_structured_value(v)
689
-
690
- # Simple value
691
- else:
692
- self.metadata[k.strip()] = v.strip()
693
-
694
- return tail
695
-
696
- _emacs_oneliner_vars_pat = re.compile(r"((?:<!--)?\s*-\*-)\s*(?:(\S[^\r\n]*?)([\r\n]\s*)?)?(-\*-\s*(?:-->)?)", re.UNICODE)
697
- # This regular expression is intended to match blocks like this:
698
- # PREFIX Local Variables: SUFFIX
699
- # PREFIX mode: Tcl SUFFIX
700
- # PREFIX End: SUFFIX
701
- # Some notes:
702
- # - "[ \t]" is used instead of "\s" to specifically exclude newlines
703
- # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
704
- # not like anything other than Unix-style line terminators.
705
- _emacs_local_vars_pat = re.compile(r"""^
706
- (?P<prefix>(?:[^\r\n|\n|\r])*?)
707
- [\ \t]*Local\ Variables:[\ \t]*
708
- (?P<suffix>.*?)(?:\r\n|\n|\r)
709
- (?P<content>.*?\1End:)
710
- """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
711
-
712
- def _emacs_vars_oneliner_sub(self, match: re.Match) -> str:
713
- if match.group(1).strip() == '-*-' and match.group(4).strip() == '-*-':
714
- lead_ws = re.findall(r'^\s*', match.group(1))[0]
715
- tail_ws = re.findall(r'\s*$', match.group(4))[0]
716
- return '%s<!-- %s %s %s -->%s' % (lead_ws, '-*-', match.group(2).strip(), '-*-', tail_ws)
717
-
718
- start, end = match.span()
719
- return match.string[start: end]
720
-
721
- def _get_emacs_vars(self, text: str) -> Dict[str, str]:
722
- """Return a dictionary of emacs-style local variables.
723
-
724
- Parsing is done loosely according to this spec (and according to
725
- some in-practice deviations from this):
726
- http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
727
- """
728
- emacs_vars = {}
729
- SIZE = pow(2, 13) # 8kB
730
-
731
- # Search near the start for a '-*-'-style one-liner of variables.
732
- head = text[:SIZE]
733
- if "-*-" in head:
734
- match = self._emacs_oneliner_vars_pat.search(head)
735
- if match:
736
- emacs_vars_str = match.group(2)
737
- assert '\n' not in emacs_vars_str
738
- emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
739
- if s.strip()]
740
- if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
741
- # While not in the spec, this form is allowed by emacs:
742
- # -*- Tcl -*-
743
- # where the implied "variable" is "mode". This form
744
- # is only allowed if there are no other variables.
745
- emacs_vars["mode"] = emacs_var_strs[0].strip()
746
- else:
747
- for emacs_var_str in emacs_var_strs:
748
- try:
749
- variable, value = emacs_var_str.strip().split(':', 1)
750
- except ValueError:
751
- log.debug("emacs variables error: malformed -*- "
752
- "line: %r", emacs_var_str)
753
- continue
754
- # Lowercase the variable name because Emacs allows "Mode"
755
- # or "mode" or "MoDe", etc.
756
- emacs_vars[variable.lower()] = value.strip()
757
-
758
- tail = text[-SIZE:]
759
- if "Local Variables" in tail:
760
- match = self._emacs_local_vars_pat.search(tail)
761
- if match:
762
- prefix = match.group("prefix")
763
- suffix = match.group("suffix")
764
- lines = match.group("content").splitlines(False)
765
- # print "prefix=%r, suffix=%r, content=%r, lines: %s"\
766
- # % (prefix, suffix, match.group("content"), lines)
767
-
768
- # Validate the Local Variables block: proper prefix and suffix
769
- # usage.
770
- for i, line in enumerate(lines):
771
- if not line.startswith(prefix):
772
- log.debug("emacs variables error: line '%s' "
773
- "does not use proper prefix '%s'"
774
- % (line, prefix))
775
- return {}
776
- # Don't validate suffix on last line. Emacs doesn't care,
777
- # neither should we.
778
- if i != len(lines)-1 and not line.endswith(suffix):
779
- log.debug("emacs variables error: line '%s' "
780
- "does not use proper suffix '%s'"
781
- % (line, suffix))
782
- return {}
783
-
784
- # Parse out one emacs var per line.
785
- continued_for = None
786
- for line in lines[:-1]: # no var on the last line ("PREFIX End:")
787
- if prefix:
788
- line = line[len(prefix):] # strip prefix
789
- if suffix:
790
- line = line[:-len(suffix)] # strip suffix
791
- line = line.strip()
792
- if continued_for:
793
- variable = continued_for
794
- if line.endswith('\\'):
795
- line = line[:-1].rstrip()
796
- else:
797
- continued_for = None
798
- emacs_vars[variable] += ' ' + line
799
- else:
800
- try:
801
- variable, value = line.split(':', 1)
802
- except ValueError:
803
- log.debug("local variables error: missing colon "
804
- "in local variables entry: '%s'" % line)
805
- continue
806
- # Do NOT lowercase the variable name, because Emacs only
807
- # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
808
- value = value.strip()
809
- if value.endswith('\\'):
810
- value = value[:-1].rstrip()
811
- continued_for = variable
812
- else:
813
- continued_for = None
814
- emacs_vars[variable] = value
815
-
816
- # Unquote values.
817
- for var, val in list(emacs_vars.items()):
818
- if len(val) > 1 and (val.startswith('"') and val.endswith('"')
819
- or val.startswith('"') and val.endswith('"')):
820
- emacs_vars[var] = val[1:-1]
821
-
822
- return emacs_vars
823
-
824
- def _detab_line(self, line: str) -> str:
825
- r"""Recusively convert tabs to spaces in a single line.
826
-
827
- Called from _detab()."""
828
- if '\t' not in line:
829
- return line
830
- chunk1, chunk2 = line.split('\t', 1)
831
- chunk1 += (' ' * (self.tab_width - len(chunk1) % self.tab_width))
832
- output = chunk1 + chunk2
833
- return self._detab_line(output)
834
-
835
- def _detab(self, text: str) -> str:
836
- r"""Iterate text line by line and convert tabs to spaces.
837
-
838
- >>> m = Markdown()
839
- >>> m._detab("\tfoo")
840
- ' foo'
841
- >>> m._detab(" \tfoo")
842
- ' foo'
843
- >>> m._detab("\t foo")
844
- ' foo'
845
- >>> m._detab(" foo")
846
- ' foo'
847
- >>> m._detab(" foo\n\tbar\tblam")
848
- ' foo\n bar blam'
849
- """
850
- if '\t' not in text:
851
- return text
852
- output = []
853
- for line in text.splitlines():
854
- output.append(self._detab_line(line))
855
- return '\n'.join(output)
856
-
857
- # I broke out the html5 tags here and add them to _block_tags_a and
858
- # _block_tags_b. This way html5 tags are easy to keep track of.
859
- _html5tags = '|article|aside|header|hgroup|footer|nav|section|figure|figcaption'
860
-
861
- _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del|style|html|head|body'
862
- _block_tags_a += _html5tags
863
-
864
- _strict_tag_block_re = re.compile(r"""
865
- ( # save in \1
866
- ^ # start of line (with re.M)
867
- <(%s) # start tag = \2
868
- \b # word break
869
- (.*\n)*? # any number of lines, minimally matching
870
- </\2> # the matching end tag
871
- [ \t]* # trailing spaces/tabs
872
- (?=\n+|\Z) # followed by a newline or end of document
873
- )
874
- """ % _block_tags_a,
875
- re.X | re.M)
876
-
877
- _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
878
- _block_tags_b += _html5tags
879
-
880
- _span_tags = (
881
- 'a|abbr|acronym|b|bdo|big|br|button|cite|code|dfn|em|i|img|input|kbd|label|map|object|output|q'
882
- '|samp|script|select|small|span|strong|sub|sup|textarea|time|tt|var'
883
- )
884
-
885
- _liberal_tag_block_re = re.compile(r"""
886
- ( # save in \1
887
- ^ # start of line (with re.M)
888
- <(%s) # start tag = \2
889
- \b # word break
890
- (.*\n)*? # any number of lines, minimally matching
891
- .*</\2> # the matching end tag
892
- [ \t]* # trailing spaces/tabs
893
- (?=\n+|\Z) # followed by a newline or end of document
894
- )
895
- """ % _block_tags_b,
896
- re.X | re.M)
897
-
898
- _html_markdown_attr_re = re.compile(
899
- r'''\s+markdown=("1"|'1')''')
900
- def _hash_html_block_sub(
901
- self,
902
- match: Union[re.Match, str],
903
- raw: bool = False
904
- ) -> str:
905
- if isinstance(match, str):
906
- html = match
907
- tag = None
908
- else:
909
- html = match.group(1)
910
- try:
911
- tag = match.group(2)
912
- except IndexError:
913
- tag = None
914
-
915
- if not tag:
916
- m = re.match(r'.*?<(\S).*?\s*>', html)
917
- # tag shouldn't be none but make the assertion for type checker
918
- assert m is not None
919
- tag = m.group(1)
920
-
921
- if raw and self.safe_mode:
922
- html = self._sanitize_html(html)
923
- elif 'markdown-in-html' in self.extras and 'markdown=' in html:
924
- first_line = html.split('\n', 1)[0]
925
- m = self._html_markdown_attr_re.search(first_line)
926
- if m:
927
- lines = html.split('\n')
928
- # if MD is on same line as opening tag then split across two lines
929
- lines = list(filter(None, (re.split(r'(.*?<%s.*markdown=.*?>)' % tag, lines[0])))) + lines[1:]
930
- # if MD on same line as closing tag, split across two lines
931
- lines = lines[:-1] + list(filter(None, re.split(r'(\s*?</%s>.*?$)' % tag, lines[-1])))
932
- # extract key sections of the match
933
- first_line = lines[0]
934
- middle = '\n'.join(lines[1:-1])
935
- last_line = lines[-1]
936
- # remove `markdown="1"` attr from tag
937
- first_line = first_line[:m.start()] + first_line[m.end():]
938
- # hash the HTML segments to protect them
939
- f_key = _hash_text(first_line)
940
- self.html_blocks[f_key] = first_line
941
- l_key = _hash_text(last_line)
942
- self.html_blocks[l_key] = last_line
943
- return ''.join(["\n\n", f_key,
944
- "\n\n", middle, "\n\n",
945
- l_key, "\n\n"])
946
- elif self.extras.get('header-ids', {}).get('mixed') and self._h_tag_re.match(html):
947
- html = self._h_tag_re.sub(self._h_tag_sub, html)
948
- key = _hash_text(html)
949
- self.html_blocks[key] = html
950
- return "\n\n" + key + "\n\n"
951
-
952
- @mark_stage(Stage.HASH_HTML)
953
- def _hash_html_blocks(self, text: str, raw: bool = False) -> str:
954
- """Hashify HTML blocks
955
-
956
- We only want to do this for block-level HTML tags, such as headers,
957
- lists, and tables. That's because we still want to wrap <p>s around
958
- "paragraphs" that are wrapped in non-block-level tags, such as anchors,
959
- phrase emphasis, and spans. The list of tags we're looking for is
960
- hard-coded.
961
-
962
- @param raw {boolean} indicates if these are raw HTML blocks in
963
- the original source. It makes a difference in "safe" mode.
964
- """
965
- if '<' not in text:
966
- return text
967
-
968
- # Pass `raw` value into our calls to self._hash_html_block_sub.
969
- hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)
970
-
971
- # First, look for nested blocks, e.g.:
972
- # <div>
973
- # <div>
974
- # tags for inner block must be indented.
975
- # </div>
976
- # </div>
977
- #
978
- # The outermost tags must start at the left margin for this to match, and
979
- # the inner nested divs must be indented.
980
- # We need to do this before the next, more liberal match, because the next
981
- # match will start at the first `<div>` and stop at the first `</div>`.
982
- text = self._strict_tag_block_sub(text, self._block_tags_a, hash_html_block_sub)
983
-
984
- # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
985
- text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)
986
-
987
- # now do the same for spans that are acting like blocks
988
- # eg: an anchor split over multiple lines for readability
989
- text = self._strict_tag_block_sub(
990
- text, self._span_tags,
991
- # inline elements can't contain block level elements, so only span gamut is required
992
- lambda t: hash_html_block_sub(self._run_span_gamut(t))
993
- )
994
-
995
- # Special case just for <hr />. It was easier to make a special
996
- # case than to make the other regex more complicated.
997
- if "<hr" in text:
998
- _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
999
- text = _hr_tag_re.sub(hash_html_block_sub, text)
1000
-
1001
- # Special case for standalone HTML comments:
1002
- if "<!--" in text:
1003
- start = 0
1004
- while True:
1005
- # Delimiters for next comment block.
1006
- try:
1007
- start_idx = text.index("<!--", start)
1008
- except ValueError:
1009
- break
1010
- try:
1011
- end_idx = text.index("-->", start_idx) + 3
1012
- except ValueError:
1013
- break
1014
-
1015
- # Start position for next comment block search.
1016
- start = end_idx
1017
-
1018
- # Validate whitespace before comment.
1019
- if start_idx:
1020
- # - Up to `tab_width - 1` spaces before start_idx.
1021
- for i in range(self.tab_width - 1):
1022
- if text[start_idx - 1] != ' ':
1023
- break
1024
- start_idx -= 1
1025
- if start_idx == 0:
1026
- break
1027
- # - Must be preceded by 2 newlines or hit the start of
1028
- # the document.
1029
- if start_idx == 0:
1030
- pass
1031
- elif start_idx == 1 and text[0] == '\n':
1032
- start_idx = 0 # to match minute detail of Markdown.pl regex
1033
- elif text[start_idx-2:start_idx] == '\n\n':
1034
- pass
1035
- else:
1036
- break
1037
-
1038
- # Validate whitespace after comment.
1039
- # - Any number of spaces and tabs.
1040
- while end_idx < len(text):
1041
- if text[end_idx] not in ' \t':
1042
- break
1043
- end_idx += 1
1044
- # - Must be following by 2 newlines or hit end of text.
1045
- if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
1046
- continue
1047
-
1048
- # Escape and hash (must match `_hash_html_block_sub`).
1049
- html = text[start_idx:end_idx]
1050
- if raw and self.safe_mode:
1051
- html = self._sanitize_html(html)
1052
- key = _hash_text(html)
1053
- self.html_blocks[key] = html
1054
- text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]
1055
-
1056
- if "xml" in self.extras:
1057
- # Treat XML processing instructions and namespaced one-liner
1058
- # tags as if they were block HTML tags. E.g., if standalone
1059
- # (i.e. are their own paragraph), the following do not get
1060
- # wrapped in a <p> tag:
1061
- # <?foo bar?>
1062
- #
1063
- # <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
1064
- _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
1065
- text = _xml_oneliner_re.sub(hash_html_block_sub, text)
1066
-
1067
- return text
1068
-
1069
- def _strict_tag_block_sub(
1070
- self,
1071
- text: str,
1072
- html_tags_re: str,
1073
- callback: Callable[[str], str],
1074
- allow_indent: bool = False
1075
- ) -> str:
1076
- '''
1077
- Finds and substitutes HTML blocks within blocks of text
1078
-
1079
- Args:
1080
- text: the text to search
1081
- html_tags_re: a regex pattern of HTML block tags to match against.
1082
- For example, `Markdown._block_tags_a`
1083
- callback: callback function that receives the found HTML text block and returns a new str
1084
- allow_indent: allow matching HTML blocks that are not completely outdented
1085
- '''
1086
- tag_count = 0
1087
- current_tag = html_tags_re
1088
- block = ''
1089
- result = ''
1090
-
1091
- for chunk in text.splitlines(True):
1092
- is_markup = re.match(
1093
- r'^(\s{0,%s})(?:</code>(?=</pre>))?(</?(%s)\b>?)' % ('' if allow_indent else '0', current_tag), chunk
1094
- )
1095
- block += chunk
1096
-
1097
- if is_markup:
1098
- if chunk.startswith('%s</' % is_markup.group(1)):
1099
- tag_count -= 1
1100
- else:
1101
- # if close tag is in same line
1102
- if self._tag_is_closed(is_markup.group(3), chunk):
1103
- # we must ignore these
1104
- is_markup = None
1105
- else:
1106
- tag_count += 1
1107
- current_tag = is_markup.group(3)
1108
-
1109
- if tag_count == 0:
1110
- if is_markup:
1111
- block = callback(block.rstrip('\n')) # remove trailing newline
1112
- current_tag = html_tags_re
1113
- result += block
1114
- block = ''
1115
-
1116
- result += block
1117
-
1118
- return result
1119
-
1120
- def _tag_is_closed(self, tag_name: str, text: str) -> bool:
1121
- # super basic check if number of open tags == number of closing tags
1122
- return len(re.findall('<%s(?:.*?)>' % tag_name, text)) == len(re.findall('</%s>' % tag_name, text))
1123
-
1124
- @mark_stage(Stage.LINK_DEFS)
1125
- def _strip_link_definitions(self, text: str) -> str:
1126
- # Strips link definitions from text, stores the URLs and titles in
1127
- # hash references.
1128
- less_than_tab = self.tab_width - 1
1129
-
1130
- # Link defs are in the form:
1131
- # [id]: url "optional title"
1132
- _link_def_re = re.compile(r"""
1133
- ^[ ]{0,%d}\[(.+)\]: # id = \1
1134
- [ \t]*
1135
- \n? # maybe *one* newline
1136
- [ \t]*
1137
- <?(.+?)>? # url = \2
1138
- [ \t]*
1139
- (?:
1140
- \n? # maybe one newline
1141
- [ \t]*
1142
- (?<=\s) # lookbehind for whitespace
1143
- ['"(]
1144
- ([^\n]*) # title = \3
1145
- ['")]
1146
- [ \t]*
1147
- )? # title is optional
1148
- (?:\n+|\Z)
1149
- """ % less_than_tab, re.X | re.M | re.U)
1150
- return _link_def_re.sub(self._extract_link_def_sub, text)
1151
-
1152
- def _extract_link_def_sub(self, match: re.Match) -> str:
1153
- id, url, title = match.groups()
1154
- key = id.lower() # Link IDs are case-insensitive
1155
- self.urls[key] = self._encode_amps_and_angles(url)
1156
- if title:
1157
- self.titles[key] = title
1158
- return ""
1159
-
1160
- def _extract_footnote_def_sub(self, match: re.Match) -> str:
1161
- id, text = match.groups()
1162
- text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
1163
- normed_id = re.sub(r'\W', '-', id)
1164
- # Ensure footnote text ends with a couple newlines (for some
1165
- # block gamut matches).
1166
- self.footnotes[normed_id] = text + "\n\n"
1167
- return ""
1168
-
1169
- def _strip_footnote_definitions(self, text: str) -> str:
1170
- """A footnote definition looks like this:
1171
-
1172
- [^note-id]: Text of the note.
1173
-
1174
- May include one or more indented paragraphs.
1175
-
1176
- Where,
1177
- - The 'note-id' can be pretty much anything, though typically it
1178
- is the number of the footnote.
1179
- - The first paragraph may start on the next line, like so:
1180
-
1181
- [^note-id]:
1182
- Text of the note.
1183
- """
1184
- less_than_tab = self.tab_width - 1
1185
- footnote_def_re = re.compile(r'''
1186
- ^[ ]{0,%d}\[\^(.+)\]: # id = \1
1187
- [ \t]*
1188
- ( # footnote text = \2
1189
- # First line need not start with the spaces.
1190
- (?:\s*.*\n+)
1191
- (?:
1192
- (?:[ ]{%d} | \t) # Subsequent lines must be indented.
1193
- .*\n+
1194
- )*
1195
- )
1196
- # Lookahead for non-space at line-start, or end of doc.
1197
- (?:(?=^[ ]{0,%d}\S)|\Z)
1198
- ''' % (less_than_tab, self.tab_width, self.tab_width),
1199
- re.X | re.M)
1200
- return footnote_def_re.sub(self._extract_footnote_def_sub, text)
1201
-
1202
- _hr_re = re.compile(r'^[ ]{0,3}([-_*])[ ]{0,2}(\1[ ]{0,2}){2,}$', re.M)
1203
-
1204
- @mark_stage(Stage.BLOCK_GAMUT)
1205
- def _run_block_gamut(self, text: str) -> str:
1206
- # These are all the transformations that form block-level
1207
- # tags like paragraphs, headers, and list items.
1208
-
1209
- text = self._do_headers(text)
1210
-
1211
- # Do Horizontal Rules:
1212
- # On the number of spaces in horizontal rules: The spec is fuzzy: "If
1213
- # you wish, you may use spaces between the hyphens or asterisks."
1214
- # Markdown.pl 1.0.1's hr regexes limit the number of spaces between the
1215
- # hr chars to one or two. We'll reproduce that limit here.
1216
- hr = "\n<hr"+self.empty_element_suffix+"\n"
1217
- text = re.sub(self._hr_re, hr, text)
1218
-
1219
- text = self._do_lists(text)
1220
-
1221
- text = self._do_code_blocks(text)
1222
-
1223
- text = self._do_block_quotes(text)
1224
-
1225
- # We already ran _HashHTMLBlocks() before, in Markdown(), but that
1226
- # was to escape raw HTML in the original Markdown source. This time,
1227
- # we're escaping the markup we've just created, so that we don't wrap
1228
- # <p> tags around block-level tags.
1229
- text = self._hash_html_blocks(text)
1230
-
1231
- text = self._form_paragraphs(text)
1232
-
1233
- return text
1234
-
1235
- @mark_stage(Stage.SPAN_GAMUT)
1236
- def _run_span_gamut(self, text: str) -> str:
1237
- # These are all the transformations that occur *within* block-level
1238
- # tags like paragraphs, headers, and list items.
1239
-
1240
- text = self._do_code_spans(text)
1241
-
1242
- text = self._escape_special_chars(text)
1243
-
1244
- # Process anchor and image tags.
1245
- text = self._do_links(text)
1246
-
1247
- # Make links out of things like `<http://example.com/>`
1248
- # Must come after _do_links(), because you can use < and >
1249
- # delimiters in inline links like [this](<url>).
1250
- text = self._do_auto_links(text)
1251
-
1252
- text = self._encode_amps_and_angles(text)
1253
-
1254
- text = self._do_italics_and_bold(text)
1255
-
1256
- # Do hard breaks
1257
- text = re.sub(r" {2,}\n(?!\<(?:\/?(ul|ol|li))\>)", "<br%s\n" % self.empty_element_suffix, text)
1258
-
1259
- return text
1260
-
1261
- # "Sorta" because auto-links are identified as "tag" tokens.
1262
- _sorta_html_tokenize_re = re.compile(r"""
1263
- (
1264
- \\* # escapes
1265
- (?:
1266
- # tag
1267
- </?
1268
- (?:\w+) # tag name
1269
- (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes
1270
- \s*/?>
1271
- |
1272
- # auto-link (e.g., <http://www.activestate.com/>)
1273
- <[\w~:/?#\[\]@!$&'\(\)*+,;%=\.\\-]+>
1274
- |
1275
- <!--.*?--> # comment
1276
- |
1277
- <\?.*?\?> # processing instruction
1278
- )
1279
- )
1280
- """, re.X)
1281
-
1282
- @mark_stage(Stage.ESCAPE_SPECIAL)
1283
- def _escape_special_chars(self, text: str) -> str:
1284
- # Python markdown note: the HTML tokenization here differs from
1285
- # that in Markdown.pl, hence the behaviour for subtle cases can
1286
- # differ (I believe the tokenizer here does a better job because
1287
- # it isn't susceptible to unmatched '<' and '>' in HTML tags).
1288
- # Note, however, that '>' is not allowed in an auto-link URL
1289
- # here.
1290
- lead_escape_re = re.compile(r'^((?:\\\\)*(?!\\))')
1291
- escaped = []
1292
- is_html_markup = False
1293
- for token in self._sorta_html_tokenize_re.split(text):
1294
- # check token is preceded by 0 or more PAIRS of escapes, because escape pairs
1295
- # escape themselves and don't affect the token
1296
- if is_html_markup and lead_escape_re.match(token):
1297
- # Within tags/HTML-comments/auto-links, encode * and _
1298
- # so they don't conflict with their use in Markdown for
1299
- # italics and strong. We're replacing each such
1300
- # character with its corresponding MD5 checksum value;
1301
- # this is likely overkill, but it should prevent us from
1302
- # colliding with the escape values by accident.
1303
- escape_seq, token = lead_escape_re.split(token)[1:] or ('', token)
1304
- escaped.append(
1305
- escape_seq.replace('\\\\', self._escape_table['\\'])
1306
- + token.replace('*', self._escape_table['*'])
1307
- .replace('_', self._escape_table['_'])
1308
- )
1309
- else:
1310
- escaped.append(self._encode_backslash_escapes(token.replace('\\<', '&lt;')))
1311
- is_html_markup = not is_html_markup
1312
- return ''.join(escaped)
1313
-
1314
- @mark_stage(Stage.HASH_HTML)
1315
- def _hash_html_spans(self, text: str) -> str:
1316
- # Used for safe_mode.
1317
-
1318
- def _is_auto_link(s):
1319
- if ':' in s and self._auto_link_re.match(s):
1320
- return True
1321
- elif '@' in s and self._auto_email_link_re.match(s):
1322
- return True
1323
- return False
1324
-
1325
- def _is_code_span(index, token):
1326
- try:
1327
- if token == '<code>':
1328
- peek_tokens = split_tokens[index: index + 3]
1329
- elif token == '</code>':
1330
- peek_tokens = split_tokens[index - 2: index + 1]
1331
- else:
1332
- return False
1333
- except IndexError:
1334
- return False
1335
-
1336
- return re.match(r'<code>md5-[A-Fa-f0-9]{32}</code>', ''.join(peek_tokens))
1337
-
1338
- def _is_comment(token):
1339
- if self.safe_mode == 'replace':
1340
- # don't bother processing each section of comment in replace mode. Just do the whole thing
1341
- return
1342
- return re.match(r'(<!--)(.*)(-->)', token)
1343
-
1344
- def _hash(token):
1345
- key = _hash_text(token)
1346
- self.html_spans[key] = token
1347
- return key
1348
-
1349
- tokens = []
1350
- split_tokens = self._sorta_html_tokenize_re.split(text)
1351
- is_html_markup = False
1352
- for index, token in enumerate(split_tokens):
1353
- if is_html_markup and not _is_auto_link(token) and not _is_code_span(index, token):
1354
- is_comment = _is_comment(token)
1355
- if is_comment:
1356
- tokens.append(_hash(self._sanitize_html(is_comment.group(1))))
1357
- # sanitise but leave comment body intact for further markdown processing
1358
- tokens.append(self._sanitize_html(is_comment.group(2)))
1359
- tokens.append(_hash(self._sanitize_html(is_comment.group(3))))
1360
- else:
1361
- tokens.append(_hash(self._sanitize_html(token)))
1362
- else:
1363
- tokens.append(self._encode_incomplete_tags(token))
1364
- is_html_markup = not is_html_markup
1365
- return ''.join(tokens)
1366
-
1367
- def _unhash_html_spans(self, text: str) -> str:
1368
- for key, sanitized in list(self.html_spans.items()):
1369
- text = text.replace(key, sanitized)
1370
- return text
1371
-
1372
- def _sanitize_html(self, s: str) -> str:
1373
- if self.safe_mode == "replace":
1374
- return self.html_removed_text
1375
- elif self.safe_mode == "escape":
1376
- replacements = [
1377
- ('&', '&amp;'),
1378
- ('<', '&lt;'),
1379
- ('>', '&gt;'),
1380
- ]
1381
- for before, after in replacements:
1382
- s = s.replace(before, after)
1383
- return s
1384
- else:
1385
- raise MarkdownError("invalid value for 'safe_mode': %r (must be "
1386
- "'escape' or 'replace')" % self.safe_mode)
1387
-
1388
- _inline_link_title = re.compile(r'''
1389
- ( # \1
1390
- [ \t]+
1391
- (['"]) # quote char = \2
1392
- (?P<title>.*?)
1393
- \2
1394
- )? # title is optional
1395
- \)$
1396
- ''', re.X | re.S)
1397
- _tail_of_reference_link_re = re.compile(r'''
1398
- # Match tail of: [text][id]
1399
- [ ]? # one optional space
1400
- (?:\n[ ]*)? # one optional newline followed by spaces
1401
- \[
1402
- (?P<id>.*?)
1403
- \]
1404
- ''', re.X | re.S)
1405
-
1406
- _whitespace = re.compile(r'\s*')
1407
-
1408
- _strip_anglebrackets = re.compile(r'<(.*)>.*')
1409
-
1410
- def _find_non_whitespace(self, text: str, start: int) -> int:
1411
- """Returns the index of the first non-whitespace character in text
1412
- after (and including) start
1413
- """
1414
- match = self._whitespace.match(text, start)
1415
- return match.end() if match else len(text)
1416
-
1417
- def _find_balanced(self, text: str, start: int, open_c: str, close_c: str) -> int:
1418
- """Returns the index where the open_c and close_c characters balance
1419
- out - the same number of open_c and close_c are encountered - or the
1420
- end of string if it's reached before the balance point is found.
1421
- """
1422
- i = start
1423
- l = len(text)
1424
- count = 1
1425
- while count > 0 and i < l:
1426
- if text[i] == open_c:
1427
- count += 1
1428
- elif text[i] == close_c:
1429
- count -= 1
1430
- i += 1
1431
- return i
1432
-
1433
- def _extract_url_and_title(self, text: str, start: int) -> Union[Tuple[str, str, int], Tuple[None, None, None]]:
1434
- """Extracts the url and (optional) title from the tail of a link"""
1435
- # text[start] equals the opening parenthesis
1436
- idx = self._find_non_whitespace(text, start+1)
1437
- if idx == len(text):
1438
- return None, None, None
1439
- end_idx = idx
1440
- has_anglebrackets = text[idx] == "<"
1441
- if has_anglebrackets:
1442
- end_idx = self._find_balanced(text, end_idx+1, "<", ">")
1443
- end_idx = self._find_balanced(text, end_idx, "(", ")")
1444
- match = self._inline_link_title.search(text, idx, end_idx)
1445
- if not match:
1446
- return None, None, None
1447
- url, title = text[idx:match.start()], match.group("title")
1448
- if has_anglebrackets:
1449
- url = self._strip_anglebrackets.sub(r'\1', url)
1450
- return url, title, end_idx
1451
-
1452
- # https://developer.mozilla.org/en-US/docs/web/http/basics_of_http/data_urls
1453
- # https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types
1454
- _data_url_re = re.compile(r'''
1455
- data:
1456
- # in format type/subtype;parameter=optional
1457
- (?P<mime>\w+/[\w+\.-]+(?:;\w+=[\w+\.-]+)?)?
1458
- # optional base64 token
1459
- (?P<token>;base64)?
1460
- ,(?P<data>.*)
1461
- ''', re.X)
1462
-
1463
- def _protect_url(self, url: str) -> str:
1464
- '''
1465
- Function that passes a URL through `_html_escape_url` to remove any nasty characters,
1466
- and then hashes the now "safe" URL to prevent other safety mechanisms from tampering
1467
- with it (eg: escaping "&" in URL parameters)
1468
- '''
1469
- data_url = self._data_url_re.match(url)
1470
- charset = None
1471
- if data_url is not None:
1472
- mime = data_url.group('mime') or ''
1473
- if mime.startswith('image/') and data_url.group('token') == ';base64':
1474
- charset='base64'
1475
- url = _html_escape_url(url, safe_mode=self.safe_mode, charset=charset)
1476
- key = _hash_text(url)
1477
- self._escape_table[url] = key
1478
- return key
1479
-
1480
- _safe_protocols = r'(?:https?|ftp):\/\/|(?:mailto|tel):'
1481
-
1482
- @property
1483
- def _safe_href(self):
1484
- '''
1485
- _safe_href is adapted from pagedown's Markdown.Sanitizer.js
1486
- From: https://github.com/StackExchange/pagedown/blob/master/LICENSE.txt
1487
- Original Showdown code copyright (c) 2007 John Fraser
1488
- Modifications and bugfixes (c) 2009 Dana Robinson
1489
- Modifications and bugfixes (c) 2009-2014 Stack Exchange Inc.
1490
- '''
1491
- safe = r'-\w'
1492
- # omitted ['"<>] for XSS reasons
1493
- less_safe = r'#/\.!#$%&\(\)\+,/:;=\?@\[\]^`\{\}\|~'
1494
- # dot seperated hostname, optional port number, not followed by protocol seperator
1495
- domain = r'(?:[%s]+(?:\.[%s]+)*)(?:(?<!tel):\d+/?)?(?![^:/]*:/*)' % (safe, safe)
1496
- fragment = r'[%s]*' % (safe + less_safe)
1497
-
1498
- return re.compile(r'^(?:(%s)?(%s)(%s)|(#|\.{,2}/)(%s))$' % (self._safe_protocols, domain, fragment, fragment), re.I)
1499
-
1500
- @mark_stage(Stage.LINKS)
1501
- def _do_links(self, text: str) -> str:
1502
- """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
1503
-
1504
- This is a combination of Markdown.pl's _DoAnchors() and
1505
- _DoImages(). They are done together because that simplified the
1506
- approach. It was necessary to use a different approach than
1507
- Markdown.pl because of the lack of atomic matching support in
1508
- Python's regex engine used in $g_nested_brackets.
1509
- """
1510
- MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24
1511
-
1512
- # `anchor_allowed_pos` is used to support img links inside
1513
- # anchors, but not anchors inside anchors. An anchor's start
1514
- # pos must be `>= anchor_allowed_pos`.
1515
- anchor_allowed_pos = 0
1516
-
1517
- curr_pos = 0
1518
- while True: # Handle the next link.
1519
- # The next '[' is the start of:
1520
- # - an inline anchor: [text](url "title")
1521
- # - a reference anchor: [text][id]
1522
- # - an inline img: ![text](url "title")
1523
- # - a reference img: ![text][id]
1524
- # - a footnote ref: [^id]
1525
- # (Only if 'footnotes' extra enabled)
1526
- # - a footnote defn: [^id]: ...
1527
- # (Only if 'footnotes' extra enabled) These have already
1528
- # been stripped in _strip_footnote_definitions() so no
1529
- # need to watch for them.
1530
- # - a link definition: [id]: url "title"
1531
- # These have already been stripped in
1532
- # _strip_link_definitions() so no need to watch for them.
1533
- # - not markup: [...anything else...
1534
- try:
1535
- start_idx = text.index('[', curr_pos)
1536
- except ValueError:
1537
- break
1538
- text_length = len(text)
1539
-
1540
- # Find the matching closing ']'.
1541
- # Markdown.pl allows *matching* brackets in link text so we
1542
- # will here too. Markdown.pl *doesn't* currently allow
1543
- # matching brackets in img alt text -- we'll differ in that
1544
- # regard.
1545
- bracket_depth = 0
1546
- for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
1547
- text_length)):
1548
- ch = text[p]
1549
- if ch == ']':
1550
- bracket_depth -= 1
1551
- if bracket_depth < 0:
1552
- break
1553
- elif ch == '[':
1554
- bracket_depth += 1
1555
- else:
1556
- # Closing bracket not found within sentinel length.
1557
- # This isn't markup.
1558
- curr_pos = start_idx + 1
1559
- continue
1560
- link_text = text[start_idx+1:p]
1561
-
1562
- # Fix for issue 341 - Injecting XSS into link text
1563
- if self.safe_mode:
1564
- link_text = self._hash_html_spans(link_text)
1565
- link_text = self._unhash_html_spans(link_text)
1566
-
1567
- # Possibly a footnote ref?
1568
- if "footnotes" in self.extras and link_text.startswith("^"):
1569
- normed_id = re.sub(r'\W', '-', link_text[1:])
1570
- if normed_id in self.footnotes:
1571
- self.footnote_ids.append(normed_id)
1572
- result = '<sup class="footnote-ref" id="fnref-%s">' \
1573
- '<a href="#fn-%s">%s</a></sup>' \
1574
- % (normed_id, normed_id, len(self.footnote_ids))
1575
- text = text[:start_idx] + result + text[p+1:]
1576
- else:
1577
- # This id isn't defined, leave the markup alone.
1578
- curr_pos = p+1
1579
- continue
1580
-
1581
- # Now determine what this is by the remainder.
1582
- p += 1
1583
-
1584
- # Inline anchor or img?
1585
- if text[p:p + 1] == '(': # attempt at perf improvement
1586
- url, title, url_end_idx = self._extract_url_and_title(text, p)
1587
- if url is not None:
1588
- # Handle an inline anchor or img.
1589
- is_img = start_idx > 0 and text[start_idx-1] == "!"
1590
- if is_img:
1591
- start_idx -= 1
1592
-
1593
- # We've got to encode these to avoid conflicting
1594
- # with italics/bold.
1595
- url = url.replace('*', self._escape_table['*']) \
1596
- .replace('_', self._escape_table['_'])
1597
- if title:
1598
- title_str = ' title="%s"' % (
1599
- _xml_escape_attr(title)
1600
- .replace('*', self._escape_table['*'])
1601
- .replace('_', self._escape_table['_']))
1602
- else:
1603
- title_str = ''
1604
- if is_img:
1605
- img_class_str = self._html_class_str_from_tag("img")
1606
- result = '<img src="%s" alt="%s"%s%s%s' \
1607
- % (self._protect_url(url),
1608
- _xml_escape_attr(link_text),
1609
- title_str,
1610
- img_class_str,
1611
- self.empty_element_suffix)
1612
- if "smarty-pants" in self.extras:
1613
- result = result.replace('"', self._escape_table['"'])
1614
- curr_pos = start_idx + len(result)
1615
- anchor_allowed_pos = start_idx + len(result)
1616
- text = text[:start_idx] + result + text[url_end_idx:]
1617
- elif start_idx >= anchor_allowed_pos:
1618
- safe_link = self._safe_href.match(url)
1619
- if self.safe_mode and not safe_link:
1620
- result_head = '<a href="#"%s>' % (title_str)
1621
- else:
1622
- result_head = '<a href="%s"%s>' % (self._protect_url(url), title_str)
1623
- result = '%s%s</a>' % (result_head, link_text)
1624
- if "smarty-pants" in self.extras:
1625
- result = result.replace('"', self._escape_table['"'])
1626
- # <img> allowed from curr_pos on, <a> from
1627
- # anchor_allowed_pos on.
1628
- curr_pos = start_idx + len(result_head)
1629
- anchor_allowed_pos = start_idx + len(result)
1630
- text = text[:start_idx] + result + text[url_end_idx:]
1631
- else:
1632
- # Anchor not allowed here.
1633
- curr_pos = start_idx + 1
1634
- continue
1635
-
1636
- # Reference anchor or img?
1637
- else:
1638
- match = self._tail_of_reference_link_re.match(text, p)
1639
- if match:
1640
- # Handle a reference-style anchor or img.
1641
- is_img = start_idx > 0 and text[start_idx-1] == "!"
1642
- if is_img:
1643
- start_idx -= 1
1644
- link_id = match.group("id").lower()
1645
- if not link_id:
1646
- link_id = link_text.lower() # for links like [this][]
1647
- if link_id in self.urls:
1648
- url = self.urls[link_id]
1649
- # We've got to encode these to avoid conflicting
1650
- # with italics/bold.
1651
- url = url.replace('*', self._escape_table['*']) \
1652
- .replace('_', self._escape_table['_'])
1653
- title = self.titles.get(link_id)
1654
- if title:
1655
- title = _xml_escape_attr(title) \
1656
- .replace('*', self._escape_table['*']) \
1657
- .replace('_', self._escape_table['_'])
1658
- title_str = ' title="%s"' % title
1659
- else:
1660
- title_str = ''
1661
- if is_img:
1662
- img_class_str = self._html_class_str_from_tag("img")
1663
- result = '<img src="%s" alt="%s"%s%s%s' \
1664
- % (self._protect_url(url),
1665
- _xml_escape_attr(link_text),
1666
- title_str,
1667
- img_class_str,
1668
- self.empty_element_suffix)
1669
- if "smarty-pants" in self.extras:
1670
- result = result.replace('"', self._escape_table['"'])
1671
- curr_pos = start_idx + len(result)
1672
- text = text[:start_idx] + result + text[match.end():]
1673
- elif start_idx >= anchor_allowed_pos:
1674
- if self.safe_mode and not self._safe_href.match(url):
1675
- result_head = '<a href="#"%s>' % (title_str)
1676
- else:
1677
- result_head = '<a href="%s"%s>' % (self._protect_url(url), title_str)
1678
- result = '%s%s</a>' % (result_head, link_text)
1679
- if "smarty-pants" in self.extras:
1680
- result = result.replace('"', self._escape_table['"'])
1681
- # <img> allowed from curr_pos on, <a> from
1682
- # anchor_allowed_pos on.
1683
- curr_pos = start_idx + len(result_head)
1684
- anchor_allowed_pos = start_idx + len(result)
1685
- text = text[:start_idx] + result + text[match.end():]
1686
- else:
1687
- # Anchor not allowed here.
1688
- curr_pos = start_idx + 1
1689
- else:
1690
- # This id isn't defined, leave the markup alone.
1691
- # set current pos to end of link title and continue from there
1692
- curr_pos = p
1693
- continue
1694
-
1695
- # Otherwise, it isn't markup.
1696
- curr_pos = start_idx + 1
1697
-
1698
- return text
1699
-
1700
- def header_id_from_text(self,
1701
- text: str,
1702
- prefix: str,
1703
- n: Optional[int] = None
1704
- ) -> str:
1705
- """Generate a header id attribute value from the given header
1706
- HTML content.
1707
-
1708
- This is only called if the "header-ids" extra is enabled.
1709
- Subclasses may override this for different header ids.
1710
-
1711
- @param text {str} The text of the header tag
1712
- @param prefix {str} The requested prefix for header ids. This is the
1713
- value of the "header-ids" extra key, if any. Otherwise, None.
1714
- @param n {int} (unused) The <hN> tag number, i.e. `1` for an <h1> tag.
1715
- @returns {str} The value for the header tag's "id" attribute. Return
1716
- None to not have an id attribute and to exclude this header from
1717
- the TOC (if the "toc" extra is specified).
1718
- """
1719
- header_id = _slugify(text)
1720
- if prefix and isinstance(prefix, str):
1721
- header_id = prefix + '-' + header_id
1722
-
1723
- self._count_from_header_id[header_id] += 1
1724
- if 0 == len(header_id) or self._count_from_header_id[header_id] > 1:
1725
- header_id += '-%s' % self._count_from_header_id[header_id]
1726
-
1727
- return header_id
1728
-
1729
- def _header_id_exists(self, text: str) -> bool:
1730
- header_id = _slugify(text)
1731
- prefix = self.extras['header-ids'].get('prefix')
1732
- if prefix and isinstance(prefix, str):
1733
- header_id = prefix + '-' + header_id
1734
- return header_id in self._count_from_header_id or header_id in map(lambda x: x[1], self._toc)
1735
-
1736
- def _toc_add_entry(self, level: int, id: str, name: str) -> None:
1737
- if level > self._toc_depth:
1738
- return
1739
- if self._toc is None:
1740
- self._toc = []
1741
- self._toc.append((level, id, self._unescape_special_chars(name)))
1742
-
1743
- _h_re_base = r'''
1744
- (^(.+)[ \t]{0,99}\n(=+|-+)[ \t]*\n+)
1745
- |
1746
- (^(\#{1,6}) # \1 = string of #'s
1747
- [ \t]%s
1748
- (.+?) # \2 = Header text
1749
- [ \t]{0,99}
1750
- (?<!\\) # ensure not an escaped trailing '#'
1751
- \#* # optional closing #'s (not counted)
1752
- \n+
1753
- )
1754
- '''
1755
-
1756
- _h_re = re.compile(_h_re_base % '*', re.X | re.M)
1757
- _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
1758
-
1759
- def _h_sub(self, match: re.Match) -> str:
1760
- '''Handles processing markdown headers'''
1761
- if match.group(1) is not None and match.group(3) == "-":
1762
- return match.group(1)
1763
- elif match.group(1) is not None:
1764
- # Setext header
1765
- n = {"=": 1, "-": 2}[match.group(3)[0]]
1766
- header_group = match.group(2)
1767
- else:
1768
- # atx header
1769
- n = len(match.group(5))
1770
- header_group = match.group(6)
1771
-
1772
- demote_headers = self.extras.get("demote-headers")
1773
- if demote_headers:
1774
- n = min(n + demote_headers, 6)
1775
- header_id_attr = ""
1776
- if "header-ids" in self.extras:
1777
- header_id = self.header_id_from_text(header_group,
1778
- self.extras["header-ids"].get('prefix'), n)
1779
- if header_id:
1780
- header_id_attr = ' id="%s"' % header_id
1781
- html = self._run_span_gamut(header_group)
1782
- if "toc" in self.extras and header_id:
1783
- self._toc_add_entry(n, header_id, html)
1784
- return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
1785
-
1786
- _h_tag_re = re.compile(r'''
1787
- ^<h([1-6])(.*)> # \1 tag num, \2 attrs
1788
- (.*) # \3 text
1789
- </h\1>
1790
- ''', re.X | re.M)
1791
-
1792
- def _h_tag_sub(self, match: re.Match) -> str:
1793
- '''Different to `_h_sub` in that this function handles existing HTML headers'''
1794
- text = match.string[match.start(): match.end()]
1795
- h_level = int(match.group(1))
1796
- # extract id= attr from tag, trying to account for regex "misses"
1797
- id_attr = (re.match(r'.*?id=(\S+)?.*', match.group(2) or '') or '')
1798
- if id_attr:
1799
- # if id attr exists, extract that
1800
- id_attr = id_attr.group(1) or ''
1801
- id_attr = id_attr.strip('\'" ')
1802
- h_text = match.group(3)
1803
-
1804
- # check if header was already processed (ie: was a markdown header rather than HTML)
1805
- if id_attr and self._header_id_exists(id_attr):
1806
- return text
1807
-
1808
- # generate new header id if none existed
1809
- header_id = id_attr or self.header_id_from_text(h_text, self.extras['header-ids'].get('prefix'), h_level)
1810
- if "toc" in self.extras:
1811
- self._toc_add_entry(h_level, header_id, h_text)
1812
- if header_id and not id_attr:
1813
- # '<h[digit]' + new ID + '...'
1814
- return text[:3] + ' id="%s"' % header_id + text[3:]
1815
- return text
1816
-
1817
- @mark_stage(Stage.HEADERS)
1818
- def _do_headers(self, text: str) -> str:
1819
- # Setext-style headers:
1820
- # Header 1
1821
- # ========
1822
- #
1823
- # Header 2
1824
- # --------
1825
-
1826
- # atx-style headers:
1827
- # # Header 1
1828
- # ## Header 2
1829
- # ## Header 2 with closing hashes ##
1830
- # ...
1831
- # ###### Header 6
1832
-
1833
- if 'tag-friendly' in self.extras:
1834
- return self._h_re_tag_friendly.sub(self._h_sub, text)
1835
- return self._h_re.sub(self._h_sub, text)
1836
-
1837
- _marker_ul_chars = '*+-'
1838
- _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
1839
- _marker_ul = '(?:[%s])' % _marker_ul_chars
1840
- _marker_ol = r'(?:\d+\.)'
1841
-
1842
- def _list_sub(self, match: re.Match) -> str:
1843
- lst = match.group(1)
1844
- lst_type = match.group(4) in self._marker_ul_chars and "ul" or "ol"
1845
-
1846
- if lst_type == 'ol' and match.group(4) != '1.':
1847
- # if list doesn't start at 1 then set the ol start attribute
1848
- lst_opts = ' start="%s"' % match.group(4)[:-1]
1849
- else:
1850
- lst_opts = ''
1851
-
1852
- lst_opts = lst_opts + self._html_class_str_from_tag(lst_type)
1853
-
1854
- result = self._process_list_items(lst)
1855
- if self.list_level:
1856
- return "<%s%s>\n%s</%s>\n" % (lst_type, lst_opts, result, lst_type)
1857
- else:
1858
- return "<%s%s>\n%s</%s>\n\n" % (lst_type, lst_opts, result, lst_type)
1859
-
1860
- @mark_stage(Stage.LISTS)
1861
- def _do_lists(self, text: str) -> str:
1862
- # Form HTML ordered (numbered) and unordered (bulleted) lists.
1863
-
1864
- # Iterate over each *non-overlapping* list match.
1865
- pos = 0
1866
- while True:
1867
- # Find the *first* hit for either list style (ul or ol). We
1868
- # match ul and ol separately to avoid adjacent lists of different
1869
- # types running into each other (see issue #16).
1870
- hits = []
1871
- for marker_pat in (self._marker_ul, self._marker_ol):
1872
- less_than_tab = self.tab_width - 1
1873
- other_marker_pat = self._marker_ul if marker_pat == self._marker_ol else self._marker_ol
1874
- whole_list = r'''
1875
- ( # \1 = whole list
1876
- ( # \2
1877
- ([ ]{0,%d}) # \3 = the indentation level of the list item marker
1878
- (%s) # \4 = first list item marker
1879
- [ \t]+
1880
- (?!\ *\4\ ) # '- - - ...' isn't a list. See 'not_quite_a_list' test case.
1881
- )
1882
- (?:.+?)
1883
- ( # \5
1884
- \Z
1885
- |
1886
- \n{2,}
1887
- (?=\S)
1888
- (?! # Negative lookahead for another list item marker
1889
- [ \t]*
1890
- %s[ \t]+
1891
- )
1892
- |
1893
- \n+
1894
- (?=
1895
- \3 # lookahead for a different style of list item marker
1896
- %s[ \t]+
1897
- )
1898
- )
1899
- )
1900
- ''' % (less_than_tab, marker_pat, marker_pat, other_marker_pat)
1901
- if self.list_level: # sub-list
1902
- list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
1903
- else:
1904
- list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
1905
- re.X | re.M | re.S)
1906
- match = list_re.search(text, pos)
1907
- if match:
1908
- hits.append((match.start(), match))
1909
- if not hits:
1910
- break
1911
- hits.sort()
1912
- match = hits[0][1]
1913
- start, end = match.span()
1914
- middle = self._list_sub(match)
1915
- text = text[:start] + middle + text[end:]
1916
- pos = start + len(middle) # start pos for next attempted match
1917
-
1918
- return text
1919
-
1920
- _list_item_re = re.compile(r'''
1921
- (\n)? # leading line = \1
1922
- (^[ \t]*) # leading whitespace = \2
1923
- (?P<marker>%s) [ \t]+ # list marker = \3
1924
- ((?:.+?) # list item text = \4
1925
- (\n{1,2})) # eols = \5
1926
- (?= \n* (\Z | \2 (?P<next_marker>%s) [ \t]+))
1927
- ''' % (_marker_any, _marker_any),
1928
- re.M | re.X | re.S)
1929
-
1930
- _task_list_item_re = re.compile(r'''
1931
- (\[[\ xX]\])[ \t]+ # tasklist marker = \1
1932
- (.*) # list item text = \2
1933
- ''', re.M | re.X | re.S)
1934
-
1935
- _task_list_warpper_str = r'<input type="checkbox" class="task-list-item-checkbox" %sdisabled> %s'
1936
-
1937
- def _task_list_item_sub(self, match: re.Match) -> str:
1938
- marker = match.group(1)
1939
- item_text = match.group(2)
1940
- if marker in ['[x]','[X]']:
1941
- return self._task_list_warpper_str % ('checked ', item_text)
1942
- elif marker == '[ ]':
1943
- return self._task_list_warpper_str % ('', item_text)
1944
- # returning None has same effect as returning empty str, but only
1945
- # one makes the type checker happy
1946
- return ''
1947
-
1948
- _last_li_endswith_two_eols = False
1949
- def _list_item_sub(self, match: re.Match) -> str:
1950
- item = match.group(4)
1951
- leading_line = match.group(1)
1952
- if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1953
- item = self._uniform_outdent(item, min_outdent=' ', max_outdent=self.tab)[1]
1954
- item = self._run_block_gamut(item)
1955
- else:
1956
- # Recursion for sub-lists:
1957
- item = self._do_lists(self._uniform_outdent(item, min_outdent=' ')[1])
1958
- if item.endswith('\n'):
1959
- item = item[:-1]
1960
- item = self._run_span_gamut(item)
1961
- self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
1962
-
1963
- if "task_list" in self.extras:
1964
- item = self._task_list_item_re.sub(self._task_list_item_sub, item)
1965
-
1966
- return "<li>%s</li>\n" % item
1967
-
1968
- def _process_list_items(self, list_str: str) -> str:
1969
- # Process the contents of a single ordered or unordered list,
1970
- # splitting it into individual list items.
1971
-
1972
- # The $g_list_level global keeps track of when we're inside a list.
1973
- # Each time we enter a list, we increment it; when we leave a list,
1974
- # we decrement. If it's zero, we're not in a list anymore.
1975
- #
1976
- # We do this because when we're not inside a list, we want to treat
1977
- # something like this:
1978
- #
1979
- # I recommend upgrading to version
1980
- # 8. Oops, now this line is treated
1981
- # as a sub-list.
1982
- #
1983
- # As a single paragraph, despite the fact that the second line starts
1984
- # with a digit-period-space sequence.
1985
- #
1986
- # Whereas when we're inside a list (or sub-list), that line will be
1987
- # treated as the start of a sub-list. What a kludge, huh? This is
1988
- # an aspect of Markdown's syntax that's hard to parse perfectly
1989
- # without resorting to mind-reading. Perhaps the solution is to
1990
- # change the syntax rules such that sub-lists must start with a
1991
- # starting cardinal number; e.g. "1." or "a.".
1992
- self.list_level += 1
1993
- self._last_li_endswith_two_eols = False
1994
- list_str = list_str.rstrip('\n') + '\n'
1995
- list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1996
- self.list_level -= 1
1997
- return list_str
1998
-
1999
- def _get_pygments_lexer(self, lexer_name: str):
2000
- '''
2001
- Returns:
2002
- `pygments.Lexer` or None if a lexer matching `lexer_name` is
2003
- not found
2004
- '''
2005
- try:
2006
- from pygments import lexers, util
2007
- except ImportError:
2008
- return None
2009
- try:
2010
- return lexers.get_lexer_by_name(lexer_name)
2011
- except util.ClassNotFound:
2012
- return None
2013
-
2014
- def _color_with_pygments(
2015
- self,
2016
- codeblock: str,
2017
- lexer,
2018
- **formatter_opts
2019
- ) -> str:
2020
- '''
2021
- TODO: this function is only referenced by the `FencedCodeBlocks`
2022
- extra. May be worth moving over there
2023
-
2024
- Args:
2025
- codeblock: the codeblock to highlight
2026
- lexer (pygments.Lexer): lexer to use
2027
- formatter_opts: pygments HtmlFormatter options
2028
- '''
2029
- import pygments
2030
- import pygments.formatters
2031
-
2032
- class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
2033
- def _wrap_code(self, inner):
2034
- """A function for use in a Pygments Formatter which
2035
- wraps in <code> tags.
2036
- """
2037
- yield 0, "<code>"
2038
- for tup in inner:
2039
- yield tup
2040
- yield 0, "</code>"
2041
-
2042
- def _add_newline(self, inner):
2043
- # Add newlines around the inner contents so that _strict_tag_block_re matches the outer div.
2044
- yield 0, "\n"
2045
- yield from inner
2046
- yield 0, "\n"
2047
-
2048
- def wrap(self, source, outfile=None):
2049
- """Return the source with a code, pre, and div."""
2050
- if outfile is None:
2051
- # pygments >= 2.12
2052
- return self._add_newline(self._wrap_pre(self._wrap_code(source)))
2053
- else:
2054
- # pygments < 2.12
2055
- return self._wrap_div(self._add_newline(self._wrap_pre(self._wrap_code(source))))
2056
-
2057
- formatter_opts.setdefault("cssclass", "codehilite")
2058
- formatter = HtmlCodeFormatter(**formatter_opts)
2059
- return pygments.highlight(codeblock, lexer, formatter)
2060
-
2061
- def _code_block_sub(self, match: re.Match) -> str:
2062
- codeblock = match.group(1)
2063
- codeblock = self._outdent(codeblock)
2064
- codeblock = self._detab(codeblock)
2065
- codeblock = codeblock.lstrip('\n') # trim leading newlines
2066
- codeblock = codeblock.rstrip() # trim trailing whitespace
2067
-
2068
- pre_class_str = self._html_class_str_from_tag("pre")
2069
- code_class_str = self._html_class_str_from_tag("code")
2070
-
2071
- codeblock = self._encode_code(codeblock)
2072
-
2073
- return "\n<pre%s><code%s>%s\n</code></pre>\n" % (
2074
- pre_class_str, code_class_str, codeblock)
2075
-
2076
- def _html_class_str_from_tag(self, tag: str) -> str:
2077
- """Get the appropriate ' class="..."' string (note the leading
2078
- space), if any, for the given tag.
2079
- """
2080
- if "html-classes" not in self.extras:
2081
- return ""
2082
- try:
2083
- html_classes_from_tag = self.extras["html-classes"]
2084
- except TypeError:
2085
- return ""
2086
- else:
2087
- if isinstance(html_classes_from_tag, dict):
2088
- if tag in html_classes_from_tag:
2089
- return ' class="%s"' % html_classes_from_tag[tag]
2090
- return ""
2091
-
2092
- @mark_stage(Stage.CODE_BLOCKS)
2093
- def _do_code_blocks(self, text: str) -> str:
2094
- """Process Markdown `<pre><code>` blocks."""
2095
- code_block_re = re.compile(r'''
2096
- (?:\n\n|\A\n?)
2097
- ( # $1 = the code block -- one or more lines, starting with a space/tab
2098
- (?:
2099
- (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
2100
- .*\n+
2101
- )+
2102
- )
2103
- ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
2104
- # Lookahead to make sure this block isn't already in a code block.
2105
- # Needed when syntax highlighting is being used.
2106
- (?!([^<]|<(/?)span)*\</code\>)
2107
- ''' % (self.tab_width, self.tab_width),
2108
- re.M | re.X)
2109
- return code_block_re.sub(self._code_block_sub, text)
2110
-
2111
- # Rules for a code span:
2112
- # - backslash escapes are not interpreted in a code span
2113
- # - to include one or or a run of more backticks the delimiters must
2114
- # be a longer run of backticks
2115
- # - cannot start or end a code span with a backtick; pad with a
2116
- # space and that space will be removed in the emitted HTML
2117
- # See `test/tm-cases/escapes.text` for a number of edge-case
2118
- # examples.
2119
- _code_span_re = re.compile(r'''
2120
- (?<!\\)
2121
- (`+) # \1 = Opening run of `
2122
- (?!`) # See Note A test/tm-cases/escapes.text
2123
- (.+?) # \2 = The code block
2124
- (?<!`)
2125
- \1 # Matching closer
2126
- (?!`)
2127
- ''', re.X | re.S)
2128
-
2129
- def _code_span_sub(self, match: re.Match) -> str:
2130
- c = match.group(2).strip(" \t")
2131
- c = self._encode_code(c)
2132
- return "<code%s>%s</code>" % (self._html_class_str_from_tag("code"), c)
2133
-
2134
- @mark_stage(Stage.CODE_SPANS)
2135
- def _do_code_spans(self, text: str) -> str:
2136
- # * Backtick quotes are used for <code></code> spans.
2137
- #
2138
- # * You can use multiple backticks as the delimiters if you want to
2139
- # include literal backticks in the code span. So, this input:
2140
- #
2141
- # Just type ``foo `bar` baz`` at the prompt.
2142
- #
2143
- # Will translate to:
2144
- #
2145
- # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
2146
- #
2147
- # There's no arbitrary limit to the number of backticks you
2148
- # can use as delimters. If you need three consecutive backticks
2149
- # in your code, use four for delimiters, etc.
2150
- #
2151
- # * You can use spaces to get literal backticks at the edges:
2152
- #
2153
- # ... type `` `bar` `` ...
2154
- #
2155
- # Turns to:
2156
- #
2157
- # ... type <code>`bar`</code> ...
2158
- return self._code_span_re.sub(self._code_span_sub, text)
2159
-
2160
- def _encode_code(self, text: str) -> str:
2161
- """Encode/escape certain characters inside Markdown code runs.
2162
- The point is that in code, these characters are literals,
2163
- and lose their special Markdown meanings.
2164
- """
2165
- replacements = [
2166
- # Encode all ampersands; HTML entities are not
2167
- # entities within a Markdown code span.
2168
- ('&', '&amp;'),
2169
- # Do the angle bracket song and dance:
2170
- ('<', '&lt;'),
2171
- ('>', '&gt;'),
2172
- ]
2173
- for before, after in replacements:
2174
- text = text.replace(before, after)
2175
- hashed = _hash_text(text)
2176
- self._code_table[text] = hashed
2177
- return hashed
2178
-
2179
- _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]?)(?<=\S)\1", re.S)
2180
- _em_re = re.compile(r"(\*|_)(?=\S)(.*?\S)\1", re.S)
2181
-
2182
- @mark_stage(Stage.ITALIC_AND_BOLD)
2183
- def _do_italics_and_bold(self, text: str) -> str:
2184
- # <strong> must go first:
2185
- text = self._strong_re.sub(r"<strong>\2</strong>", text)
2186
- text = self._em_re.sub(r"<em>\2</em>", text)
2187
- return text
2188
-
2189
- _block_quote_base = r'''
2190
- ( # Wrap whole match in \1
2191
- (
2192
- ^[ \t]*>%s[ \t]? # '>' at the start of a line
2193
- .+\n # rest of the first line
2194
- (.+\n)* # subsequent consecutive lines
2195
- )+
2196
- )
2197
- '''
2198
- _block_quote_re = re.compile(_block_quote_base % '', re.M | re.X)
2199
- _block_quote_re_spoiler = re.compile(_block_quote_base % '[ \t]*?!?', re.M | re.X)
2200
- _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M)
2201
- _bq_one_level_re_spoiler = re.compile('^[ \t]*>[ \t]*?![ \t]?', re.M)
2202
- _bq_all_lines_spoilers = re.compile(r'\A(?:^[ \t]*>[ \t]*?!.*[\n\r]*)+\Z', re.M)
2203
- _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
2204
- def _dedent_two_spaces_sub(self, match: re.Match) -> str:
2205
- return re.sub(r'(?m)^ ', '', match.group(1))
2206
-
2207
- def _block_quote_sub(self, match: re.Match) -> str:
2208
- bq = match.group(1)
2209
- is_spoiler = 'spoiler' in self.extras and self._bq_all_lines_spoilers.match(bq)
2210
- # trim one level of quoting
2211
- if is_spoiler:
2212
- bq = self._bq_one_level_re_spoiler.sub('', bq)
2213
- else:
2214
- bq = self._bq_one_level_re.sub('', bq)
2215
- # trim whitespace-only lines
2216
- bq = self._ws_only_line_re.sub('', bq)
2217
- bq = self._run_block_gamut(bq) # recurse
2218
-
2219
- bq = re.sub('(?m)^', ' ', bq)
2220
- # These leading spaces screw with <pre> content, so we need to fix that:
2221
- bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
2222
-
2223
- if is_spoiler:
2224
- return '<blockquote class="spoiler">\n%s\n</blockquote>\n\n' % bq
2225
- else:
2226
- return '<blockquote>\n%s\n</blockquote>\n\n' % bq
2227
-
2228
- @mark_stage(Stage.BLOCK_QUOTES)
2229
- def _do_block_quotes(self, text: str) -> str:
2230
- if '>' not in text:
2231
- return text
2232
- if 'spoiler' in self.extras:
2233
- return self._block_quote_re_spoiler.sub(self._block_quote_sub, text)
2234
- else:
2235
- return self._block_quote_re.sub(self._block_quote_sub, text)
2236
-
2237
- @mark_stage(Stage.PARAGRAPHS)
2238
- def _form_paragraphs(self, text: str) -> str:
2239
- # Strip leading and trailing lines:
2240
- text = text.strip('\n')
2241
-
2242
- # Wrap <p> tags.
2243
- grafs = []
2244
- for i, graf in enumerate(re.split(r"\n{2,}", text)):
2245
- if graf in self.html_blocks:
2246
- # Unhashify HTML blocks
2247
- grafs.append(self.html_blocks[graf])
2248
- else:
2249
- cuddled_list = None
2250
- if "cuddled-lists" in self.extras:
2251
- # Need to put back trailing '\n' for `_list_item_re`
2252
- # match at the end of the paragraph.
2253
- li = self._list_item_re.search(graf + '\n')
2254
- # Two of the same list marker in this paragraph: a likely
2255
- # candidate for a list cuddled to preceding paragraph
2256
- # text (issue 33). Note the `[-1]` is a quick way to
2257
- # consider numeric bullets (e.g. "1." and "2.") to be
2258
- # equal.
2259
- if (li and len(li.group(2)) <= 3
2260
- and (
2261
- (li.group("next_marker") and li.group("marker")[-1] == li.group("next_marker")[-1])
2262
- or
2263
- li.group("next_marker") is None
2264
- )
2265
- ):
2266
- start = li.start()
2267
- cuddled_list = self._do_lists(graf[start:]).rstrip("\n")
2268
- if re.match(r'^<(?:ul|ol).*?>', cuddled_list):
2269
- graf = graf[:start]
2270
- else:
2271
- # Not quite a cuddled list. (See not_quite_a_list_cuddled_lists test case)
2272
- # Store as a simple paragraph.
2273
- graf = cuddled_list
2274
- cuddled_list = None
2275
-
2276
- # Wrap <p> tags.
2277
- graf = self._run_span_gamut(graf)
2278
- grafs.append("<p%s>" % self._html_class_str_from_tag('p') + graf.lstrip(" \t") + "</p>")
2279
-
2280
- if cuddled_list:
2281
- grafs.append(cuddled_list)
2282
-
2283
- return "\n\n".join(grafs)
2284
-
2285
- def _add_footnotes(self, text: str) -> str:
2286
- if self.footnotes:
2287
- footer = [
2288
- '<div class="footnotes">',
2289
- '<hr' + self.empty_element_suffix,
2290
- '<ol>',
2291
- ]
2292
-
2293
- if not self.footnote_title:
2294
- self.footnote_title = "Jump back to footnote %d in the text."
2295
- if not self.footnote_return_symbol:
2296
- self.footnote_return_symbol = "&#8617;"
2297
-
2298
- # self.footnotes is generated in _strip_footnote_definitions, which runs re.sub on the whole
2299
- # text. This means that the dict keys are inserted in order of appearance. Use the dict to
2300
- # sort footnote ids by that same order
2301
- self.footnote_ids.sort(key=lambda a: list(self.footnotes.keys()).index(a))
2302
- for i, id in enumerate(self.footnote_ids):
2303
- if i != 0:
2304
- footer.append('')
2305
- footer.append('<li id="fn-%s">' % id)
2306
- footer.append(self._run_block_gamut(self.footnotes[id]))
2307
- try:
2308
- backlink = ('<a href="#fnref-%s" ' +
2309
- 'class="footnoteBackLink" ' +
2310
- 'title="' + self.footnote_title + '">' +
2311
- self.footnote_return_symbol +
2312
- '</a>') % (id, i+1)
2313
- except TypeError:
2314
- log.debug("Footnote error. `footnote_title` "
2315
- "must include parameter. Using defaults.")
2316
- backlink = ('<a href="#fnref-%s" '
2317
- 'class="footnoteBackLink" '
2318
- 'title="Jump back to footnote %d in the text.">'
2319
- '&#8617;</a>' % (id, i+1))
2320
-
2321
- if footer[-1].endswith("</p>"):
2322
- footer[-1] = footer[-1][:-len("</p>")] \
2323
- + '&#160;' + backlink + "</p>"
2324
- else:
2325
- footer.append("\n<p>%s</p>" % backlink)
2326
- footer.append('</li>')
2327
- footer.append('</ol>')
2328
- footer.append('</div>')
2329
- return text + '\n\n' + '\n'.join(footer)
2330
- else:
2331
- return text
2332
-
2333
- _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
2334
- _naked_gt_re = re.compile(r'''(?<![a-z0-9?!/'"-])>''', re.I)
2335
-
2336
- def _encode_amps_and_angles(self, text: str) -> str:
2337
- # Smart processing for ampersands and angle brackets that need
2338
- # to be encoded.
2339
- text = _AMPERSAND_RE.sub('&amp;', text)
2340
-
2341
- # Encode naked <'s
2342
- text = self._naked_lt_re.sub('&lt;', text)
2343
-
2344
- # Encode naked >'s
2345
- # Note: Other markdown implementations (e.g. Markdown.pl, PHP
2346
- # Markdown) don't do this.
2347
- text = self._naked_gt_re.sub('&gt;', text)
2348
- return text
2349
-
2350
- _incomplete_tags_re = re.compile(r"<(!--|/?\w+?(?!\w)\s*?.+?(?:[\s/]+?|$))")
2351
-
2352
- def _encode_incomplete_tags(self, text: str) -> str:
2353
- if self.safe_mode not in ("replace", "escape"):
2354
- return text
2355
-
2356
- if text.endswith(">"):
2357
- return text # this is not an incomplete tag, this is a link in the form <http://x.y.z>
2358
-
2359
- def incomplete_tags_sub(match):
2360
- return match.group().replace('<', '&lt;')
2361
-
2362
- return self._incomplete_tags_re.sub(incomplete_tags_sub, text)
2363
-
2364
- def _encode_backslash_escapes(self, text: str) -> str:
2365
- for ch, escape in list(self._escape_table.items()):
2366
- text = text.replace("\\"+ch, escape)
2367
- return text
2368
-
2369
- _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
2370
- def _auto_link_sub(self, match: re.Match) -> str:
2371
- g1 = match.group(1)
2372
- return '<a href="%s">%s</a>' % (self._protect_url(g1), g1)
2373
-
2374
- _auto_email_link_re = re.compile(r"""
2375
- <
2376
- (?:mailto:)?
2377
- (
2378
- [-.\w]+
2379
- \@
2380
- [-\w]+(\.[-\w]+)*\.[a-z]+
2381
- )
2382
- >
2383
- """, re.I | re.X | re.U)
2384
- def _auto_email_link_sub(self, match: re.Match) -> str:
2385
- return self._encode_email_address(
2386
- self._unescape_special_chars(match.group(1)))
2387
-
2388
- def _do_auto_links(self, text: str) -> str:
2389
- text = self._auto_link_re.sub(self._auto_link_sub, text)
2390
- text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
2391
- return text
2392
-
2393
- def _encode_email_address(self, addr: str) -> str:
2394
- # Input: an email address, e.g. "foo@example.com"
2395
- #
2396
- # Output: the email address as a mailto link, with each character
2397
- # of the address encoded as either a decimal or hex entity, in
2398
- # the hopes of foiling most address harvesting spam bots. E.g.:
2399
- #
2400
- # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
2401
- # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
2402
- # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
2403
- #
2404
- # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
2405
- # mailing list: <http://tinyurl.com/yu7ue>
2406
- chars = [_xml_encode_email_char_at_random(ch)
2407
- for ch in "mailto:" + addr]
2408
- # Strip the mailto: from the visible part.
2409
- addr = '<a href="%s">%s</a>' \
2410
- % (''.join(chars), ''.join(chars[7:]))
2411
- return addr
2412
-
2413
- def _unescape_special_chars(self, text: str) -> str:
2414
- # Swap back in all the special characters we've hidden.
2415
- hashmap = tuple(self._escape_table.items()) + tuple(self._code_table.items())
2416
- # html_blocks table is in format {hash: item} compared to usual {item: hash}
2417
- hashmap += tuple(tuple(reversed(i)) for i in self.html_blocks.items())
2418
- while True:
2419
- orig_text = text
2420
- for ch, hash in hashmap:
2421
- text = text.replace(hash, ch)
2422
- if text == orig_text:
2423
- break
2424
- return text
2425
-
2426
- def _outdent(self, text: str) -> str:
2427
- # Remove one level of line-leading tabs or spaces
2428
- return self._outdent_re.sub('', text)
2429
-
2430
- @staticmethod
2431
- def _uniform_outdent(
2432
- text: str,
2433
- min_outdent: Optional[str] = None,
2434
- max_outdent: Optional[str] = None
2435
- ) -> Tuple[str, str]:
2436
- '''
2437
- Removes the smallest common leading indentation from each (non empty)
2438
- line of `text` and returns said indent along with the outdented text.
2439
-
2440
- Args:
2441
- min_outdent: make sure the smallest common whitespace is at least this size
2442
- max_outdent: the maximum amount a line can be outdented by
2443
- '''
2444
-
2445
- # find the leading whitespace for every line
2446
- whitespace: List[Union[str, None]] = [
2447
- re.findall(r'^[ \t]*', line)[0] if line else None
2448
- for line in text.splitlines()
2449
- ]
2450
- whitespace_not_empty = [i for i in whitespace if i is not None]
2451
-
2452
- # if no whitespace detected (ie: no lines in code block, issue #505)
2453
- if not whitespace_not_empty:
2454
- return '', text
2455
-
2456
- # get minimum common whitespace
2457
- outdent = min(whitespace_not_empty)
2458
- # adjust min common ws to be within bounds
2459
- if min_outdent is not None:
2460
- outdent = min([i for i in whitespace_not_empty if i >= min_outdent] or [min_outdent])
2461
- if max_outdent is not None:
2462
- outdent = min(outdent, max_outdent)
2463
-
2464
- outdented = []
2465
- for line_ws, line in zip(whitespace, text.splitlines(True)):
2466
- if line.startswith(outdent):
2467
- # if line starts with smallest common ws, dedent it
2468
- outdented.append(line.replace(outdent, '', 1))
2469
- elif line_ws is not None and line_ws < outdent:
2470
- # if less indented than min common whitespace then outdent as much as possible
2471
- outdented.append(line.replace(line_ws, '', 1))
2472
- else:
2473
- outdented.append(line)
2474
-
2475
- return outdent, ''.join(outdented)
2476
-
2477
- @staticmethod
2478
- def _uniform_indent(
2479
- text: str,
2480
- indent: str,
2481
- include_empty_lines: bool = False,
2482
- indent_empty_lines: bool = False
2483
- ) -> str:
2484
- '''
2485
- Uniformly indent a block of text by a fixed amount
2486
-
2487
- Args:
2488
- text: the text to indent
2489
- indent: a string containing the indent to apply
2490
- include_empty_lines: don't remove whitespace only lines
2491
- indent_empty_lines: indent whitespace only lines with the rest of the text
2492
- '''
2493
- blocks = []
2494
- for line in text.splitlines(True):
2495
- if line.strip() or indent_empty_lines:
2496
- blocks.append(indent + line)
2497
- elif include_empty_lines:
2498
- blocks.append(line)
2499
- else:
2500
- blocks.append('')
2501
- return ''.join(blocks)
2502
-
2503
- @staticmethod
2504
- def _match_overlaps_substr(text, match: re.Match, substr: str) -> bool:
2505
- '''
2506
- Checks if a regex match overlaps with a substring in the given text.
2507
- '''
2508
- for instance in re.finditer(re.escape(substr), text):
2509
- start, end = instance.span()
2510
- if start <= match.start() <= end:
2511
- return True
2512
- if start <= match.end() <= end:
2513
- return True
2514
- return False
2515
-
2516
-
2517
- class MarkdownWithExtras(Markdown):
2518
- """A markdowner class that enables most extras:
2519
-
2520
- - footnotes
2521
- - fenced-code-blocks (only highlights code if 'pygments' Python module on path)
2522
-
2523
- These are not included:
2524
- - pyshell (specific to Python-related documenting)
2525
- - code-friendly (because it *disables* part of the syntax)
2526
- - link-patterns (because you need to specify some actual
2527
- link-patterns anyway)
2528
- """
2529
- extras = ["footnotes", "fenced-code-blocks"] # type: ignore
2530
-
2531
-
2532
- # ----------------------------------------------------------
2533
- # Extras
2534
- # ----------------------------------------------------------
2535
-
2536
- # Base classes
2537
- # ----------------------------------------------------------
2538
-
2539
- class Extra(ABC):
2540
- _registry: Dict[str, Type['Extra']] = {}
2541
- _exec_order: Dict[Stage, Tuple[List[Type['Extra']], List[Type['Extra']]]] = {}
2542
-
2543
- name: str
2544
- '''
2545
- An identifiable name that users can use to invoke the extra
2546
- in the Markdown class
2547
- '''
2548
- order: Tuple[Collection[Union[Stage, Type['Extra']]], Collection[Union[Stage, Type['Extra']]]]
2549
- '''
2550
- Tuple of two iterables containing the stages/extras this extra will run before and
2551
- after, respectively
2552
- '''
2553
-
2554
- def __init__(self, md: Markdown, options: Optional[dict]):
2555
- '''
2556
- Args:
2557
- md: An instance of `Markdown`
2558
- options: a dict of settings to alter the extra's behaviour
2559
- '''
2560
- self.md = md
2561
- self.options = options if options is not None else {}
2562
-
2563
- @classmethod
2564
- def deregister(cls):
2565
- '''
2566
- Removes the class from the extras registry and unsets its execution order.
2567
- '''
2568
- if cls.name in cls._registry:
2569
- del cls._registry[cls.name]
2570
-
2571
- for exec_order in Extra._exec_order.values():
2572
- # find everywhere this extra is mentioned and remove it
2573
- for section in exec_order:
2574
- while cls in section:
2575
- section.remove(cls)
2576
-
2577
- @classmethod
2578
- def register(cls):
2579
- '''
2580
- Registers the class for use with `Markdown` and calculates its execution order based on
2581
- the `order` class attribute.
2582
- '''
2583
- cls._registry[cls.name] = cls
2584
-
2585
- for index, item in enumerate((*cls.order[0], *cls.order[1])):
2586
- before = index < len(cls.order[0])
2587
- if not isinstance(item, Stage) and issubclass(item, Extra):
2588
- # eg: FencedCodeBlocks
2589
- for exec_orders in Extra._exec_order.values():
2590
- # insert this extra everywhere the other one is mentioned
2591
- for section in exec_orders:
2592
- if item in section:
2593
- to_index = section.index(item)
2594
- if not before:
2595
- to_index += 1
2596
- section.insert(to_index, cls)
2597
- else:
2598
- # eg: Stage.PREPROCESS
2599
- Extra._exec_order.setdefault(item, ([], []))
2600
- if cls in Extra._exec_order[item][0 if before else 1]:
2601
- # extra is already runnig after this stage. Don't duplicate that effort
2602
- continue
2603
- if before:
2604
- Extra._exec_order[item][0].insert(0, cls)
2605
- else:
2606
- Extra._exec_order[item][1].append(cls)
2607
-
2608
- @abstractmethod
2609
- def run(self, text: str) -> str:
2610
- '''
2611
- Run the extra against the given text.
2612
-
2613
- Returns:
2614
- The new text after being modified by the extra
2615
- '''
2616
- ...
2617
-
2618
- def test(self, text: str) -> bool:
2619
- '''
2620
- Check a section of markdown to see if this extra should be run upon it.
2621
- The default implementation will always return True but it's recommended to override
2622
- this behaviour to improve performance.
2623
- '''
2624
- return True
2625
-
2626
-
2627
- class ItalicAndBoldProcessor(Extra):
2628
- '''
2629
- An ABC that provides hooks for dealing with italics and bold syntax.
2630
- This class is set to trigger both before AND after the italics and bold stage.
2631
- This allows any child classes to intercept instances of bold or italic syntax and
2632
- change the output or hash it to prevent it from being processed.
2633
-
2634
- After the I&B stage any hashes in the `hash_tables` instance variable are replaced.
2635
- '''
2636
- name = 'italic-and-bold-processor'
2637
- order = (Stage.ITALIC_AND_BOLD,), (Stage.ITALIC_AND_BOLD,)
2638
-
2639
- strong_re = Markdown._strong_re
2640
- em_re = Markdown._em_re
2641
-
2642
- def __init__(self, md: Markdown, options: dict):
2643
- super().__init__(md, options)
2644
- self.hash_table = {}
2645
-
2646
- def run(self, text):
2647
- if self.md.order < Stage.ITALIC_AND_BOLD:
2648
- text = self.strong_re.sub(self.sub, text)
2649
- text = self.em_re.sub(self.sub, text)
2650
- else:
2651
- # push any hashed values back, using a while loop to deal with recursive hashes
2652
- orig_text = ''
2653
- while orig_text != text:
2654
- orig_text = text
2655
- for key, substr in self.hash_table.items():
2656
- text = text.replace(key, substr)
2657
- return text
2658
-
2659
- @abstractmethod
2660
- def sub(self, match: re.Match) -> str:
2661
- # do nothing. Let `Markdown._do_italics_and_bold` do its thing later
2662
- return match.string[match.start(): match.end()]
2663
-
2664
- def sub_hash(self, match: re.Match) -> str:
2665
- substr = match.string[match.start(): match.end()]
2666
- key = _hash_text(substr)
2667
- self.hash_table[key] = substr
2668
- return key
2669
-
2670
- def test(self, text):
2671
- if self.md.order < Stage.ITALIC_AND_BOLD:
2672
- return '*' in text or '_' in text
2673
- return self.hash_table and re.search(r'md5-[0-9a-z]{32}', text)
2674
-
2675
- # User facing extras
2676
- # ----------------------------------------------------------
2677
-
2678
-
2679
- class Admonitions(Extra):
2680
- '''
2681
- Enable parsing of RST admonitions
2682
- '''
2683
-
2684
- name = 'admonitions'
2685
- order = (Stage.BLOCK_GAMUT, Stage.LINK_DEFS), ()
2686
-
2687
- admonitions = r'admonition|attention|caution|danger|error|hint|important|note|tip|warning'
2688
-
2689
- admonitions_re = re.compile(r'''
2690
- ^(\ *)\.\.\ (%s)::\ * # $1 leading indent, $2 the admonition
2691
- (.*)? # $3 admonition title
2692
- ((?:\s*\n\1\ {3,}.*)+?) # $4 admonition body (required)
2693
- (?=\s*(?:\Z|\n{4,}|\n\1?\ {0,2}\S)) # until EOF, 3 blank lines or something less indented
2694
- ''' % admonitions,
2695
- re.IGNORECASE | re.MULTILINE | re.VERBOSE
2696
- )
2697
-
2698
- def test(self, text):
2699
- return self.admonitions_re.search(text) is not None
2700
-
2701
- def sub(self, match: re.Match) -> str:
2702
- lead_indent, admonition_name, title, body = match.groups()
2703
-
2704
- admonition_type = '<strong>%s</strong>' % admonition_name
2705
-
2706
- # figure out the class names to assign the block
2707
- if admonition_name.lower() == 'admonition':
2708
- admonition_class = 'admonition'
2709
- else:
2710
- admonition_class = 'admonition %s' % admonition_name.lower()
2711
-
2712
- # titles are generally optional
2713
- if title:
2714
- title = '<em>%s</em>' % title
2715
-
2716
- # process the admonition body like regular markdown
2717
- body = self.md._run_block_gamut("\n%s\n" % self.md._uniform_outdent(body)[1])
2718
-
2719
- # indent the body before placing inside the aside block
2720
- admonition = self.md._uniform_indent(
2721
- '%s\n%s\n\n%s\n' % (admonition_type, title, body),
2722
- self.md.tab, False
2723
- )
2724
- # wrap it in an aside
2725
- admonition = '<aside class="%s">\n%s</aside>' % (admonition_class, admonition)
2726
- # now indent the whole admonition back to where it started
2727
- return self.md._uniform_indent(admonition, lead_indent, False)
2728
-
2729
- def run(self, text):
2730
- return self.admonitions_re.sub(self.sub, text)
2731
-
2732
-
2733
- class Alerts(Extra):
2734
- '''
2735
- Markdown Alerts as per
2736
- https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts
2737
- '''
2738
-
2739
- name = 'alerts'
2740
- order = (), (Stage.BLOCK_QUOTES, )
2741
-
2742
- alert_re = re.compile(r'''
2743
- <blockquote>\s*
2744
- <p>
2745
- \[!(?P<type>NOTE|TIP|IMPORTANT|WARNING|CAUTION)\]
2746
- (?P<closing_tag></p>[ \t]*\n?)?
2747
- (?P<contents>[\s\S]+?)
2748
- </blockquote>
2749
- ''', re.X
2750
- )
2751
-
2752
- def test(self, text):
2753
- return "<blockquote>" in text
2754
-
2755
- def sub(self, match: re.Match) -> str:
2756
- typ = match["type"].lower()
2757
- heading = f"<em>{match['type'].title()}</em>"
2758
- contents = match["contents"].strip()
2759
- if match["closing_tag"]:
2760
- return f'<div class="alert {typ}">\n{heading}\n{contents}\n</div>'
2761
- else:
2762
- return f'<div class="alert {typ}">\n{heading}\n<p>{contents}\n</div>'
2763
-
2764
- def run(self, text):
2765
- return self.alert_re.sub(self.sub, text)
2766
-
2767
-
2768
- class _BreaksExtraOpts(TypedDict, total=False):
2769
- '''Options for the `Breaks` extra'''
2770
- on_backslash: bool
2771
- '''Replace backslashes at the end of a line with <br>'''
2772
- on_newline: bool
2773
- '''Replace single new line characters with <br> when True'''
2774
-
2775
-
2776
- class Breaks(Extra):
2777
- name = 'breaks'
2778
- order = (), (Stage.ITALIC_AND_BOLD,)
2779
- options: _BreaksExtraOpts
2780
-
2781
- def run(self, text):
2782
- on_backslash = self.options.get('on_backslash', False)
2783
- on_newline = self.options.get('on_newline', False)
2784
-
2785
- if on_backslash and on_newline:
2786
- pattern = r' *\\?'
2787
- elif on_backslash:
2788
- pattern = r'(?: *\\| {2,})'
2789
- elif on_newline:
2790
- pattern = r' *'
2791
- else:
2792
- pattern = r' {2,}'
2793
-
2794
- break_tag = "<br%s\n" % self.md.empty_element_suffix
2795
- text = re.sub(pattern + r"\n(?!\<(?:\/?(ul|ol|li))\>)", break_tag, text)
2796
-
2797
- return text
2798
-
2799
-
2800
- class CodeFriendly(ItalicAndBoldProcessor):
2801
- '''
2802
- Disable _ and __ for em and strong.
2803
- '''
2804
- name = 'code-friendly'
2805
-
2806
- def sub(self, match: re.Match) -> str:
2807
- syntax = match.group(1)
2808
- text: str = match.string[match.start(): match.end()]
2809
- if '_' in syntax:
2810
- # if using _this_ syntax, hash the whole thing so that it doesn't get processed
2811
- key = _hash_text(text)
2812
- self.hash_table[key] = text
2813
- return key
2814
- elif '_' in text:
2815
- # if the text within the bold/em markers contains '_' then hash those contents to protect them from em_re
2816
- text = text[len(syntax): -len(syntax)]
2817
- key = _hash_text(text)
2818
- self.hash_table[key] = text
2819
- return syntax + key + syntax
2820
- # if no underscores are present, the text is fine and we can just leave it alone
2821
- return super().sub(match)
2822
-
2823
-
2824
- class FencedCodeBlocks(Extra):
2825
- '''
2826
- Allows a code block to not have to be indented
2827
- by fencing it with '```' on a line before and after. Based on
2828
- <http://github.github.com/github-flavored-markdown/> with support for
2829
- syntax highlighting.
2830
- '''
2831
-
2832
- name = 'fenced-code-blocks'
2833
- order = (Stage.LINK_DEFS, Stage.BLOCK_GAMUT), (Stage.PREPROCESS,)
2834
-
2835
- fenced_code_block_re = re.compile(r'''
2836
- (?:\n+|\A\n?|(?<=\n))
2837
- (^[ \t]*`{3,})\s{0,99}?([\w+-]+)?\s{0,99}?\n # $1 = opening fence (captured for back-referencing), $2 = optional lang
2838
- (.*?) # $3 = code block content
2839
- \1[ \t]*\n # closing fence
2840
- ''', re.M | re.X | re.S)
2841
-
2842
- def test(self, text):
2843
- if '```' not in text:
2844
- return False
2845
- if self.md.stage == Stage.PREPROCESS and not self.md.safe_mode:
2846
- return True
2847
- if self.md.stage == Stage.LINK_DEFS and self.md.safe_mode:
2848
- return True
2849
- return self.md.stage == Stage.BLOCK_GAMUT
2850
-
2851
- def _code_block_with_lexer_sub(
2852
- self,
2853
- codeblock: str,
2854
- leading_indent: str,
2855
- lexer
2856
- ) -> str:
2857
- '''
2858
- Args:
2859
- codeblock: the codeblock to format
2860
- leading_indent: the indentation to prefix the block with
2861
- lexer (pygments.Lexer): the lexer to use
2862
- '''
2863
- formatter_opts = self.md.extras['fenced-code-blocks'] or {}
2864
-
2865
- def unhash_code(codeblock):
2866
- for key, sanitized in list(self.md.html_spans.items()):
2867
- codeblock = codeblock.replace(key, sanitized)
2868
- replacements = [
2869
- ("&amp;", "&"),
2870
- ("&lt;", "<"),
2871
- ("&gt;", ">")
2872
- ]
2873
- for old, new in replacements:
2874
- codeblock = codeblock.replace(old, new)
2875
- return codeblock
2876
- # remove leading indent from code block
2877
- _, codeblock = self.md._uniform_outdent(codeblock, max_outdent=leading_indent)
2878
-
2879
- codeblock = unhash_code(codeblock)
2880
- colored = self.md._color_with_pygments(codeblock, lexer,
2881
- **formatter_opts)
2882
-
2883
- # add back the indent to all lines
2884
- return "\n%s\n" % self.md._uniform_indent(colored, leading_indent, True)
2885
-
2886
- def tags(self, lexer_name: str) -> Tuple[str, str]:
2887
- '''
2888
- Returns the tags that the encoded code block will be wrapped in, based
2889
- upon the lexer name.
2890
-
2891
- This function can be overridden by subclasses to piggy-back off of the
2892
- fenced code blocks syntax (see `Mermaid` extra).
2893
-
2894
- Returns:
2895
- The opening and closing tags, as strings within a tuple
2896
- '''
2897
- pre_class = self.md._html_class_str_from_tag('pre')
2898
- if "highlightjs-lang" in self.md.extras and lexer_name:
2899
- code_class = ' class="%s language-%s"' % (lexer_name, lexer_name)
2900
- else:
2901
- code_class = self.md._html_class_str_from_tag('code')
2902
- return ('<pre%s><code%s>' % (pre_class, code_class), '</code></pre>')
2903
-
2904
- def sub(self, match: re.Match) -> str:
2905
- lexer_name = match.group(2)
2906
- codeblock = match.group(3)
2907
- codeblock = codeblock[:-1] # drop one trailing newline
2908
-
2909
- # Use pygments only if not using the highlightjs-lang extra
2910
- if lexer_name and "highlightjs-lang" not in self.md.extras:
2911
- lexer = self.md._get_pygments_lexer(lexer_name)
2912
- if lexer:
2913
- leading_indent = ' '*(len(match.group(1)) - len(match.group(1).lstrip()))
2914
- return self._code_block_with_lexer_sub(codeblock, leading_indent, lexer)
2915
-
2916
- # Fenced code blocks need to be outdented before encoding, and then reapplied
2917
- leading_indent = ' ' * (len(match.group(1)) - len(match.group(1).lstrip()))
2918
- if codeblock:
2919
- # only run the codeblock through the outdenter if not empty
2920
- leading_indent, codeblock = self.md._uniform_outdent(codeblock, max_outdent=leading_indent)
2921
-
2922
- codeblock = self.md._encode_code(codeblock)
2923
-
2924
- tags = self.tags(lexer_name)
2925
-
2926
- return "\n%s%s%s\n%s%s\n" % (leading_indent, tags[0], codeblock, leading_indent, tags[1])
2927
-
2928
- def run(self, text):
2929
- return self.fenced_code_block_re.sub(self.sub, text)
2930
-
2931
-
2932
- class Latex(Extra):
2933
- '''
2934
- Convert $ and $$ to <math> and </math> tags for inline and block math.
2935
- '''
2936
- name = 'latex'
2937
- order = (Stage.CODE_BLOCKS, FencedCodeBlocks), ()
2938
-
2939
- _single_dollar_re = re.compile(r'(?<!\$)\$(?!\$)(.*?)\$')
2940
- _double_dollar_re = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)
2941
-
2942
- # Ways to escape
2943
- _pre_code_block_re = re.compile(r"<pre>(.*?)</pre>", re.DOTALL) # Wraped in <pre>
2944
- _triple_re = re.compile(r'```(.*?)```', re.DOTALL) # Wrapped in a code block ```
2945
- _single_re = re.compile(r'(?<!`)(`)(.*?)(?<!`)\1(?!`)') # Wrapped in a single `
2946
-
2947
- converter = None
2948
- code_blocks = {}
2949
-
2950
- def _convert_single_match(self, match):
2951
- return self.converter.convert(match.group(1))
2952
-
2953
- def _convert_double_match(self, match):
2954
- return self.converter.convert(match.group(1).replace(r"\n", ''), display="block")
2955
-
2956
- def code_placeholder(self, match):
2957
- placeholder = f"<!--CODE_BLOCK_{len(self.code_blocks)}-->"
2958
- self.code_blocks[placeholder] = match.group(0)
2959
- return placeholder
2960
-
2961
- def run(self, text):
2962
- try:
2963
- import latex2mathml.converter
2964
- self.converter = latex2mathml.converter
2965
- except ImportError:
2966
- raise ImportError('The "latex" extra requires the "latex2mathml" package to be installed.')
2967
-
2968
- # Escape by replacing with a code block
2969
- text = self._pre_code_block_re.sub(self.code_placeholder, text)
2970
- text = self._single_re.sub(self.code_placeholder, text)
2971
- text = self._triple_re.sub(self.code_placeholder, text)
2972
-
2973
- text = self._single_dollar_re.sub(self._convert_single_match, text)
2974
- text = self._double_dollar_re.sub(self._convert_double_match, text)
2975
-
2976
- # Convert placeholder tag back to original code
2977
- for placeholder, code_block in self.code_blocks.items():
2978
- text = text.replace(placeholder, code_block)
2979
-
2980
- return text
2981
-
2982
-
2983
- class LinkPatterns(Extra):
2984
- '''
2985
- Auto-link given regex patterns in text (e.g. bug number
2986
- references, revision number references).
2987
- '''
2988
- name = 'link-patterns'
2989
- order = (Stage.LINKS,), ()
2990
- options: _link_patterns
2991
-
2992
- _basic_link_re = re.compile(r'!?\[.*?\]\(.*?\)')
2993
-
2994
- def run(self, text):
2995
- link_from_hash = {}
2996
- for regex, repl in self.options:
2997
- replacements = []
2998
- for match in regex.finditer(text):
2999
- if any(self.md._match_overlaps_substr(text, match, h) for h in link_from_hash):
3000
- continue
3001
-
3002
- if callable(repl):
3003
- href = repl(match)
3004
- else:
3005
- href = match.expand(repl)
3006
- replacements.append((match.span(), href))
3007
- for (start, end), href in reversed(replacements):
3008
-
3009
- # Do not match against links inside brackets.
3010
- if text[start - 1:start] == '[' and text[end:end + 1] == ']':
3011
- continue
3012
-
3013
- # Do not match against links in the standard markdown syntax.
3014
- if text[start - 2:start] == '](' or text[end:end + 2] == '")':
3015
- continue
3016
-
3017
- # Do not match against links which are escaped.
3018
- if text[start - 3:start] == '"""' and text[end:end + 3] == '"""':
3019
- text = text[:start - 3] + text[start:end] + text[end + 3:]
3020
- continue
3021
-
3022
- # search the text for anything that looks like a link
3023
- is_inside_link = False
3024
- for link_re in (self.md._auto_link_re, self._basic_link_re):
3025
- for match in link_re.finditer(text):
3026
- if any((r[0] <= start and end <= r[1]) for r in match.regs):
3027
- # if the link pattern start and end pos is within the bounds of
3028
- # something that looks like a link, then don't process it
3029
- is_inside_link = True
3030
- break
3031
- else:
3032
- continue
3033
- break
3034
-
3035
- if is_inside_link:
3036
- continue
3037
-
3038
- escaped_href = (
3039
- href.replace('"', '&quot;') # b/c of attr quote
3040
- # To avoid markdown <em> and <strong>:
3041
- .replace('*', self.md._escape_table['*'])
3042
- .replace('_', self.md._escape_table['_']))
3043
- link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
3044
- hash = _hash_text(link)
3045
- link_from_hash[hash] = link
3046
- text = text[:start] + hash + text[end:]
3047
- for hash, link in list(link_from_hash.items()):
3048
- text = text.replace(hash, link)
3049
- return text
3050
-
3051
- def test(self, text):
3052
- return True
3053
-
3054
-
3055
- class MarkdownInHTML(Extra):
3056
- '''
3057
- Allow the use of `markdown="1"` in a block HTML tag to
3058
- have markdown processing be done on its contents. Similar to
3059
- <http://michelf.com/projects/php-markdown/extra/#markdown-attr> but with
3060
- some limitations.
3061
- '''
3062
- name = 'markdown-in-html'
3063
- order = (), (Stage.HASH_HTML,)
3064
-
3065
- def run(self, text):
3066
- def callback(block):
3067
- indent, block = self.md._uniform_outdent(block)
3068
- block = self.md._hash_html_block_sub(block)
3069
- block = self.md._uniform_indent(block, indent, include_empty_lines=True, indent_empty_lines=False)
3070
- return block
3071
-
3072
- return self.md._strict_tag_block_sub(text, self.md._block_tags_a, callback, True)
3073
-
3074
- def test(self, text):
3075
- return True
3076
-
3077
-
3078
- class Mermaid(FencedCodeBlocks):
3079
- name = 'mermaid'
3080
- order = (FencedCodeBlocks,), ()
3081
-
3082
- def tags(self, lexer_name):
3083
- if lexer_name == 'mermaid':
3084
- return ('<pre class="mermaid-pre"><div class="mermaid">', '</div></pre>')
3085
- return super().tags(lexer_name)
3086
-
3087
-
3088
- class MiddleWordEm(ItalicAndBoldProcessor):
3089
- '''
3090
- Allows or disallows emphasis syntax in the middle of words,
3091
- defaulting to allow. Disabling this means that `this_text_here` will not be
3092
- converted to `this<em>text</em>here`.
3093
- '''
3094
- name = 'middle-word-em'
3095
- order = (CodeFriendly,), (Stage.ITALIC_AND_BOLD,)
3096
-
3097
- def __init__(self, md: Markdown, options: Union[dict, bool]):
3098
- '''
3099
- Args:
3100
- md: the markdown instance
3101
- options: can be bool for backwards compatibility but will be converted to a dict
3102
- in the constructor. All options are:
3103
- - allowed (bool): whether to allow emphasis in the middle of a word.
3104
- If `options` is a bool it will be placed under this key.
3105
- '''
3106
- if isinstance(options, bool):
3107
- options = {'allowed': options}
3108
- options.setdefault('allowed', True)
3109
- super().__init__(md, options)
3110
-
3111
- self.liberal_em_re = self.em_re
3112
- if not options['allowed']:
3113
- self.em_re = re.compile(r'(?<=\b)%s(?=\b)' % self.liberal_em_re.pattern, self.liberal_em_re.flags)
3114
-
3115
- def run(self, text):
3116
- # run strong and whatnot first
3117
- # this also will process all strict ems
3118
- text = super().run(text)
3119
- if self.md.order < self.md.stage:
3120
- # hash all non-valid ems
3121
- text = self.liberal_em_re.sub(self.sub_hash, text)
3122
- return text
3123
-
3124
- def sub(self, match: re.Match) -> str:
3125
- syntax = match.group(1)
3126
- if len(syntax) != 1:
3127
- # strong syntax
3128
- return super().sub(match)
3129
- return '<em>%s</em>' % match.group(2)
3130
-
3131
-
3132
- class Numbering(Extra):
3133
- '''
3134
- Support of generic counters. Non standard extension to
3135
- allow sequential numbering of figures, tables, equations, exhibits etc.
3136
- '''
3137
-
3138
- name = 'numbering'
3139
- order = (Stage.LINK_DEFS,), ()
3140
-
3141
- def run(self, text):
3142
- # First pass to define all the references
3143
- regex_defns = re.compile(r'''
3144
- \[\#(\w+) # the counter. Open square plus hash plus a word \1
3145
- ([^@]*) # Some optional characters, that aren't an @. \2
3146
- @(\w+) # the id. Should this be normed? \3
3147
- ([^\]]*)\] # The rest of the text up to the terminating ] \4
3148
- ''', re.VERBOSE)
3149
- regex_subs = re.compile(r"\[@(\w+)\s*\]") # [@ref_id]
3150
- counters = {}
3151
- references = {}
3152
- replacements = []
3153
- definition_html = '<figcaption class="{}" id="counter-ref-{}">{}{}{}</figcaption>'
3154
- reference_html = '<a class="{}" href="#counter-ref-{}">{}</a>'
3155
- for match in regex_defns.finditer(text):
3156
- # We must have four match groups otherwise this isn't a numbering reference
3157
- if len(match.groups()) != 4:
3158
- continue
3159
- counter = match.group(1)
3160
- text_before = match.group(2).strip()
3161
- ref_id = match.group(3)
3162
- text_after = match.group(4)
3163
- number = counters.get(counter, 1)
3164
- references[ref_id] = (number, counter)
3165
- replacements.append((match.start(0),
3166
- definition_html.format(counter,
3167
- ref_id,
3168
- text_before,
3169
- number,
3170
- text_after),
3171
- match.end(0)))
3172
- counters[counter] = number + 1
3173
- for repl in reversed(replacements):
3174
- text = text[:repl[0]] + repl[1] + text[repl[2]:]
3175
-
3176
- # Second pass to replace the references with the right
3177
- # value of the counter
3178
- # Fwiw, it's vaguely annoying to have to turn the iterator into
3179
- # a list and then reverse it but I can't think of a better thing to do.
3180
- for match in reversed(list(regex_subs.finditer(text))):
3181
- number, counter = references.get(match.group(1), (None, None))
3182
- if number is not None:
3183
- repl = reference_html.format(counter,
3184
- match.group(1),
3185
- number)
3186
- else:
3187
- repl = reference_html.format(match.group(1),
3188
- 'countererror',
3189
- '?' + match.group(1) + '?')
3190
- if "smarty-pants" in self.md.extras:
3191
- repl = repl.replace('"', self.md._escape_table['"'])
3192
-
3193
- text = text[:match.start()] + repl + text[match.end():]
3194
- return text
3195
-
3196
-
3197
- class PyShell(Extra):
3198
- '''
3199
- Treats unindented Python interactive shell sessions as <code>
3200
- blocks.
3201
- '''
3202
-
3203
- name = 'pyshell'
3204
- order = (), (Stage.LISTS,)
3205
-
3206
- def test(self, text):
3207
- return ">>>" in text
3208
-
3209
- def sub(self, match: re.Match) -> str:
3210
- if "fenced-code-blocks" in self.md.extras:
3211
- dedented = _dedent(match.group(0))
3212
- return self.md.extra_classes['fenced-code-blocks'].run("```pycon\n" + dedented + "```\n")
3213
-
3214
- lines = match.group(0).splitlines(0)
3215
- _dedentlines(lines)
3216
- indent = ' ' * self.md.tab_width
3217
- s = ('\n' # separate from possible cuddled paragraph
3218
- + indent + ('\n'+indent).join(lines)
3219
- + '\n')
3220
- return s
3221
-
3222
- def run(self, text):
3223
- less_than_tab = self.md.tab_width - 1
3224
- _pyshell_block_re = re.compile(r"""
3225
- ^([ ]{0,%d})>>>[ ].*\n # first line
3226
- ^(\1[^\S\n]*\S.*\n)* # any number of subsequent lines with at least one character
3227
- (?=^\1?\n|\Z) # ends with a blank line or end of document
3228
- """ % less_than_tab, re.M | re.X)
3229
-
3230
- return _pyshell_block_re.sub(self.sub, text)
3231
-
3232
-
3233
- class SmartyPants(Extra):
3234
- '''
3235
- Replaces ' and " with curly quotation marks or curly
3236
- apostrophes. Replaces --, ---, ..., and . . . with en dashes, em dashes,
3237
- and ellipses.
3238
- '''
3239
- name = 'smarty-pants'
3240
- order = (), (Stage.SPAN_GAMUT,)
3241
-
3242
- _opening_single_quote_re = re.compile(r"(?<!\S)'(?=\S)")
3243
- _opening_double_quote_re = re.compile(r'(?<!\S)"(?=\S)')
3244
- _closing_single_quote_re = re.compile(r"(?<=\S)'")
3245
- _closing_double_quote_re = re.compile(r'(?<=\S)"(?=(\s|,|;|\.|\?|!|$))')
3246
- # "smarty-pants" extra: Very liberal in interpreting a single prime as an
3247
- # apostrophe; e.g. ignores the fact that "round", "bout", "twer", and
3248
- # "twixt" can be written without an initial apostrophe. This is fine because
3249
- # using scare quotes (single quotation marks) is rare.
3250
- _apostrophe_year_re = re.compile(r"'(\d\d)(?=(\s|,|;|\.|\?|!|$))")
3251
- _contractions = ["tis", "twas", "twer", "neath", "o", "n",
3252
- "round", "bout", "twixt", "nuff", "fraid", "sup"]
3253
-
3254
-
3255
- def contractions(self, text: str) -> str:
3256
- text = self._apostrophe_year_re.sub(r"&#8217;\1", text)
3257
- for c in self._contractions:
3258
- text = text.replace("'%s" % c, "&#8217;%s" % c)
3259
- text = text.replace("'%s" % c.capitalize(),
3260
- "&#8217;%s" % c.capitalize())
3261
- return text
3262
-
3263
- def run(self, text):
3264
- """Fancifies 'single quotes', "double quotes", and apostrophes.
3265
- Converts --, ---, and ... into en dashes, em dashes, and ellipses.
3266
-
3267
- Inspiration is: <http://daringfireball.net/projects/smartypants/>
3268
- See "test/tm-cases/smarty_pants.text" for a full discussion of the
3269
- support here and
3270
- <http://code.google.com/p/python-markdown2/issues/detail?id=42> for a
3271
- discussion of some diversion from the original SmartyPants.
3272
- """
3273
- if "'" in text: # guard for perf
3274
- text = self.contractions(text)
3275
- text = self._opening_single_quote_re.sub("&#8216;", text)
3276
- text = self._closing_single_quote_re.sub("&#8217;", text)
3277
-
3278
- if '"' in text: # guard for perf
3279
- text = self._opening_double_quote_re.sub("&#8220;", text)
3280
- text = self._closing_double_quote_re.sub("&#8221;", text)
3281
-
3282
- text = text.replace("---", "&#8212;")
3283
- text = text.replace("--", "&#8211;")
3284
- text = text.replace("...", "&#8230;")
3285
- text = text.replace(" . . . ", "&#8230;")
3286
- text = text.replace(". . .", "&#8230;")
3287
-
3288
- # TODO: Temporary hack to fix https://github.com/trentm/python-markdown2/issues/150
3289
- if "footnotes" in self.md.extras and "footnote-ref" in text:
3290
- # Quotes in the footnote back ref get converted to "smart" quotes
3291
- # Change them back here to ensure they work.
3292
- text = text.replace('class="footnote-ref&#8221;', 'class="footnote-ref"')
3293
-
3294
- return text
3295
-
3296
- def test(self, text):
3297
- return "'" in text or '"' in text
3298
-
3299
-
3300
- class Strike(Extra):
3301
- '''
3302
- Text inside of double tilde is ~~strikethrough~~
3303
- '''
3304
- name = 'strike'
3305
- order = (Stage.ITALIC_AND_BOLD,), ()
3306
-
3307
- _strike_re = re.compile(r"~~(?=\S)(.+?)(?<=\S)~~", re.S)
3308
-
3309
- def run(self, text):
3310
- return self._strike_re.sub(r"<s>\1</s>", text)
3311
-
3312
- def test(self, text):
3313
- return '~~' in text
3314
-
3315
-
3316
- class Tables(Extra):
3317
- '''
3318
- Tables using the same format as GFM
3319
- <https://help.github.com/articles/github-flavored-markdown#tables> and
3320
- PHP-Markdown Extra <https://michelf.ca/projects/php-markdown/extra/#table>.
3321
- '''
3322
- name = 'tables'
3323
- order = (), (Stage.LISTS,)
3324
-
3325
- def run(self, text):
3326
- """Copying PHP-Markdown and GFM table syntax. Some regex borrowed from
3327
- https://github.com/michelf/php-markdown/blob/lib/Michelf/Markdown.php#L2538
3328
- """
3329
- less_than_tab = self.md.tab_width - 1
3330
- table_re = re.compile(r'''
3331
- (?:(?<=\n)|\A\n?) # leading blank line
3332
-
3333
- ^[ ]{0,%d} # allowed whitespace
3334
- (.*[|].*)[ ]*\n # $1: header row (at least one pipe)
3335
-
3336
- ^[ ]{0,%d} # allowed whitespace
3337
- ( # $2: underline row
3338
- # underline row with leading bar
3339
- (?: \|\ *:?-+:?\ * )+ \|? \s?[ ]*\n
3340
- |
3341
- # or, underline row without leading bar
3342
- (?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \s?[ ]*\n
3343
- )
3344
-
3345
- ( # $3: data rows
3346
- (?:
3347
- ^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
3348
- .*\|.*[ ]*\n
3349
- )+
3350
- )
3351
- ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
3352
- return table_re.sub(self.sub, text)
3353
-
3354
- def sub(self, match: re.Match) -> str:
3355
- trim_space_re = '^[ \t\n]+|[ \t\n]+$'
3356
- trim_bar_re = r'^\||\|$'
3357
- split_bar_re = r'^\||(?<![\`\\])\|'
3358
- escape_bar_re = r'\\\|'
3359
-
3360
- head, underline, body = match.groups()
3361
-
3362
- # Determine aligns for columns.
3363
- cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", underline)))]
3364
- align_from_col_idx = {}
3365
- for col_idx, col in enumerate(cols):
3366
- if col[0] == ':' and col[-1] == ':':
3367
- align_from_col_idx[col_idx] = ' style="text-align:center;"'
3368
- elif col[0] == ':':
3369
- align_from_col_idx[col_idx] = ' style="text-align:left;"'
3370
- elif col[-1] == ':':
3371
- align_from_col_idx[col_idx] = ' style="text-align:right;"'
3372
-
3373
- # thead
3374
- hlines = ['<table%s>' % self.md._html_class_str_from_tag('table'), '<thead%s>' % self.md._html_class_str_from_tag('thead'), '<tr>']
3375
- cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", head)))]
3376
- for col_idx, col in enumerate(cols):
3377
- hlines.append(' <th%s>%s</th>' % (
3378
- align_from_col_idx.get(col_idx, ''),
3379
- self.md._run_span_gamut(col)
3380
- ))
3381
- hlines.append('</tr>')
3382
- hlines.append('</thead>')
3383
-
3384
- # tbody
3385
- hlines.append('<tbody>')
3386
- for line in body.strip('\n').split('\n'):
3387
- hlines.append('<tr>')
3388
- cols = [re.sub(escape_bar_re, '|', cell.strip()) for cell in re.split(split_bar_re, re.sub(trim_bar_re, "", re.sub(trim_space_re, "", line)))]
3389
- for col_idx, col in enumerate(cols):
3390
- hlines.append(' <td%s>%s</td>' % (
3391
- align_from_col_idx.get(col_idx, ''),
3392
- self.md._run_span_gamut(col)
3393
- ))
3394
- hlines.append('</tr>')
3395
- hlines.append('</tbody>')
3396
- hlines.append('</table>')
3397
-
3398
- return '\n'.join(hlines) + '\n'
3399
-
3400
-
3401
- class TelegramSpoiler(Extra):
3402
- name = 'tg-spoiler'
3403
- order = (), (Stage.ITALIC_AND_BOLD,)
3404
-
3405
- _tg_spoiler_re = re.compile(r"\|\|\s?(.+?)\s?\|\|", re.S)
3406
-
3407
- def run(self, text):
3408
- return self._tg_spoiler_re.sub(r"<tg-spoiler>\1</tg-spoiler>", text)
3409
-
3410
- def test(self, text):
3411
- return '||' in text
3412
-
3413
-
3414
- class Underline(Extra):
3415
- '''
3416
- Text inside of double dash is --underlined--.
3417
- '''
3418
- name = 'underline'
3419
- order = (Stage.ITALIC_AND_BOLD,), ()
3420
-
3421
- _underline_re = re.compile(r"(?<!<!)--(?!>)(?=\S)(.+?)(?<=\S)(?<!<!)--(?!>)", re.S)
3422
-
3423
- def run(self, text):
3424
- return self._underline_re.sub(r"<u>\1</u>", text)
3425
-
3426
- def test(self, text):
3427
- return '--' in text
3428
-
3429
-
3430
class _WavedromExtraOpts(TypedDict, total=False):
    '''Options for the `Wavedrom` extra'''
    # total=False: every key below may be omitted by the caller.
    prefer_embed_svg: bool
    '''
    Use the `wavedrom` library to convert diagrams to SVGs and embed them directly.
    This will only work if the `wavedrom` library has been installed.

    Defaults to `True`
    '''
3439
-
3440
-
3441
class Wavedrom(Extra):
    '''
    Support for generating Wavedrom digital timing diagrams
    '''
    name = 'wavedrom'
    order = (Stage.CODE_BLOCKS, FencedCodeBlocks), ()
    options: _WavedromExtraOpts

    def test(self, text):
        # Cheap pre-filter on the first fenced code block's info string.
        # NOTE(review): `match is None or ...` returns True when there is NO
        # fenced block at all — presumably a permissive pre-check so run()
        # gets a chance either way; confirm this is intended rather than
        # `match is not None and ...`.
        match = FencedCodeBlocks.fenced_code_block_re.search(text)
        return match is None or match.group(2) == 'wavedrom'

    def sub(self, match: re.Match) -> str:
        # dedent the block for processing; remember the indent so we can
        # restore it on the rendered output below
        lead_indent, waves = self.md._uniform_outdent(match.group(3))
        # default tags to wrap the wavedrom block in (client-side rendering)
        open_tag, close_tag = '<script type="WaveDrom">\n', '</script>'

        # check if the user would prefer to have the SVG embedded directly
        embed_svg = self.options.get('prefer_embed_svg', True)

        if embed_svg:
            try:
                import wavedrom
                waves = wavedrom.render(waves).tostring()
                open_tag, close_tag = '<div>', '\n</div>'
            except ImportError:
                # Library not installed: keep the <script> form and let
                # client-side WaveDrom render the diagram.
                pass

        # hash SVG to prevent <> chars being messed with by later stages;
        # the hash is substituted back from _escape_table at unescape time
        self.md._escape_table[waves] = _hash_text(waves)

        return self.md._uniform_indent(
            '\n%s%s%s\n' % (open_tag, self.md._escape_table[waves], close_tag),
            lead_indent, include_empty_lines=True
        )

    def run(self, text):
        # Replace each fenced code block with its rendered/escaped form.
        return FencedCodeBlocks.fenced_code_block_re.sub(self.sub, text)
3480
-
3481
-
3482
class WikiTables(Extra):
    '''
    Google Code Wiki-style tables. See
    <http://code.google.com/p/support/wiki/WikiSyntax#Tables>.
    '''
    name = 'wiki-tables'
    order = (Tables,), ()

    def run(self, text):
        less_than_tab = self.md.tab_width - 1
        wiki_table_re = re.compile(r'''
            (?:(?<=\n\n)|\A\n?)            # leading blank line
            ^([ ]{0,%d})\|\|.+?\|\|[ ]*\n  # first line
            (^\1\|\|.+?\|\|\n)*        # any number of subsequent lines
            ''' % less_than_tab, re.M | re.X)
        return wiki_table_re.sub(self.sub, text)

    def sub(self, match: re.Match) -> str:
        """Render one matched wiki table to HTML."""
        ttext = match.group(0).strip()
        rows = []
        for line in ttext.splitlines(0):
            # Drop the outer '||' pair, then split on unescaped '||'.
            line = line.strip()[2:-2].strip()
            row = [c.strip() for c in re.split(r'(?<!\\)\|\|', line)]
            rows.append(row)

        hlines = []

        def add_hline(line, indents=0):
            hlines.append((self.md.tab * indents) + line)

        def format_cell(text):
            # BUG FIX: this previously referenced the enclosing loop variable
            # `cell` instead of its own `text` parameter; it only worked by
            # accident because every call site happened to pass `cell`.
            return self.md._run_span_gamut(re.sub(r"^\s*~", "", text).strip(" "))

        add_hline('<table%s>' % self.md._html_class_str_from_tag('table'))
        # Check if first cell of first row is a header cell. If so, assume the whole row is a header row.
        if rows and rows[0] and re.match(r"^\s*~", rows[0][0]):
            add_hline('<thead%s>' % self.md._html_class_str_from_tag('thead'), 1)
            add_hline('<tr>', 2)
            for cell in rows[0]:
                add_hline("<th>{}</th>".format(format_cell(cell)), 3)
            add_hline('</tr>', 2)
            add_hline('</thead>', 1)
            # Only one header row allowed.
            rows = rows[1:]
        # If no more rows, don't create a tbody.
        if rows:
            add_hline('<tbody>', 1)
            for row in rows:
                add_hline('<tr>', 2)
                for cell in row:
                    add_hline('<td>{}</td>'.format(format_cell(cell)), 3)
                add_hline('</tr>', 2)
            add_hline('</tbody>', 1)
        add_hline('</table>')
        return '\n'.join(hlines) + '\n'

    def test(self, text):
        # Cheap pre-filter: wiki tables always contain '||'.
        return '||' in text
3540
-
3541
-
3542
# Register extras (registration order matches the original call sequence).
for _extra_cls in (
    Admonitions,
    Alerts,
    Breaks,
    CodeFriendly,
    FencedCodeBlocks,
    Latex,
    LinkPatterns,
    MarkdownInHTML,
    MiddleWordEm,
    Mermaid,
    Numbering,
    PyShell,
    SmartyPants,
    Strike,
    Tables,
    TelegramSpoiler,
    Underline,
    Wavedrom,
    WikiTables,
):
    _extra_cls.register()
3562
-
3563
-
3564
- # ----------------------------------------------------------
3565
-
3566
-
3567
- # ---- internal support functions
3568
-
3569
-
3570
def calculate_toc_html(toc: Union[List[Tuple[int, str, str]], None]) -> Optional[str]:
    """Return a nested <ul>/<li> HTML rendering of a table of contents.

    Args:
        toc: list of ``(level, id, name)`` tuples as collected by the "toc"
            extra — header level (1-6), anchor id, and header text — or None.
            (Docstring fixed: the old one referred to a `_toc` attribute on
            "this instance", left over from when this was a method.)

    Returns:
        The TOC HTML string terminated by a newline, or None if `toc` is None.
    """
    if toc is None:
        return None

    def indent():
        # One indent step per currently-open <ul> level.
        return ' ' * (len(h_stack) - 1)
    lines = []
    h_stack = [0]  # stack of header-level numbers
    for level, id, name in toc:
        if level > h_stack[-1]:
            # Deeper header: open a new nested list.
            lines.append("%s<ul>" % indent())
            h_stack.append(level)
        elif level == h_stack[-1]:
            # Sibling header: close the previous item.
            lines[-1] += "</li>"
        else:
            # Shallower header: close every deeper list first.
            while level < h_stack[-1]:
                h_stack.pop()
                if not lines[-1].endswith("</li>"):
                    lines[-1] += "</li>"
                lines.append("%s</ul></li>" % indent())
        lines.append('%s<li><a href="#%s">%s</a>' % (
            indent(), id, name))
    # Close any lists still open at the end of the TOC.
    while len(h_stack) > 1:
        h_stack.pop()
        if not lines[-1].endswith("</li>"):
            lines[-1] += "</li>"
        lines.append("%s</ul>" % indent())
    return '\n'.join(lines) + '\n'
3602
-
3603
-
3604
class UnicodeWithAttrs(str):
    """A subclass of unicode used for the return value of conversion to
    possibly attach some attributes. E.g. the "toc_html" attribute when
    the "toc" extra is used.
    """
    # Parsed front-matter mapping; set when the "metadata" extra is used.
    metadata: Optional[Dict[str, str]] = None
    # Rendered table-of-contents HTML; set when the "toc" extra is used.
    toc_html: Optional[str] = None
3611
-
3612
- ## {{{ http://code.activestate.com/recipes/577257/ (r1)
3613
- _slugify_strip_re = re.compile(r'[^\w\s-]')
3614
- _slugify_hyphenate_re = re.compile(r'[-\s]+')
3615
- def _slugify(value: str) -> str:
3616
- """
3617
- Normalizes string, converts to lowercase, removes non-alpha characters,
3618
- and converts spaces to hyphens.
3619
-
3620
- From Django's "django/template/defaultfilters.py".
3621
- """
3622
- import unicodedata
3623
- value = unicodedata.normalize('NFKD', value).encode('utf-8', 'ignore').decode()
3624
- value = _slugify_strip_re.sub('', value).strip().lower()
3625
- return _slugify_hyphenate_re.sub('-', value)
3626
- ## end of http://code.activestate.com/recipes/577257/ }}}
3627
-
3628
-
3629
- # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
3630
- def _curry(function: Callable, *args, **kwargs) -> Callable:
3631
- def result(*rest, **kwrest):
3632
- combined = kwargs.copy()
3633
- combined.update(kwrest)
3634
- return function(*args + rest, **combined)
3635
- return result
3636
-
3637
-
3638
- # Recipe: regex_from_encoded_pattern (1.0)
3639
- def _regex_from_encoded_pattern(s: str) -> re.Pattern:
3640
- """'foo' -> re.compile(re.escape('foo'))
3641
- '/foo/' -> re.compile('foo')
3642
- '/foo/i' -> re.compile('foo', re.I)
3643
- """
3644
- if s.startswith('/') and s.rfind('/') != 0:
3645
- # Parse it: /PATTERN/FLAGS
3646
- idx = s.rfind('/')
3647
- _, flags_str = s[1:idx], s[idx+1:]
3648
- flag_from_char = {
3649
- "i": re.IGNORECASE,
3650
- "l": re.LOCALE,
3651
- "s": re.DOTALL,
3652
- "m": re.MULTILINE,
3653
- "u": re.UNICODE,
3654
- }
3655
- flags = 0
3656
- for char in flags_str:
3657
- try:
3658
- flags |= flag_from_char[char]
3659
- except KeyError:
3660
- raise ValueError("unsupported regex flag: '%s' in '%s' "
3661
- "(must be one of '%s')"
3662
- % (char, s, ''.join(list(flag_from_char.keys()))))
3663
- return re.compile(s[1:idx], flags)
3664
- else: # not an encoded regex
3665
- return re.compile(re.escape(s))
3666
-
3667
-
3668
# Recipe: dedent (0.1.2)
def _dedentlines(lines: List[str], tabsize: int = 8, skip_first_line: bool = False) -> List[str]:
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines

    "lines" is a list of lines to dedent.
    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" is a boolean indicating if the first line should
    be skipped for calculating the indent width and for dedenting.
    This is sometimes useful for docstrings and similar.

    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG:
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    # Pass 1: find the smallest indent width ("margin") over all non-blank
    # lines, counting tabs as advancing to the next tab stop.
    margin = None
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line:
            continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                # Tab advances to the next multiple of tabsize.
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue  # skip all-whitespace lines
            else:
                break
        else:
            continue  # skip all-whitespace lines
        if DEBUG:
            print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG:
        print("dedent: margin=%r" % margin)

    # Pass 2: strip `margin` columns of leading whitespace from each line,
    # expanding tabs as needed (a tab crossing the margin is replaced by
    # the leftover spaces it covered past the margin).
    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line:
                continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    # All-whitespace line: keep just the line ending.
                    if DEBUG:
                        print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    # A tab overshot the margin: pad back the difference.
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines
3742
-
3743
-
3744
def _dedent(text: str, tabsize: int = 8, skip_first_line: bool = False) -> str:
    """Dedent `text` without expanding tabs to spaces.

    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" indicates whether the first line should be skipped
    for calculating the indent width and for dedenting (useful for
    docstrings and similar).

    Equivalent to textwrap.dedent(s), except tabs are preserved.
    """
    # Keep line endings so the join below reassembles the text exactly.
    split = text.splitlines(True)
    dedented = _dedentlines(split, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(dedented)
3758
-
3759
-
3760
- class _memoized(object):
3761
- """Decorator that caches a function's return value each time it is called.
3762
- If called later with the same arguments, the cached value is returned, and
3763
- not re-evaluated.
3764
-
3765
- http://wiki.python.org/moin/PythonDecoratorLibrary
3766
- """
3767
- def __init__(self, func):
3768
- self.func = func
3769
- self.cache = {}
3770
-
3771
- def __call__(self, *args):
3772
- try:
3773
- return self.cache[args]
3774
- except KeyError:
3775
- self.cache[args] = value = self.func(*args)
3776
- return value
3777
- except TypeError:
3778
- # uncachable -- for instance, passing a list as an argument.
3779
- # Better to not cache than to blow up entirely.
3780
- return self.func(*args)
3781
-
3782
- def __repr__(self):
3783
- """Return the function's docstring."""
3784
- return self.func.__doc__
3785
-
3786
-
3787
def _xml_oneliner_re_from_tab_width(tab_width: int) -> re.Pattern:
    """Standalone XML processing instruction regex."""
    # The indent allowance is tab_width - 1 (indenting a full tab width
    # would make it a code block instead).
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                           # save in $1
            [ ]{0,%d}
            (?:
                <\?\w+\b\s+.*?\?>   # XML processing instruction
                |
                <\w+:\w+\b\s+.*?/>  # namespaced single tag
            )
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Memoized: the compiled pattern depends only on `tab_width`.
_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
3807
-
3808
-
3809
def _hr_tag_re_from_tab_width(tab_width: int) -> re.Pattern:
    """Regex matching a standalone <hr> tag for the given tab width."""
    # As above, [ ]{0,tab_width-1} keeps full-tab-indented tags out
    # (those would be code blocks).
    return re.compile(r"""
        (?:
            (?<=\n\n)       # Starting after a blank line
            |               # or
            \A\n?           # the beginning of the doc
        )
        (                       # save in \1
            [ ]{0,%d}
            <(hr)               # start tag = \2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
        """ % (tab_width - 1), re.X)
# Memoized: the compiled pattern depends only on `tab_width`.
_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
3827
-
3828
-
3829
def _xml_escape_attr(attr: str, skip_single_quote: bool = True) -> str:
    """Escape the given string for use in an HTML/XML tag attribute.

    By default this doesn't bother with escaping `'` to `&#39;`, presuming that
    the tag attribute is surrounded by double quotes.
    """
    # BUG FIX: the previous version computed the ampersand escaping and then
    # discarded it by chaining the remaining replacements off of `attr`
    # instead of `escaped`, so '&' was never escaped. Chain off `escaped`.
    escaped = _AMPERSAND_RE.sub('&amp;', attr)

    escaped = (escaped
        .replace('"', '&quot;')
        .replace('<', '&lt;')
        .replace('>', '&gt;'))
    if not skip_single_quote:
        escaped = escaped.replace("'", "&#39;")
    return escaped
3844
-
3845
-
3846
- def _xml_encode_email_char_at_random(ch: str) -> str:
3847
- r = random()
3848
- # Roughly 10% raw, 45% hex, 45% dec.
3849
- # '@' *must* be encoded. I [John Gruber] insist.
3850
- # Issue 26: '_' must be encoded.
3851
- if r > 0.9 and ch not in "@_":
3852
- return ch
3853
- elif r < 0.45:
3854
- # The [1:] is to drop leading '0': 0x63 -> x63
3855
- return '&#%s;' % hex(ord(ch))[1:]
3856
- else:
3857
- return '&#%s;' % ord(ch)
3858
-
3859
-
3860
def _html_escape_url(
    attr: str,
    safe_mode: Union[_safe_mode, bool, None] = False,
    charset: Optional[str] = None
):
    """
    Replace special characters that are potentially malicious in url string.

    Args:
        charset: don't escape characters from this charset. Currently the only
            exception is for '+' when charset=='base64'
    """
    # Base replacements, applied in order; safe mode adds two more.
    replacements = [('"', '&quot;'), ('<', '&lt;'), ('>', '&gt;')]
    if safe_mode:
        if charset != 'base64':
            replacements.append(('+', ' '))
        replacements.append(("'", "&#39;"))
    escaped = attr
    for old, new in replacements:
        escaped = escaped.replace(old, new)
    return escaped
3881
-
3882
-
3883
- # ---- mainline
3884
-
3885
- class _NoReflowFormatter(argparse.RawDescriptionHelpFormatter):
3886
- """An argparse formatter that does NOT reflow the description."""
3887
- def format_description(self, description):
3888
- return description or ""
3889
-
3890
-
3891
def _test():
    # Run this module's doctests (invoked by the --self-test CLI flag).
    import doctest
    doctest.testmod()
3894
-
3895
-
3896
def main(argv=None):
    """Command-line entry point: convert Markdown paths (or stdin) to HTML.

    NOTE(review): `argv` is normalized below but never passed to
    `parser.parse_args()`, which therefore always reads `sys.argv` — confirm
    whether callers are expected to be able to supply a custom argv.
    """
    if argv is None:
        argv = sys.argv
    if not logging.root.handlers:
        logging.basicConfig()

    parser = argparse.ArgumentParser(
        prog="markdown2", description=cmdln_desc, usage='%(prog)s [PATHS...]',
        formatter_class=_NoReflowFormatter
    )
    parser.add_argument('--version', action='version',
                        version='%(prog)s {version}'.format(version=__version__))
    parser.add_argument('paths', nargs='*',
                        help=(
                            'optional list of files to convert.'
                            'If none are given, stdin will be used'
                        ))
    parser.add_argument("-v", "--verbose", dest="log_level",
                        action="store_const", const=logging.DEBUG,
                        help="more verbose output")
    parser.add_argument("--encoding",
                        help="specify encoding of text content")
    parser.add_argument("--html4tags", action="store_true", default=False,
                        help="use HTML 4 style for empty element tags")
    parser.add_argument("-s", "--safe", metavar="MODE", dest="safe_mode",
                        help="sanitize literal HTML: 'escape' escapes "
                             "HTML meta chars, 'replace' replaces with an "
                             "[HTML_REMOVED] note")
    parser.add_argument("-x", "--extras", action="append",
                        help="Turn on specific extra features (not part of "
                             "the core Markdown spec). See above.")
    parser.add_argument("--use-file-vars",
                        help="Look for and use Emacs-style 'markdown-extras' "
                             "file var to turn on extras. See "
                             "<https://github.com/trentm/python-markdown2/wiki/Extras>")
    parser.add_argument("--link-patterns-file",
                        help="path to a link pattern file")
    parser.add_argument("--self-test", action="store_true",
                        help="run internal self-tests (some doctests)")
    parser.add_argument("--compare", action="store_true",
                        help="run against Markdown.pl as well (for testing)")
    parser.set_defaults(log_level=logging.INFO, compare=False,
                        encoding="utf-8", safe_mode=None, use_file_vars=False)
    opts = parser.parse_args()
    paths = opts.paths
    log.setLevel(opts.log_level)

    if opts.self_test:
        return _test()

    # Parse "-x name=value" style extras into a {name: arg} dict; names may
    # be separated by commas, semicolons, colons, or spaces.
    if opts.extras:
        extras = {}
        for s in opts.extras:
            splitter = re.compile("[,;: ]+")
            for e in splitter.split(s):
                if '=' in e:
                    ename, earg = e.split('=', 1)
                    try:
                        earg = int(earg)
                    except ValueError:
                        pass
                else:
                    ename, earg = e, None
                extras[ename] = earg
    else:
        extras = None

    # Load "PATTERN HREF" pairs from the link-patterns file, skipping blank
    # lines and '#' comments.
    if opts.link_patterns_file:
        link_patterns = []
        f = open(opts.link_patterns_file)
        try:
            for i, line in enumerate(f.readlines()):
                if not line.strip():
                    continue
                if line.lstrip().startswith("#"):
                    continue
                try:
                    pat, href = line.rstrip().rsplit(None, 1)
                except ValueError:
                    raise MarkdownError("%s:%d: invalid link pattern line: %r"
                                        % (opts.link_patterns_file, i+1, line))
                link_patterns.append(
                    (_regex_from_encoded_pattern(pat), href))
        finally:
            f.close()
    else:
        link_patterns = None

    from os.path import abspath, dirname, exists, join
    markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
                       "Markdown.pl")
    if not paths:
        paths = ['-']  # no paths given: read from stdin
    for path in paths:
        if path == '-':
            text = sys.stdin.read()
        else:
            fp = codecs.open(path, 'r', opts.encoding)
            text = fp.read()
            fp.close()
        if opts.compare:
            # Feed the same input through the reference Perl implementation.
            from subprocess import PIPE, Popen
            print("==== Markdown.pl ====")
            p = Popen('perl %s' % markdown_pl, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
            p.stdin.write(text.encode('utf-8'))
            p.stdin.close()
            perl_html = p.stdout.read().decode('utf-8')
            sys.stdout.write(perl_html)
            print("==== markdown2.py ====")
        html = markdown(text,
                        html4tags=opts.html4tags,
                        safe_mode=opts.safe_mode,
                        extras=extras, link_patterns=link_patterns,
                        use_file_vars=opts.use_file_vars,
                        cli=True)
        sys.stdout.write(html)
        if extras and "toc" in extras:
            log.debug("toc_html: " +
                      str(html.toc_html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')))
        if opts.compare:
            # Normalize both outputs (if the test helper is available) and
            # report whether they match.
            test_dir = join(dirname(dirname(abspath(__file__))), "test")
            if exists(join(test_dir, "test_markdown2.py")):
                sys.path.insert(0, test_dir)
                from test_markdown2 import norm_html_from_html
                norm_html = norm_html_from_html(html)
                norm_perl_html = norm_html_from_html(perl_html)
            else:
                norm_html = html
                norm_perl_html = perl_html
            print("==== match? %r ====" % (norm_perl_html == norm_html))
4027
-
4028
- if __name__ == "__main__":
4029
- sys.exit(main(sys.argv))