markdown_convert 1.2.12__py3-none-any.whl → 1.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,338 @@
1
+ """
2
+ Module to convert a markdown file to a pdf file.
3
+ Author: @julynx
4
+ """
5
+
6
+ import os
7
+ import secrets
8
+ import time
9
+ from datetime import datetime
10
+ from pathlib import Path
11
+
12
+ import markdown2
13
+ from playwright.sync_api import sync_playwright
14
+
15
+ from .constants import MARKDOWN_EXTENSIONS
16
+ from .resources import get_code_css_path, get_css_path, get_output_path
17
+ from .transform import (
18
+ create_sections,
19
+ render_mermaid_diagrams,
20
+ create_html_document,
21
+ render_checkboxes,
22
+ )
23
+ from .utils import drop_duplicates
24
+
25
+
26
def _generate_pdf_with_playwright(
    html_content,
    output_path,
    *,
    css_content=None,
    base_dir=None,
    dump_html=False,
    nonce=None,
):
    """
    Generate a PDF from HTML content using Playwright.

    Args:
        html_content (str): HTML content to convert.
        output_path (str): Path to save the PDF file. When falsy, the PDF is
            returned as bytes instead of being reported as written.
        css_content (str, optional): CSS content to inject. When falsy, the
            HTML is loaded as given, without the CSS/CSP wrapper document.
        base_dir (Path, optional): Base directory for resolving relative paths
            in HTML. When given, the HTML is written to a temporary file in
            this directory and loaded from there.
        dump_html (bool, optional): Whether to keep the temporary HTML file
            (only created when base_dir is given) instead of deleting it.
        nonce (str, optional): CSP nonce carried by the script tags that are
            allowed to run. When omitted, a random one is generated so the
            policy never contains a predictable value.

    Returns:
        bytes | None: The PDF bytes when output_path is falsy, otherwise None.
    """
    # A missing nonce must not end up in the policy as the guessable literal
    # "nonce-None": fall back to a random value that no script tag can match.
    if nonce is None:
        nonce = secrets.token_urlsafe(16)

    # Content Security Policy using nonce to whitelist only the Mermaid
    # initialization script. This prevents arbitrary JavaScript injection
    # while allowing Mermaid to work.
    csp = (
        "default-src 'none'; "
        f"script-src 'nonce-{nonce}' https://cdn.jsdelivr.net; "
        f"script-src-elem 'nonce-{nonce}' https://cdn.jsdelivr.net; "
        "style-src 'unsafe-inline'; "
        "img-src data: https: file:; "
        "font-src data: https:; "
        "connect-src https://cdn.jsdelivr.net;"
    )

    # Wrap HTML content with CSP and CSS
    if css_content:
        full_html = create_html_document(html_content, css_content, csp)
    else:
        full_html = html_content

    temp_html = None
    try:
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(
                headless=True,
                args=[
                    "--disable-dev-shm-usage",
                    "--disable-extensions",
                    "--disable-plugins",
                    "--disable-gpu",
                    "--no-first-run",
                    "--no-default-browser-check",
                ],
            )
            try:
                context = browser.new_context(
                    java_script_enabled=True,
                    permissions=[],
                    geolocation=None,
                    accept_downloads=False,
                )
                page = context.new_page()

                # Handle loading based on presence of base_dir
                if base_dir:
                    temp_html = base_dir / f".temp_{os.getpid()}.html"
                    temp_html.write_text(full_html, encoding="utf-8")
                    page.goto(
                        temp_html.as_uri(), wait_until="networkidle", timeout=30000
                    )
                else:
                    page.set_content(
                        full_html, wait_until="networkidle", timeout=30000
                    )

                pdf_params = {
                    "format": "A4",
                    "print_background": True,
                    "margin": {
                        "top": "20mm",
                        "bottom": "20mm",
                        "left": "20mm",
                        "right": "20mm",
                    },
                    "path": output_path,
                }  # Playwright ignores None paths

                pdf_bytes = page.pdf(**pdf_params)
                return None if output_path else pdf_bytes
            finally:
                browser.close()
    finally:
        # Runs even if closing the browser failed, so the temporary file is
        # never left behind unless the caller asked to keep it.
        if temp_html and temp_html.exists() and not dump_html:
            temp_html.unlink()
114
+
115
+
116
+ def _get_css_content(css_sources):
117
+ """
118
+ Get the CSS content from a list of CSS file paths.
119
+
120
+ Args:
121
+ css_sources (list): List of CSS file paths.
122
+ Returns:
123
+ str: Combined CSS content.
124
+ """
125
+ css_buffer = ""
126
+ for css_file in css_sources:
127
+ css_buffer += Path(css_file).read_text(encoding="utf-8") + "\n"
128
+ return css_buffer
129
+
130
+
131
def convert(
    markdown_path,
    css_path=None,
    output_path=None,
    *,
    extend_default_css=True,
    dump_html=False,
):
    """
    Render a markdown file as a PDF document.

    Args:
        markdown_path (str): Path to the markdown file.
        css_path (str=None): Path to the CSS file. Falls back to the default
            stylesheet when omitted.
        output_path (str=None): Path to the output file. Derived from the
            markdown path when omitted.
        extend_default_css (bool=True): Apply the default stylesheet before
            the custom one.
        dump_html (bool=False): Dump the intermediate HTML to a file.

    Raises:
        RuntimeError: Wraps any exception raised while rendering the markdown
            or generating the PDF.
    """
    css_path = get_css_path() if css_path is None else css_path
    output_path = (
        get_output_path(markdown_path, None) if output_path is None else output_path
    )

    # The code stylesheet always comes first and the custom one last; the
    # default stylesheet sits in between only when it is being extended.
    stylesheets = [get_code_css_path()]
    if extend_default_css:
        stylesheets.append(get_css_path())
    stylesheets.append(css_path)
    stylesheets = drop_duplicates(stylesheets)

    try:
        nonce = secrets.token_urlsafe(16)
        document = markdown2.markdown_path(markdown_path, extras=MARKDOWN_EXTENSIONS)
        document = create_sections(document)
        document = render_mermaid_diagrams(document, nonce=nonce)
        document = render_checkboxes(document)

        combined_css = _get_css_content(stylesheets)
        markdown_dir = Path(markdown_path).resolve().parent

        _generate_pdf_with_playwright(
            document,
            output_path,
            css_content=combined_css,
            base_dir=markdown_dir,
            dump_html=dump_html,
            nonce=nonce,
        )
    except Exception as error:
        raise RuntimeError(error) from error
180
+
181
+
182
def live_convert(
    markdown_path, css_path=None, output_path=None, *, extend_default_css=True
):
    """
    Render a markdown file as a PDF and keep re-rendering it on changes.

    Args:
        markdown_path (str): Path to the markdown file.
        css_path (str=None): Path to the CSS file. Falls back to the default
            stylesheet when omitted.
        output_path (str=None): Path to the output file. Derived from the
            markdown path when omitted.
        extend_default_css (bool=True): Extend the default CSS file.
    """
    resolved_css = get_css_path() if css_path is None else css_path
    resolved_output = (
        get_output_path(markdown_path, None) if output_path is None else output_path
    )

    # Progress messages are always printed in live mode (loud=True).
    LiveConverter(
        markdown_path,
        resolved_css,
        resolved_output,
        extend_default_css=extend_default_css,
        loud=True,
    ).observe()
208
+
209
+
210
def convert_text(markdown_text, css_text=None, *, extend_default_css=True):
    """
    Convert markdown text to a pdf file.

    Args:
        markdown_text (str): Markdown text.
        css_text (str=None): CSS text. Falls back to the default stylesheet
            when omitted.
        extend_default_css (bool=True): Extend the default CSS file.

    Returns:
        PDF file as bytes.

    Raises:
        RuntimeError: Wraps any exception raised while rendering the markdown
            or generating the PDF.
    """
    default_css = Path(get_css_path()).read_text(encoding="utf-8")
    code_css = Path(get_code_css_path()).read_text(encoding="utf-8")

    if css_text is None:
        css_text = default_css

    if extend_default_css:
        css_sources = [code_css, default_css, css_text]
    else:
        css_sources = [code_css, css_text]

    # Drop repeated sheets (e.g. the default CSS listed twice when no custom
    # CSS is supplied), keeping the first occurrence of each.
    css_sources = list(dict.fromkeys(css_sources))

    # These entries are already stylesheet *contents*, not file paths, so they
    # are concatenated directly rather than being read from disk.
    css_content = "".join(f"{sheet}\n" for sheet in css_sources)

    try:
        nonce = secrets.token_urlsafe(16)
        html = markdown2.markdown(markdown_text, extras=MARKDOWN_EXTENSIONS)
        html = create_sections(html)
        html = render_mermaid_diagrams(html, nonce=nonce)
        html = render_checkboxes(html)

        return _generate_pdf_with_playwright(
            html,
            None,
            css_content=css_content,
            nonce=nonce,
        )

    except Exception as exc:
        raise RuntimeError(exc) from exc
249
+
250
+
251
class LiveConverter:
    """
    Class to convert a markdown file to a pdf file and watch for changes.
    """

    def __init__(
        self,
        markdown_path,
        css_path,
        output_path,
        *,
        extend_default_css=True,
        loud=False,
    ):
        """
        Initialize the LiveConverter class.

        Args:
            markdown_path (str): Path to the markdown file.
            css_path (str): Path to the CSS file.
            output_path (str): Path to the output file.
            extend_default_css (bool): Extend the default CSS file.
            loud (bool): Print a message after every conversion and when the
                watch loop is interrupted.
        """
        self.md_path = Path(markdown_path).absolute()
        self.css_path = Path(css_path).absolute()
        self.output_path = output_path
        self.extend_default_css = extend_default_css
        self.loud = loud

        # Modification times seen at the last conversion; set by observe().
        self.md_last_modified = None
        self.css_last_modified = None

    def get_last_modified_date(self, file_path):
        """
        Get the last modified date of a file.

        Args:
            file_path (str): Path to the file.

        Returns:
            Last modified time of the file, as returned by os.path.getmtime.
        """
        return os.path.getmtime(file_path)

    def write_pdf(self):
        """
        Write the pdf file.
        """
        convert(
            self.md_path,
            self.css_path,
            self.output_path,
            extend_default_css=self.extend_default_css,
        )
        if self.loud:
            print(f"- PDF file updated: {datetime.now()}", flush=True)

    def observe(self, poll_interval=1):
        """
        Observe the markdown and CSS files. Calls write_pdf() when a file is
        modified.

        Args:
            poll_interval (int | float): Seconds to sleep between checks.
        """
        # Record the timestamps *before* the first conversion: an edit saved
        # while that conversion is still running then shows up as a change on
        # the first poll instead of being silently missed.
        self.md_last_modified = self.get_last_modified_date(self.md_path)
        self.css_last_modified = self.get_last_modified_date(self.css_path)

        self.write_pdf()

        try:
            while True:
                markdown_modified = self.get_last_modified_date(self.md_path)
                css_modified = self.get_last_modified_date(self.css_path)

                if (
                    markdown_modified != self.md_last_modified
                    or css_modified != self.css_last_modified
                ):
                    self.write_pdf()

                    self.md_last_modified = markdown_modified
                    self.css_last_modified = css_modified

                time.sleep(poll_interval)

        except KeyboardInterrupt:
            if self.loud:
                print("\nInterrupted by user.\n", flush=True)
@@ -0,0 +1,101 @@
1
+ """
2
+ This module contains functions that are used to get the output path, the CSS
3
+ path, and the usage message.
4
+ Author: @julynx
5
+ """
6
+
7
+ from pathlib import Path
8
+
9
+ try:
10
+ # Python 3.9+
11
+ from importlib.resources import files
12
+ except ImportError:
13
+ # Fallback for older Python versions
14
+ from importlib_resources import files
15
+
16
+ from .constants import BLUE, CYAN, GREEN, YELLOW, OPTIONS, OPTIONS_MODES
17
+ from .utils import color
18
+
19
+
20
def get_output_path(markdown_path, output_dir=None):
    """
    Get the output path for the pdf file.

    Args:
        markdown_path (str): The path to the markdown file.
        output_dir (str): Either the output directory, or the full path of
            the PDF file to write (recognised by its ".pdf" suffix).

    Returns:
        Path: The output path. Without output_dir, the PDF sits next to the
            markdown file; with a ".pdf" path, that path is used as is;
            otherwise the PDF is named after the markdown file and placed
            inside output_dir.
    """
    markdown_path = Path(markdown_path)
    pdf_name = f"{markdown_path.stem}.pdf"

    if output_dir is None:
        return markdown_path.parent / pdf_name

    output_dir = Path(output_dir)

    if output_dir.suffix == ".pdf":
        return output_dir

    # Place the file inside the requested directory (not in its parent).
    return output_dir / pdf_name
42
+
43
+
44
def get_css_path():
    """
    Locate the default stylesheet shipped inside the package.

    Returns:
        str: The path to the default CSS file.
    """
    return str(files("markdown_convert").joinpath("default.css"))
54
+
55
+
56
def get_code_css_path():
    """
    Locate the code stylesheet shipped inside the package.

    Returns:
        str: The path to the code CSS file.
    """
    return str(files("markdown_convert").joinpath("code.css"))
66
+
67
+
68
def get_usage():
    """
    Build the colorized help text shown to command-line users.

    Returns:
        str: The usage message.
    """
    equals = color(CYAN, "=")

    command = (
        f"{color(GREEN, 'markdown-convert')} "
        f"[{color(YELLOW, OPTIONS[0])}] [{color(BLUE, 'options')}]"
    )
    mode_option = (
        f"{color(BLUE, OPTIONS[1])}{equals}{color(CYAN, '|'.join(OPTIONS_MODES))}"
    )
    css_option = f"{color(BLUE, OPTIONS[2])}{equals}[{color(CYAN, 'css_file_path')}]"
    output_option = (
        f"{color(BLUE, OPTIONS[3])}{equals}[{color(CYAN, 'output_file_path')}]"
    )

    # Each entry already carries its own trailing newline.
    pieces = [
        "\n",
        "Usage:\n",
        f" {command}\n",
        "\n",
        "Options:\n",
        f" {mode_option}\n",
        " Convert the markdown file once (default) or live.\n",
        f" {css_option}\n",
        " Use a custom CSS file.\n",
        f" {output_option}\n",
        " Specify the output file path.\n",
    ]
    return "".join(pieces)
@@ -0,0 +1,107 @@
1
+ """
2
+ Module for transforming HTML content.
3
+ """
4
+
5
+ import re
6
+
7
+
8
def create_html_document(html_content, css_content, csp):
    """
    Assemble a full HTML page around a body fragment.

    Args:
        html_content (str): The HTML content to include in the body.
        css_content (str): The CSS styles to include in the head.
        csp (str): The Content Security Policy string, emitted as a meta tag.
    Returns:
        str: A complete HTML document as a string.
    """
    document_lines = (
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        '<meta charset="UTF-8">',
        f'<meta http-equiv="Content-Security-Policy" content="{csp}">',
        "<style>",
        f"{css_content}",
        "</style>",
        "</head>",
        "<body>",
        f"{html_content}",
        "</body>",
        "</html>",
    )
    return "\n".join(document_lines)
31
+
32
+
33
def create_sections(html):
    """
    Wrap every h2 heading and the content that follows it, up to the next h2,
    in a <section> element, using regular expressions.

    Args:
        html (str): HTML content.
    Returns:
        HTML content with sections wrapped in <section> tags.
    """
    section_re = re.compile(r"(<h2.*?>.*?</h2>)(.*?)(?=(<h2.*?>|$))", re.DOTALL)

    def _as_section(found):
        heading, body = found.group(1), found.group(2)
        return f"<section>\n{heading}\n{body}\n</section>\n"

    # Splitting on <code> elements keeps them as separate chunks, which are
    # passed through untouched so their text is never rewritten.
    chunks = re.split(r"(<code>.*?</code>)", html, flags=re.DOTALL)
    return "".join(
        chunk if chunk.startswith("<code>") else section_re.sub(_as_section, chunk)
        for chunk in chunks
    )
55
+
56
+
57
def render_mermaid_diagrams(html, *, nonce):
    """
    Prepend the Mermaid loader script when the HTML contains a diagram.

    Args:
        html (str): HTML content.
        nonce (str): Cryptographic nonce for CSP, set on the script tag.
    Returns:
        str: The HTML unchanged when it has no Mermaid container, otherwise
            the HTML preceded by the Mermaid initialization script.
    """
    # Nothing to do unless at least one Mermaid container is present.
    if '<div class="mermaid">' not in html:
        return html

    loader = f"""
<script type="module" nonce="{nonce}">
import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.esm.min.mjs';
mermaid.initialize({{
startOnLoad: true,
theme: 'default',
themeVariables: {{}},
fontFamily: 'arial, verdana, sans-serif'
}});
</script>
"""
    return loader + html
83
+
84
+
85
def render_checkboxes(html):
    """
    Replace the textual task markers "[ ]" and "[x]" with checkbox input
    elements, leaving the contents of <code> elements untouched.

    Args:
        html (str): HTML content.
    Returns:
        str: HTML content with rendered checkboxes.
    """
    # Applied in this order: unchecked marker first, then the checked one.
    substitutions = (
        ("[ ]", "<input type='checkbox'>"),
        ("[x]", "<input type='checkbox' checked>"),
    )

    rendered = []
    for chunk in re.split(r"(<code>.*?</code>)", html, flags=re.DOTALL):
        # Chunks that are code elements are copied through as they are.
        if not chunk.startswith("<code>"):
            for marker, markup in substitutions:
                chunk = chunk.replace(marker, markup)
        rendered.append(chunk)

    return "".join(rendered)
@@ -0,0 +1,38 @@
1
+ """
2
+ Utility functions for string manipulation.
3
+ Author: @julynx
4
+ """
5
+
6
+ import platform
7
+
8
+
9
def color(color_code, text):
    """
    Wrap text in an ANSI escape sequence for the given color.

    Args:
        color_code (str): The color code.
        text (str): The text to colorize.

    Returns:
        str: The colorized text, or the text unchanged on Windows.
    """
    # Colors are disabled when running on Windows.
    on_windows = platform.system() == "Windows"
    return text if on_windows else f"\033[{color_code}m{text}\033[0m"
26
+
27
+
28
def drop_duplicates(lst):
    """
    Remove repeated items from a list, keeping the first occurrence of each
    and preserving the original order.

    Args:
        lst: List to remove duplicates from.

    Returns:
        List without duplicates.
    """
    first_seen = {}
    for item in lst:
        # Only the first occurrence of a key is ever inserted.
        first_seen.setdefault(item, None)
    return list(first_seen)
@@ -0,0 +1,61 @@
1
+ """
2
+ This module contains functions to validate the input paths.
3
+ Author: @julynx
4
+ """
5
+
6
+ from pathlib import Path
7
+
8
+
9
def validate_markdown_path(markdown_path):
    """
    Validate the markdown file path.

    Args:
        markdown_path (str | Path): The path to the markdown file.

    Raises:
        FileNotFoundError: If the file is not found.
        ValueError: If the file is not a Markdown file.
    """
    if not Path(markdown_path).is_file():
        raise FileNotFoundError(f"File not found: '{markdown_path}'")

    # str() lets pathlib.Path arguments through the suffix check as well.
    if not str(markdown_path).endswith(".md"):
        raise ValueError("File must be a Markdown file.")
25
+
26
+
27
def validate_css_path(css_path):
    """
    Validate the CSS file path.

    Args:
        css_path (str | Path): The path to the CSS file.

    Raises:
        FileNotFoundError: If the file is not found.
        ValueError: If the file is not a CSS file.
    """
    if not Path(css_path).is_file():
        raise FileNotFoundError(f"File not found: '{css_path}'")

    # str() lets pathlib.Path arguments through the suffix check as well.
    if not str(css_path).endswith(".css"):
        raise ValueError("File must be a CSS file.")
43
+
44
+
45
def validate_output_path(output_dir):
    """
    Validate the output directory path.

    Args:
        output_dir (str | Path): The path to the output directory, or to the
            output ".pdf" file, in which case its parent directory is checked.

    Raises:
        FileNotFoundError: If the directory is not found.
    """
    check_dir = Path(output_dir)

    # str() lets pathlib.Path arguments through the suffix check as well.
    if str(output_dir).endswith(".pdf"):
        check_dir = check_dir.parent

    if not check_dir.is_dir():
        raise FileNotFoundError(f"Directory not found: '{check_dir}'")