md2pdf-tex 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2pdf.py ADDED
@@ -0,0 +1,416 @@
1
+ """
2
+ Convert a Markdown file with LaTeX math to a polished PDF file using KaTeX
3
+ for math rendering and Selenium (headless Chrome) for printing.
4
+
5
+ Usage: python3 md2pdf.py <input.md> [output.pdf]
6
+ """
7
+ import re
8
+ import sys
9
+ import os
10
+ import tempfile
11
+ import base64
12
+ from selenium import webdriver
13
+ from selenium.webdriver.chrome.options import Options
14
+ from selenium.webdriver.support.ui import WebDriverWait
15
+
16
+ # Execution logic is wrapped in main() at the bottom
17
+
18
+
19
+ # --- Step 1: Protect math blocks from markdown processing ---
20
+ # Extract display math $$ ... $$ and inline math $ ... $
21
+ # We'll convert markdown manually since we need fine control.
22
+
23
def _close_open_lists(html_lines, in_list, in_sublist):
    """Append the closing tags for whichever ``<ul>`` levels are still open."""
    if in_sublist:
        html_lines.append('</ul></li>')
    if in_list:
        html_lines.append('</ul>')


def _collect_display_math(lines, start):
    """Read the ``$$`` block that opens on ``lines[start]``.

    Returns ``(latex, next_index)`` where *latex* is the block content without
    its delimiters and *next_index* is the first line after the block.
    """
    opener = lines[start].strip()
    parts = []
    i = start + 1

    if opener == '$$':
        # Delimiter on its own line: take everything up to a lone closing $$.
        while i < len(lines) and lines[i].strip() != '$$':
            parts.append(lines[i])
            i += 1
        i += 1  # skip the closing $$ (harmless if the block was never closed)
    elif opener.endswith('$$'):
        # Single-line form: $$...$$
        parts.append(opener[2:-2])
    else:
        # Opens with "$$ ..." and closes on a later line that ends with $$.
        rest = opener[2:]
        if rest:
            parts.append(rest)
        while i < len(lines) and not lines[i].strip().endswith('$$'):
            parts.append(lines[i])
            i += 1
        if i < len(lines):
            closer = lines[i].strip()
            if closer != '$$':
                parts.append(closer[:-2])
            i += 1  # always step past the closing line

    return '\n'.join(parts).strip(), i


def md_to_html(md):
    """Convert a small Markdown subset to HTML while preserving LaTeX math.

    Handled constructs: ``$$`` display-math blocks, ``#``/``##``/``###``
    headings (four or more ``#`` are rendered as ``<h3>``), ``- `` bullet
    lists with one level of nesting, and paragraphs made of contiguous lines.
    Inline formatting is delegated to ``process_inline``.

    Display-math content is HTML-escaped so that ``<``, ``>`` and ``&`` inside
    a formula reach the browser as text instead of being parsed as markup.

    Args:
        md: The Markdown source as a single string.

    Returns:
        The HTML fragment (no ``<html>``/``<body>`` wrapper) as a string.
    """
    import html

    lines = md.split('\n')
    html_lines = []
    in_list = False
    in_sublist = False
    i = 0

    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Display math block
        if stripped.startswith('$$'):
            _close_open_lists(html_lines, in_list, in_sublist)
            in_list = in_sublist = False
            latex, i = _collect_display_math(lines, i)
            escaped = html.escape(latex, quote=False)
            html_lines.append(f'<div class="math-display">$${escaped}$$</div>')
            continue

        # Headings: only recognised at column 0; level is capped at 3.
        if line.startswith('#'):
            _close_open_lists(html_lines, in_list, in_sublist)
            in_list = in_sublist = False
            level = min(len(line) - len(line.lstrip('#')), 3)
            heading = process_inline(line.lstrip('#').strip())
            html_lines.append(f'<h{level}>{heading}</h{level}>')
            i += 1
            continue

        # List items: two or more leading whitespace characters, or a single
        # leading tab, mark a nested item.
        if stripped.startswith('- '):
            leading_ws = len(line) - len(line.lstrip())
            is_sub = leading_ws >= 2 or line.startswith('\t-')
            # Drop exactly the "- " marker; str.lstrip('- ') would also eat
            # dashes that belong to the item text (e.g. "- --flag").
            text = stripped[2:].strip()

            if is_sub:
                if not in_list:
                    html_lines.append('<ul>')
                    in_list = True
                if not in_sublist:
                    # The nested list is emitted inside its own <li>.
                    html_lines.append('<li><ul>')
                    in_sublist = True
            else:
                if in_sublist:
                    html_lines.append('</ul></li>')
                    in_sublist = False
                if not in_list:
                    html_lines.append('<ul>')
                    in_list = True
            html_lines.append(f'<li>{process_inline(text)}</li>')
            i += 1
            continue

        # Blank line: ends any open list, otherwise just skipped.
        if stripped == '':
            _close_open_lists(html_lines, in_list, in_sublist)
            in_list = in_sublist = False
            i += 1
            continue

        # Regular paragraph: collect contiguous lines until a blank line or
        # the start of another block-level construct.
        para_lines = [line]
        i += 1
        while i < len(lines):
            next_line = lines[i]
            next_stripped = next_line.strip()
            if (next_stripped == ''
                    or next_line.startswith('#')
                    or next_stripped.startswith(('- ', '$$'))):
                break
            para_lines.append(next_line)
            i += 1

        _close_open_lists(html_lines, in_list, in_sublist)
        in_list = in_sublist = False

        paragraph = ' '.join(part.strip() for part in para_lines)
        html_lines.append(f'<p>{process_inline(paragraph)}</p>')

    # Close any list still open at end of input.
    _close_open_lists(html_lines, in_list, in_sublist)

    return '\n'.join(html_lines)
178
+
179
+
180
def process_inline(text):
    """Convert inline Markdown in *text* to HTML.

    Handles, in order of precedence: inline code (`` `code` ``), inline math
    (``$...$``), bold (``**bold**``) and italic (``*italic*``).

    Code spans and math are swapped for placeholders before bold/italic are
    processed, so ``*`` or ``$`` inside them is never reinterpreted. Their
    content is HTML-escaped so ``<``, ``>`` and ``&`` reach the browser as
    text. Other text is passed through unescaped.

    Args:
        text: One line or paragraph of Markdown text.

    Returns:
        The text with inline constructs replaced by HTML tags; math is wrapped
        in ``<span class="math-inline">`` with its ``$`` delimiters kept.
    """
    import html

    protected = []

    def protect(fragment):
        # NUL-delimited placeholders cannot collide with ordinary text and
        # contain no characters the emphasis patterns look for.
        protected.append(fragment)
        return f'\x00{len(protected) - 1}\x00'

    def save_code(m):
        inner = html.escape(m.group(1), quote=False)
        return protect(f'<code>{inner}</code>')

    def save_math(m):
        formula = html.escape(m.group(0), quote=False)
        return protect(f'<span class="math-inline">{formula}</span>')

    # Code first: a "$" or "*" inside backticks is literal.
    text = re.sub(r'`(.+?)`', save_code, text)
    # Inline math: single "$" delimiters only ("$$" is display math).
    text = re.sub(r'(?<!\$)\$(?!\$)(.+?)\$(?!\$)', save_math, text)

    # Bold must run before italic so "**" is not consumed as two "*".
    text = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text)
    text = re.sub(r'\*(.+?)\*', r'<em>\1</em>', text)

    # Restore newest-first: a later fragment may enclose an earlier placeholder.
    for index in range(len(protected) - 1, -1, -1):
        text = text.replace(f'\x00{index}\x00', protected[index])

    return text
202
+
203
+
204
def _build_html_document(title, body_html):
    """Return the full HTML page (KaTeX includes, CSS, body) for *body_html*.

    *title* is HTML-escaped before it is placed in ``<title>``; *body_html*
    is inserted verbatim.
    """
    import html

    safe_title = html.escape(title)
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/katex.min.css">
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/katex.min.js"></script>
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.22/dist/contrib/auto-render.min.js"
    onload="renderMathInElement(document.body, {{
        delimiters: [
            {{left: '$$', right: '$$', display: true}},
            {{left: '$', right: '$', display: false}}
        ],
        throwOnError: false
    }});"></script>
<style>
    @import url('https://fonts.googleapis.com/css2?family=Source+Serif+4:ital,wght@0,400;0,600;0,700;1,400&family=Source+Sans+3:wght@400;600;700&family=Source+Code+Pro:wght@400&display=swap');

    :root {{
        --text: #1a1a2e;
        --muted: #555;
        --accent: #2d5aa0;
        --border: #d0d7de;
        --bg-code: #f6f8fa;
    }}

    * {{ margin: 0; padding: 0; box-sizing: border-box; }}

    body {{
        font-family: 'Source Serif 4', Georgia, 'Times New Roman', serif;
        font-size: 11pt;
        line-height: 1.7;
        color: var(--text);
        max-width: 680px;
        margin: 0 auto;
        padding: 48px 32px;
    }}

    h1 {{
        font-family: 'Source Sans 3', 'Helvetica Neue', Arial, sans-serif;
        font-size: 24pt;
        font-weight: 700;
        margin: 36px 0 12px 0;
        color: var(--text);
        letter-spacing: -0.02em;
    }}

    h2 {{
        font-family: 'Source Sans 3', 'Helvetica Neue', Arial, sans-serif;
        font-size: 17pt;
        font-weight: 700;
        margin: 32px 0 8px 0;
        padding-bottom: 6px;
        border-bottom: 2px solid var(--border);
        color: var(--text);
        letter-spacing: -0.01em;
    }}

    h3 {{
        font-family: 'Source Sans 3', 'Helvetica Neue', Arial, sans-serif;
        font-size: 13pt;
        font-weight: 600;
        margin: 24px 0 6px 0;
        color: var(--accent);
    }}

    p {{
        margin: 10px 0;
        text-align: justify;
        hyphens: auto;
    }}

    ul {{
        margin: 8px 0;
        padding-left: 24px;
    }}

    li {{
        margin-bottom: 5px;
    }}

    li > ul {{
        margin-top: 4px;
    }}

    strong {{
        font-weight: 600;
    }}

    code {{
        font-family: 'Source Code Pro', 'Menlo', 'Consolas', monospace;
        font-size: 0.9em;
        background: var(--bg-code);
        padding: 1px 5px;
        border-radius: 3px;
        border: 1px solid #e1e4e8;
    }}

    .math-display {{
        margin: 20px 0;
        text-align: center;
        overflow-x: auto;
    }}

    .math-inline .katex {{
        font-size: 1.0em;
    }}

    .math-display .katex {{
        font-size: 1.15em;
    }}

    /* Print-specific styles */
    @media print {{
        body {{
            padding: 0;
            max-width: none;
        }}
        h2 {{
            page-break-after: avoid;
        }}
        .math-display {{
            page-break-inside: avoid;
        }}
    }}
</style>
</head>
<body>
{body_html}
</body>
</html>
"""


def main():
    """Command-line entry point: convert a Markdown file to PDF.

    Reads ``sys.argv[1]`` as the input Markdown path and the optional
    ``sys.argv[2]`` as the output PDF path. When no output path is given, the
    PDF is named after the input file's base name and written relative to the
    current working directory.

    The Markdown is converted with ``md_to_html``, wrapped in an HTML page,
    written to a temporary file, loaded in headless Chrome through Selenium
    and printed with the ``Page.printToPDF`` DevTools command. The temporary
    file is removed afterwards.

    Exits with status 1 when no input path is given or when rendering or
    printing fails.
    """
    from pathlib import Path

    if len(sys.argv) < 2:
        print("Usage: md2pdf <input.md> [output.pdf]")
        sys.exit(1)

    input_path = sys.argv[1]
    if len(sys.argv) >= 3:
        output_pdf = sys.argv[2]
    else:
        output_pdf = os.path.splitext(os.path.basename(input_path))[0] + ".pdf"

    # Read as UTF-8 explicitly: the generated page declares and is written as
    # UTF-8, so the platform default encoding must not leak in here.
    with open(input_path, "r", encoding="utf-8") as f:
        md_text = f.read()

    title = os.path.splitext(os.path.basename(input_path))[0].replace('_', ' ').title()
    full_html = _build_html_document(title, md_to_html(md_text))

    # Write HTML to a temporary file that Chrome can load via a file: URL.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False, encoding='utf-8') as f:
        f.write(full_html)
        temp_html_path = f.name

    try:
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        # Point to playwright-installed chromium in test sandbox.
        # NOTE(review): environment-specific path; it is ignored when absent.
        sandbox_chrome = "/home/agent/.cache/ms-playwright/chromium-1223/chrome-linux64/chrome"
        if os.path.exists(sandbox_chrome):
            options.binary_location = sandbox_chrome

        # Create driver (Selenium Manager handles chromedriver automatically)
        driver = webdriver.Chrome(options=options)

        try:
            # Path.as_uri() produces a valid, percent-encoded file: URI on
            # every platform, unlike plain "file://" + path concatenation.
            driver.get(Path(os.path.abspath(temp_html_path)).as_uri())

            # Wait for KaTeX to finish rendering: done when the page has no
            # math containers or at least one rendered .katex node exists.
            WebDriverWait(driver, 10).until(
                lambda d: d.execute_script("""
                    const mathElements = document.querySelectorAll('.math-display, .math-inline');
                    if (mathElements.length === 0) return true;
                    return document.querySelectorAll('.katex').length > 0;
                """),
                message="timed out waiting for KaTeX to render math "
                        "(are the CDN scripts reachable?)",
            )

            # Print to PDF using CDP
            print_settings = {
                "printBackground": True,
                "paperWidth": 8.27,    # A4 width in inches
                "paperHeight": 11.69,  # A4 height in inches
                "marginTop": 0.5,      # 48px margin in inches (48 / 96 = 0.5)
                "marginBottom": 0.5,
                "marginLeft": 0.5,
                "marginRight": 0.5,
            }

            result = driver.execute_cdp_cmd("Page.printToPDF", print_settings)
            pdf_data = base64.b64decode(result['data'])

            with open(output_pdf, "wb") as pdf_file:
                pdf_file.write(pdf_data)

            print(f"PDF written to {os.path.abspath(output_pdf)}")
        finally:
            driver.quit()
    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero.
        print(f"Error printing PDF via Selenium: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        if os.path.exists(temp_html_path):
            os.remove(temp_html_path)
414
+
415
+ if __name__ == '__main__':
416
+ main()
@@ -0,0 +1,103 @@
1
+ Metadata-Version: 2.4
2
+ Name: md2pdf-tex
3
+ Version: 0.1.0
4
+ Summary: Convert Markdown with LaTeX math to PDF using KaTeX and headless Chrome
5
+ Author: Roberto Moura
6
+ Project-URL: Homepage, https://github.com/robertoffmoura/md2pdf
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.7
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: selenium
13
+
14
+ # md2pdf
15
+
16
+ A minimalist, high-fidelity Markdown-to-PDF converter written in Python. It parses Markdown files containing LaTeX mathematical formulas, renders them using KaTeX, and outputs a print-perfect PDF using a headless Chrome instance via Selenium.
17
+
18
+ ## Features
19
+
20
+ - **High-Fidelity PDF Output**: Employs headless Google Chrome to print the rendered HTML layout to PDF, preserving pagination, fonts, margins, and alignments.
21
+ - **LaTeX Math Support**: Seamlessly renders inline math (`$...$`) and block math (`$$...$$`) via **KaTeX**.
22
+ - **Minimal Dependencies**: Requires only Python, Chrome/Chromium, and the Python `selenium` library.
23
+ - **Polished Typography**: Features premium, print-optimized font pairings (Source Serif 4, Source Sans 3, and Source Code Pro) loaded dynamically from Google Fonts.
24
+ - **Clean Markdown Parser**: A custom regex-based parser that translates standard Markdown elements into clean HTML while protecting LaTeX delimiters from markdown interference.
25
+
26
+ ---
27
+
28
+ ## Installation
29
+
30
+ ### 1. Prerequisites
31
+ - **Python 3.7 or newer**
32
+ - **Google Chrome** or **Chromium** browser installed on your system.
33
+
34
+ ### 2. Install Python Package
35
+ Install the Python dependencies via `pip`. It is recommended to use a virtual environment:
36
+
37
+ ```bash
38
+ # Create and activate a virtual environment (optional)
39
+ python3 -m venv venv
40
+ source venv/bin/activate
41
+
42
+ # Install Selenium
43
+ pip install selenium
44
+ ```
45
+
46
+ *Note: Selenium Manager will automatically locate and download the appropriate driver (`chromedriver`) for your Chrome version. No manual driver setup is needed.*
47
+
48
+ ---
49
+
50
+ ## Usage
51
+
52
+ Run the converter from your terminal:
53
+
54
+ ```bash
55
+ python3 md2pdf.py <input.md> [output.pdf]
56
+ ```
57
+
58
+ - **`<input.md>`**: The path to your input Markdown file.
59
+ - **`[output.pdf]`** *(Optional)*: The path for the output PDF file. If omitted, it defaults to the input file's base name with a `.pdf` extension, written to the current working directory.
60
+
61
+ ### Example
62
+
63
+ Given a file `document.md` containing:
64
+
65
+ ```markdown
66
+ # Physics Report
67
+
68
+ Let's discuss Maxwell's equations. In differential form, Faraday's law of induction is:
69
+
70
+ $$\nabla \times \mathbf{E} = -\frac{\partial \mathbf{B}}{\partial t}$$
71
+
72
+ Where:
73
+ - $\mathbf{E}$ is the electric field.
74
+ - $\mathbf{B}$ is the magnetic field.
75
+ ```
76
+
77
+ Convert it using:
78
+
79
+ ```bash
80
+ python3 md2pdf.py document.md
81
+ ```
82
+
83
+ This generates `document.pdf` with properly formatted headers, bulleted lists, and beautiful, high-resolution mathematical equations.
84
+
85
+ ---
86
+
87
+ ## Supported Markdown Elements
88
+
89
+ - **Headings**: `#` (H1), `##` (H2), and `###` (H3).
90
+ - **Inline Math**: `$ ... $` for inline formulas (e.g. $E = mc^2$).
91
+ - **Block Math**: `$$ ... $$` for centered display equations.
92
+ - **Unordered Lists**: `- item` and indented `- subitem`.
93
+ - **Text Styling**: `**bold**`, `*italic*`, and inline `` `code` `` fragments.
94
+ - **Paragraphs**: Contiguous lines are automatically joined into standard justified paragraphs.
95
+
96
+ ---
97
+
98
+ ## How It Works
99
+
100
+ 1. **Preprocessing**: The script reads the input Markdown, identifies math blocks, and translates markdown elements (headers, lists, styling) to semantic HTML tags.
101
+ 2. **HTML Generation**: It constructs a single HTML document that loads the **KaTeX** stylesheets/scripts and the web fonts from public CDNs (so network access is required at conversion time) and applies the typography rules.
102
+ 3. **Rendering & Math Processing**: Selenium starts a headless Chrome browser, loads the HTML, and waits for KaTeX's auto-render extension to process the mathematical formatting.
103
+ 4. **PDF Printing**: Headless Chrome's print-to-PDF functionality (`Page.printToPDF` via Chrome DevTools Protocol) is triggered with A4 measurements and standard margins to produce a high-fidelity document.
@@ -0,0 +1,6 @@
1
+ md2pdf.py,sha256=bP61rxR8Y9ishF3uCg35c26Yeo7S3HamN1pcns3IZSE,10389
2
+ md2pdf_tex-0.1.0.dist-info/METADATA,sha256=b3cSQNYeElIEryQfOktrBzBcG6oHNhqkl0VfUSLKx0w,4041
3
+ md2pdf_tex-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
4
+ md2pdf_tex-0.1.0.dist-info/entry_points.txt,sha256=XIFGjuiUXJ4L6dHsDEx0qC0mzoD2FOk4GQSljXl9J0A,39
5
+ md2pdf_tex-0.1.0.dist-info/top_level.txt,sha256=M6w-SJJb4vYpGkCZOC_PLJ47_mldkUsf0HvtDmdhJJ0,7
6
+ md2pdf_tex-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ md2pdf = md2pdf:main
@@ -0,0 +1 @@
1
+ md2pdf