md2hwpx 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- md2hwpx/MarkdownToHtml.py +386 -0
- md2hwpx/MarkdownToHwpx.py +2862 -0
- md2hwpx/__init__.py +36 -0
- md2hwpx/__main__.py +6 -0
- md2hwpx/blank.hwpx +0 -0
- md2hwpx/cli.py +148 -0
- md2hwpx/config.py +74 -0
- md2hwpx/converter_api.py +54 -0
- md2hwpx/exceptions.py +33 -0
- md2hwpx/frontmatter_parser.py +123 -0
- md2hwpx/marko_adapter.py +510 -0
- md2hwpx-0.1.2.dist-info/METADATA +232 -0
- md2hwpx-0.1.2.dist-info/RECORD +18 -0
- md2hwpx-0.1.2.dist-info/WHEEL +5 -0
- md2hwpx-0.1.2.dist-info/entry_points.txt +2 -0
- md2hwpx-0.1.2.dist-info/licenses/LICENSE +22 -0
- md2hwpx-0.1.2.dist-info/licenses/NOTICE +7 -0
- md2hwpx-0.1.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import shutil
|
|
5
|
+
import zipfile
|
|
6
|
+
import io
|
|
7
|
+
from PIL import Image
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MarkdownToHtml:
    """Render a Pandoc-style JSON AST as a standalone HTML page.

    The AST is a dict with an optional ``meta`` mapping and a ``blocks``
    list, where every node is ``{'t': <type>, 'c': <content>}``.  While
    rendering, the converter records the local images it encountered in
    ``self.images`` (so the caller can copy them next to the output) and
    the footnote bodies in ``self.footnotes``.

    All text taken from the AST is HTML-escaped before it is emitted.
    """

    # Inline node types that map one-to-one onto a wrapping HTML element.
    _INLINE_WRAPPERS = {
        'Strong': 'strong',
        'Emph': 'em',
        'Underline': 'u',
        'Superscript': 'sup',
        'Subscript': 'sub',
        'Strikeout': 'del',
    }

    # Image sources with these prefixes are not local files: they are left
    # untouched in the HTML and never scheduled for copying.
    _REMOTE_PREFIXES = ("http://", "https://", "//", "data:")

    # Images wider than this are scaled down (height follows proportionally).
    MAX_WIDTH_PX = 600

    @staticmethod
    def convert_to_html(input_path, output_path, json_ast=None):
        """Convert a pre-parsed Markdown AST to an HTML file.

        Local images referenced by the document are copied into an
        ``images`` directory next to ``output_path``.  Image problems are
        reported on stderr and never abort the conversion.

        Args:
            input_path: Original input file path; relative image paths are
                also resolved against its directory.
            output_path: Output HTML file path.
            json_ast: Pre-parsed Pandoc-like AST dict.

        Raises:
            ValueError: If ``json_ast`` is None.
        """
        if json_ast is None:
            raise ValueError("json_ast parameter is required")

        base_dir = os.path.dirname(os.path.abspath(input_path)) if input_path else None
        converter = MarkdownToHtml(json_ast, base_dir=base_dir)
        final_html = converter.convert()

        output_dir = os.path.dirname(output_path) or "."
        images_dir = os.path.join(output_dir, "images")

        if converter.images:
            try:
                os.makedirs(images_dir, exist_ok=True)
            except OSError as e:
                print(f"[Error] Image processing failed: {e}", file=sys.stderr)
            else:
                # Each entry is {'src_path': <path as written in the AST>,
                #                'filename': <basename used under images/>}.
                # NOTE: two different sources sharing a basename map to the
                # same target file; the later copy wins.
                for img in converter.images:
                    src_path = img['src_path']
                    target_path = os.path.join(images_dir, img['filename'])

                    found = MarkdownToHtml._resolve_image_path(src_path, base_dir)
                    if found is None:
                        print(f"[Warn] Image not found: {src_path}", file=sys.stderr)
                        continue

                    # Handle failures per image so that one bad file does not
                    # prevent the remaining images from being copied.
                    try:
                        shutil.copy2(found, target_path)
                    except shutil.SameFileError:
                        pass  # the image already lives in the output images directory
                    except OSError as e:
                        print(f"[Error] Image processing failed: {e}", file=sys.stderr)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(final_html)

    def __init__(self, json_ast, base_dir=None):
        """Store the AST and extract document metadata.

        Args:
            json_ast: Pandoc-like AST dict.
            base_dir: Optional directory against which relative image paths
                are additionally resolved when probing image dimensions.
        """
        self.ast = json_ast
        self.base_dir = base_dir
        self.output = []
        # Footnote bodies (lists of blocks), in order of first reference.
        self.footnotes = []
        # Metadata for local images that must be copied next to the output.
        self.images = []
        self.title = None
        self._extract_metadata()

    @staticmethod
    def _escape(value, quote=False):
        """Return ``value`` as HTML-safe text (``quote=True`` for attribute values)."""
        from html import escape
        return escape("" if value is None else str(value), quote=quote)

    @staticmethod
    def _resolve_image_path(src_path, base_dir=None):
        """Return the first existing file for ``src_path``, or None.

        The path is tried as written (relative to the current directory)
        and then relative to ``base_dir`` when one is given.
        """
        if not src_path:
            return None
        candidates = [src_path]
        if base_dir:
            candidates.append(os.path.join(base_dir, src_path))
        for candidate in candidates:
            if os.path.isfile(candidate):
                return candidate
        return None

    @staticmethod
    def _parse_to_px(value):
        """Parse a CSS-like length (``"120"``, ``"2in"``, ``"5cm"``) into whole pixels.

        Returns None for empty, malformed, or percentage values.  Units other
        than ``in``/``cm``/``mm`` are treated as pixels.
        """
        import re
        if not value:
            return None
        text = str(value).lower().strip()
        # Require a well-formed number so that inputs such as "." or "1.2.3"
        # cannot reach float() in a form it rejects.
        m = re.match(r'(\d+(?:\.\d*)?|\.\d+)\s*([a-z%]+)?', text)
        if not m:
            return None
        number = float(m.group(1))
        unit = m.group(2)
        if unit == '%':
            return None
        factor = {'in': 96, 'cm': 37.8, 'mm': 3.78}.get(unit, 1)
        return int(number * factor)

    def _extract_metadata(self):
        """Populate ``self.title`` from ``meta.title`` when present."""
        if not isinstance(self.ast, dict):
            return
        meta = self.ast.get('meta') or {}
        title_node = meta.get('title') if isinstance(meta, dict) else None
        if not isinstance(title_node, dict):
            return

        kind = title_node.get('t')
        if kind == 'MetaInlines':
            self.title = self._get_plain_text(title_node.get('c', []))
        elif kind == 'MetaString':
            self.title = title_node.get('c', "")

    def _get_plain_text(self, inlines):
        """Flatten a list of inline nodes to unformatted (unescaped) text."""
        if not isinstance(inlines, list):
            return ""
        text = []
        for item in inlines:
            if not isinstance(item, dict):
                continue
            t = item.get('t')
            c = item.get('c')
            if t == 'Str':
                if isinstance(c, str):
                    text.append(c)
            elif t in ('Space', 'SoftBreak', 'LineBreak'):
                # Breaks separate words; dropping them would glue text together.
                text.append(" ")
            elif t in ('Strong', 'Emph', 'Underline', 'Strikeout',
                       'Superscript', 'Subscript', 'SmallCaps'):
                text.append(self._get_plain_text(c))
            elif t in ('Link', 'Image'):
                text.append(self._get_plain_text(c[1]))
            elif t == 'Code':
                text.append(c[1])
            elif t == 'Quoted':
                text.append('"' + self._get_plain_text(c[1]) + '"')
        return "".join(text)

    def convert(self):
        """Render the whole document and return it as an HTML string.

        The collected ``images`` and ``footnotes`` are reset first, so
        calling this method repeatedly yields the same result.
        """
        self.footnotes = []
        self.images = []

        ast = self.ast if isinstance(self.ast, dict) else {}
        parts = [self._process_blocks(ast.get('blocks', []))]

        if self.footnotes:
            parts.append("\n<hr />\n<div class='footnotes'>\n<ol>\n")
            # A footnote body may itself reference further notes; those are
            # appended to self.footnotes while we iterate and are picked up
            # by this same loop.
            for idx, note_blocks in enumerate(self.footnotes):
                note_html = self._process_blocks(note_blocks)
                parts.append(f"<li id='fn{idx+1}'>{note_html}</li>\n")
            parts.append("</ol>\n</div>")
        body_content = "".join(parts)

        title_text = self._escape(self.title) if self.title else "Document"
        page = [
            "<!DOCTYPE html>",
            '<html lang="ko">',
            "<head>",
            '<meta charset="UTF-8">',
            f"<title>{title_text}</title>",
            "<style>",
            "body { font-family: sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 2rem; }",
            "img { max-width: 100%; height: auto; }",
            "table { border-collapse: collapse; width: 100%; }",
            "th, td { border: 1px solid #ddd; padding: 8px; }",
            "th { background-color: #f2f2f2; }",
            "code { background-color: #f0f0f0; padding: 2px 4px; border-radius: 4px; }",
            "pre { background-color: #f0f0f0; padding: 1rem; overflow-x: auto; }",
            "</style>",
            "</head>",
            "<body>",
            body_content,
            "</body>",
            "</html>",
        ]
        return "\n".join(page)

    def _process_blocks(self, blocks):
        """Render a list of block nodes, one per line; unknown types are skipped."""
        if not isinstance(blocks, list):
            return ""

        handlers = {
            'Header': self._handle_header,
            'Para': self._handle_para,
            'Plain': self._handle_plain,
            'BulletList': self._handle_bullet_list,
            'OrderedList': self._handle_ordered_list,
            'CodeBlock': self._handle_code_block,
            'Table': self._handle_table,
            'BlockQuote': self._handle_blockquote,
        }

        result = []
        for block in blocks:
            if not isinstance(block, dict):
                continue
            b_type = block.get('t')
            if b_type == 'HorizontalRule':
                result.append("<hr />")
                continue
            handler = handlers.get(b_type)
            if handler is not None:
                result.append(handler(block.get('c')))
        return "\n".join(result)

    def _process_inlines(self, inlines):
        """Render a list of inline nodes to HTML; unknown types are skipped."""
        if not isinstance(inlines, list):
            return ""

        result = []
        for item in inlines:
            if not isinstance(item, dict):
                continue

            i_type = item.get('t')
            i_content = item.get('c')

            if i_type == 'Str':
                result.append(self._escape(i_content))
            elif i_type in ('Space', 'SoftBreak'):
                result.append(" ")
            elif i_type == 'LineBreak':
                result.append("<br />")
            elif i_type in self._INLINE_WRAPPERS:
                tag = self._INLINE_WRAPPERS[i_type]
                result.append(f"<{tag}>{self._process_inlines(i_content)}</{tag}>")
            elif i_type == 'Link':
                result.append(self._handle_link(i_content))
            elif i_type == 'Code':
                result.append(f"<code>{self._escape(i_content[1])}</code>")
            elif i_type == 'Image':
                result.append(self._handle_image(i_content))
            elif i_type == 'Note':
                result.append(self._handle_note(i_content))
            elif i_type == 'Quoted':
                # Same rendering as _get_plain_text: plain double quotes.
                result.append('"' + self._process_inlines(i_content[1]) + '"')
            elif i_type == 'SmallCaps':
                inner = self._process_inlines(i_content)
                result.append(f'<span style="font-variant: small-caps;">{inner}</span>')
            # Any other inline type is intentionally skipped.

        return "".join(result)

    def _handle_link(self, content):
        """Render a Link node; content = [attr, inlines, [url, title]]."""
        target = content[2]
        href = self._escape(target[0], quote=True)
        title = target[1] if len(target) > 1 else ""
        title_attr = f' title="{self._escape(title, quote=True)}"' if title else ""
        return f'<a href="{href}"{title_attr}>{self._process_inlines(content[1])}</a>'

    def _handle_header(self, content):
        """Render a Header node; content = [level, attr, inlines]."""
        try:
            level = int(content[0])
        except (TypeError, ValueError):
            level = 1
        # HTML only defines <h1>..<h6>.
        level = min(max(level, 1), 6)
        text = self._process_inlines(content[2])
        return f"<h{level}>{text}</h{level}>"

    def _handle_para(self, content):
        """Render a Para node as a <p> element."""
        return f"<p>{self._process_inlines(content)}</p>"

    def _handle_plain(self, content):
        """Render a Plain node (inline content without a wrapping element)."""
        return self._process_inlines(content)

    def _handle_bullet_list(self, content):
        """Render a BulletList node; content is a list of items (block lists)."""
        items_html = [f"<li>{self._process_blocks(item)}</li>" for item in content]
        return "<ul>\n" + "\n".join(items_html) + "\n</ul>"

    def _handle_ordered_list(self, content):
        """Render an OrderedList node; content = [[start, style, delim], items]."""
        list_attrs = content[0]
        start = list_attrs[0] if isinstance(list_attrs, (list, tuple)) and list_attrs else 1
        # Only emit the attribute when numbering does not begin at 1.
        wants_start = isinstance(start, int) and not isinstance(start, bool) and start != 1
        start_attr = f' start="{start}"' if wants_start else ""
        items_html = [f"<li>{self._process_blocks(item)}</li>" for item in content[1]]
        return f"<ol{start_attr}>\n" + "\n".join(items_html) + "\n</ol>"

    def _handle_code_block(self, content):
        """Render a CodeBlock node; content = [attr, text]."""
        return f'<pre><code>{self._escape(content[1])}</code></pre>'

    def _handle_blockquote(self, content):
        """Render a BlockQuote node; content is a list of blocks."""
        inner = self._process_blocks(content)
        return f"<blockquote>{inner}</blockquote>"

    def _natural_image_size(self, src_path):
        """Return the (width, height) of a local image via Pillow, or None."""
        local_path = self._resolve_image_path(src_path, self.base_dir)
        if local_path is None:
            return None
        try:
            with Image.open(local_path) as im:
                return im.size
        except Exception as e:
            # Sizing is best-effort: report the problem and emit no dimensions.
            print(f"[Warn] Could not read image size for {src_path}: {e}", file=sys.stderr)
            return None

    def _handle_image(self, content):
        """Render an Image node; content = [attr, caption, [target, title]].

        Local images are recorded in ``self.images`` and referenced as
        ``images/<basename>``; remote sources are referenced unchanged.
        Width/height come from the node attributes, falling back to the
        file's own size, and are capped at ``MAX_WIDTH_PX``.
        """
        attr = content[0]
        attr_dict = dict(attr[2]) if attr and len(attr) > 2 else {}

        target = content[2]
        src_path = target[0]
        title = target[1] if len(target) > 1 else ""
        # Attribute values cannot hold markup, so flatten the caption.
        alt_text = self._get_plain_text(content[1])

        is_remote = str(src_path).lower().startswith(self._REMOTE_PREFIXES)
        if is_remote:
            new_src = src_path
        else:
            filename = os.path.basename(src_path)
            # Remember the file so the caller can copy it next to the output.
            self.images.append({
                'src_path': src_path,
                'filename': filename
            })
            new_src = f"images/{filename}"

        w_int = self._parse_to_px(attr_dict.get('width'))
        h_int = self._parse_to_px(attr_dict.get('height'))

        # No usable width given: fall back to the image's own dimensions.
        if w_int is None and not is_remote:
            natural = self._natural_image_size(src_path)
            if natural:
                nat_w, nat_h = natural
                if h_int is None:
                    w_int, h_int = nat_w, nat_h
                elif nat_h:
                    # Keep the requested height and derive a proportional width.
                    w_int = int(nat_w * h_int / nat_h)

        if w_int and w_int > self.MAX_WIDTH_PX:
            ratio = self.MAX_WIDTH_PX / w_int
            w_int = self.MAX_WIDTH_PX
            if h_int:
                h_int = int(h_int * ratio)

        width_attr_val = f' width="{w_int}"' if w_int else ""
        height_attr_val = f' height="{h_int}"' if h_int else ""
        title_attr = f' title="{self._escape(title, quote=True)}"' if title else ""

        src_attr = self._escape(new_src, quote=True)
        alt_attr = self._escape(alt_text, quote=True)
        return f'<img src="{src_attr}" alt="{alt_attr}"{title_attr}{width_attr_val}{height_attr_val} />'

    def _handle_note(self, content):
        """Register a footnote body and return its superscript reference link."""
        self.footnotes.append(content)
        fn_num = len(self.footnotes)
        return f'<sup><a href="#fn{fn_num}">[{fn_num}]</a></sup>'

    def _handle_table(self, content):
        """Render a Table node.

        Reads the head from ``content[3]`` (rows at index 1) and the bodies
        from ``content[4]`` (rows at index 3 of each body).
        """
        table_head = content[3]
        table_bodies = content[4]
        html_parts = ["<table border='1'>"]

        head_rows = table_head[1]
        if head_rows:
            html_parts.append("<thead>")
            for row in head_rows:
                html_parts.append(self._process_table_row(row, is_header=True))
            html_parts.append("</thead>")

        if table_bodies:
            html_parts.append("<tbody>")
            for body in table_bodies:
                for row in body[3]:
                    html_parts.append(self._process_table_row(row, is_header=False))
            html_parts.append("</tbody>")

        html_parts.append("</table>")
        return "\n".join(html_parts)

    def _process_table_row(self, row, is_header=False):
        """Render one table row; each cell is [attr, align, rowspan, colspan, blocks]."""
        cells = row[1]
        tag = "th" if is_header else "td"
        row_html = ["<tr>"]

        for cell in cells:
            cell_content = self._process_blocks(cell[4])
            row_span = cell[2]
            col_span = cell[3]

            attrs = ""
            if row_span > 1:
                attrs += f' rowspan="{row_span}"'
            if col_span > 1:
                attrs += f' colspan="{col_span}"'

            row_html.append(f'<{tag}{attrs}>{cell_content}</{tag}>')

        row_html.append("</tr>")
        return "".join(row_html)
|