md2hwpx 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,386 @@
1
+ import sys
2
+ import json
3
+ import os
4
+ import shutil
5
+ import zipfile
6
+ import io
7
+ from PIL import Image
8
+
9
+
10
class MarkdownToHtml:
    """Render a Pandoc-style JSON AST (a plain dict) as a standalone HTML page.

    The AST is a dict with an optional ``'meta'`` mapping and a ``'blocks'``
    list; every node is a dict with a type tag ``'t'`` and a payload ``'c'``.
    Unknown block and inline types are skipped silently.

    While rendering, the converter collects:

    * ``self.footnotes`` - block lists of every ``Note`` inline, in order of
      appearance; they are rendered as an ordered list after the body.
    * ``self.images`` - one ``{'src_path': ..., 'filename': ...}`` dict per
      distinct local image, so the caller can copy the files into an
      ``images/`` folder next to the output document.

    All text and attribute values taken from the AST are HTML-escaped.
    """

    # Images wider than this many pixels are scaled down proportionally.
    MAX_IMAGE_WIDTH_PX = 600

    # Inline node types that simply wrap their children in one HTML element.
    _INLINE_TAGS = {
        'Strong': 'strong',
        'Emph': 'em',
        'Underline': 'u',
        'Superscript': 'sup',
        'Subscript': 'sub',
        'Strikeout': 'del',
    }

    # Block node type -> name of the method that renders its payload.
    _BLOCK_HANDLERS = {
        'Header': '_handle_header',
        'Para': '_handle_para',
        'Plain': '_handle_plain',
        'BulletList': '_handle_bullet_list',
        'OrderedList': '_handle_ordered_list',
        'CodeBlock': '_handle_code_block',
        'Table': '_handle_table',
        'BlockQuote': '_handle_blockquote',
    }

    # Image sources with these prefixes are not local files and are kept as-is.
    _REMOTE_PREFIXES = ('http://', 'https://', '//', 'data:')

    @staticmethod
    def convert_to_html(input_path, output_path, json_ast=None):
        """
        Convert Markdown to HTML.

        Renders ``json_ast``, copies every local image it references into an
        ``images`` directory next to ``output_path`` and finally writes the
        HTML file. Image problems are reported on stderr and never abort the
        conversion.

        Args:
            input_path: Original input file path (for image resolution)
            output_path: Output HTML file path
            json_ast: Pre-parsed Pandoc-like AST dict (from MarkoToPandocAdapter)

        Raises:
            ValueError: If ``json_ast`` is None.
            OSError: If the HTML file itself cannot be written.
        """
        if json_ast is None:
            raise ValueError("json_ast parameter is required")

        converter = MarkdownToHtml(json_ast, input_path=input_path)
        final_html = converter.convert()

        # A bare file name means "write into the current directory".
        output_dir = os.path.dirname(output_path) or "."
        images_dir = os.path.join(output_dir, "images")

        if converter.images:
            try:
                os.makedirs(images_dir, exist_ok=True)
            except OSError as e:
                print(f"[Error] Image processing failed: {e}", file=sys.stderr)
            else:
                for img in converter.images:
                    # img = {'src_path': path as written in the AST,
                    #        'filename': name to use inside images/}
                    src_path = img['src_path']
                    target_path = os.path.join(images_dir, img['filename'])

                    found = converter._resolve_image_path(src_path)
                    if found is None:
                        print(f"[Warn] Image not found: {src_path}", file=sys.stderr)
                        continue

                    # Handle failures per image so that one unreadable file
                    # does not prevent the remaining images from being copied.
                    try:
                        shutil.copy2(found, target_path)
                    except shutil.SameFileError:
                        # The image already lives at its destination.
                        pass
                    except OSError as e:
                        print(f"[Error] Image processing failed: {e}", file=sys.stderr)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(final_html)

    def __init__(self, json_ast, input_path=None):
        """Create a converter for ``json_ast``.

        Args:
            json_ast: The AST dict to render.
            input_path: Optional path of the source document. When given,
                relative image paths are also looked up next to that file.
        """
        self.ast = json_ast
        self.output = []
        # Footnotes list
        self.footnotes = []
        self.images = []  # metadata for images
        self.title = None
        self.base_dir = (
            os.path.dirname(os.path.abspath(input_path)) if input_path else None
        )
        self._extract_metadata()

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _escape_text(text):
        """Escape ``&``, ``<`` and ``>`` for use as element content."""
        from html import escape
        return "" if text is None else escape(str(text), quote=False)

    @staticmethod
    def _escape_attr(value):
        """Escape a value for use inside a double-quoted HTML attribute."""
        from html import escape
        return "" if value is None else escape(str(value), quote=True)

    @classmethod
    def _is_remote(cls, src_path):
        """Return True when ``src_path`` is a URL rather than a local file."""
        return str(src_path).lower().startswith(cls._REMOTE_PREFIXES)

    def _resolve_image_path(self, src_path):
        """Return an existing file for ``src_path`` or None.

        The path is tried as written first and then relative to the directory
        of the input document (when one was supplied).
        """
        candidates = [src_path]
        base_dir = getattr(self, 'base_dir', None)
        if base_dir:
            candidates.append(os.path.join(base_dir, src_path))
        for candidate in candidates:
            # isfile, not exists: a directory is never a usable image source.
            if os.path.isfile(candidate):
                return candidate
        return None

    def _register_image(self, src_path):
        """Record a local image and return its file name inside ``images/``.

        The same source path always maps to the same name. Different sources
        that share a base name get a numeric suffix so that they do not
        overwrite each other when copied.
        """
        for entry in self.images:
            if entry['src_path'] == src_path:
                return entry['filename']

        taken = {entry['filename'] for entry in self.images}
        filename = os.path.basename(src_path)
        if filename in taken:
            stem, ext = os.path.splitext(filename)
            counter = 1
            while f"{stem}_{counter}{ext}" in taken:
                counter += 1
            filename = f"{stem}_{counter}{ext}"

        self.images.append({
            'src_path': src_path,
            'filename': filename
        })
        return filename

    @staticmethod
    def _parse_to_px(val_str):
        """Convert a CSS-like length (``"2in"``, ``"150"``) to whole pixels.

        Returns None for empty, malformed, percentage or unknown-unit values,
        which means "no explicit size".
        """
        import re
        if not val_str:
            return None
        m = re.fullmatch(
            r'(\d+(?:\.\d*)?|\.\d+)\s*([a-z%]*)', str(val_str).lower().strip()
        )
        if not m:
            return None
        val = float(m.group(1))
        unit = m.group(2)

        if not unit or unit == 'px':
            return round(val)
        if unit == 'in':
            return round(val * 96)
        if unit == 'cm':
            return round(val * 37.8)
        if unit == 'mm':
            return round(val * 3.78)
        if unit == 'pt':
            return round(val * 96 / 72)
        # Percentages and font-relative units cannot be resolved to pixels here.
        return None

    def _probe_image_size(self, src_path):
        """Return ``(width, height)`` of a local image via Pillow, or None."""
        path = self._resolve_image_path(src_path)
        if path is None:
            return None
        try:
            with Image.open(path) as im:
                return im.size
        except Exception:
            # Sizing is best-effort: an unreadable or unsupported image (or an
            # unavailable Pillow) must not abort the conversion.
            return None

    # ------------------------------------------------------------------
    # Metadata
    # ------------------------------------------------------------------

    def _extract_metadata(self):
        """Populate ``self.title`` from ``meta['title']`` when present."""
        if not self.ast:
            return
        meta = self.ast.get('meta') or {}
        title_node = meta.get('title') if isinstance(meta, dict) else None

        if isinstance(title_node, str):
            self.title = title_node
        elif isinstance(title_node, dict):
            kind = title_node.get('t')
            if kind == 'MetaInlines':
                self.title = self._get_plain_text(title_node.get('c', []))
            elif kind == 'MetaString':
                self.title = title_node.get('c', "")

    def _get_plain_text(self, inlines):
        """Flatten a list of inline nodes to unformatted, unescaped text."""
        if not isinstance(inlines, list):
            return ""
        text = []
        for item in inlines:
            if not isinstance(item, dict):
                continue
            t = item.get('t')
            c = item.get('c')
            if t == 'Str':
                if isinstance(c, str):
                    text.append(c)
            elif t in ('Space', 'SoftBreak', 'LineBreak'):
                # Breaks separate words just like spaces do.
                text.append(" ")
            elif t in ('Strong', 'Emph', 'Underline', 'Strikeout',
                       'Superscript', 'Subscript', 'SmallCaps'):
                text.append(self._get_plain_text(c))
            elif t in ('Link', 'Image'):
                text.append(self._get_plain_text(c[1]))
            elif t == 'Code':
                text.append(c[1])
            elif t == 'Quoted':
                text.append('"' + self._get_plain_text(c[1]) + '"')
        return "".join(text)

    # ------------------------------------------------------------------
    # Document
    # ------------------------------------------------------------------

    def convert(self):
        """Render the whole AST and return the complete HTML document."""
        # Start from a clean slate so that calling convert() twice does not
        # duplicate footnotes or image records.
        self.footnotes = []
        self.images = []

        blocks = (self.ast or {}).get('blocks', [])
        body_parts = [self._process_blocks(blocks)]

        # Footnotes
        if self.footnotes:
            body_parts.append("\n<hr />\n<div class='footnotes'>\n<ol>\n")
            # A footnote may itself contain notes; those are appended to the
            # list while it is being iterated and are rendered as well.
            for idx, note_blocks in enumerate(self.footnotes):
                note_html = self._process_blocks(note_blocks)
                body_parts.append(f"<li id='fn{idx+1}'>{note_html}</li>\n")
            body_parts.append("</ol>\n</div>")
        body_content = "".join(body_parts)

        # Wrap in HTML
        if self.title:
            title_tag = f"<title>{self._escape_text(self.title)}</title>"
        else:
            title_tag = "<title>Document</title>"

        page = [
            "<!DOCTYPE html>",
            '<html lang="ko">',
            "<head>",
            '<meta charset="UTF-8">',
            title_tag,
            "<style>",
            "body { font-family: sans-serif; line-height: 1.6; max-width: 800px; margin: 0 auto; padding: 2rem; }",
            "img { max-width: 100%; height: auto; }",
            "table { border-collapse: collapse; width: 100%; }",
            "th, td { border: 1px solid #ddd; padding: 8px; }",
            "th { background-color: #f2f2f2; }",
            "code { background-color: #f0f0f0; padding: 2px 4px; border-radius: 4px; }",
            "pre { background-color: #f0f0f0; padding: 1rem; overflow-x: auto; }",
            "</style>",
            "</head>",
            "<body>",
            body_content,
            "</body>",
            "</html>",
        ]
        return "\n".join(page)

    def _process_blocks(self, blocks):
        """Render a list of block nodes, one per line; non-lists give ``""``."""
        if not isinstance(blocks, list):
            return ""

        rendered = []
        for block in blocks:
            if not isinstance(block, dict):
                continue

            b_type = block.get('t')
            if b_type == 'HorizontalRule':
                rendered.append("<hr />")
                continue

            handler_name = self._BLOCK_HANDLERS.get(b_type)
            if handler_name is None:
                # Skip unknown block types
                continue
            rendered.append(getattr(self, handler_name)(block.get('c')))

        return "\n".join(rendered)

    def _process_inlines(self, inlines):
        """Render a list of inline nodes to HTML; non-lists give ``""``."""
        if not isinstance(inlines, list):
            return ""

        result = []
        for item in inlines:
            if not isinstance(item, dict):
                continue

            i_type = item.get('t')
            i_content = item.get('c')

            if i_type == 'Str':
                result.append(self._escape_text(i_content))
            elif i_type in ('Space', 'SoftBreak'):
                result.append(" ")
            elif i_type == 'LineBreak':
                result.append("<br />")
            elif i_type in self._INLINE_TAGS:
                tag = self._INLINE_TAGS[i_type]
                result.append(f"<{tag}>{self._process_inlines(i_content)}</{tag}>")
            elif i_type == 'Link':
                result.append(self._handle_link(i_content))
            elif i_type == 'Code':
                result.append(f"<code>{self._escape_text(i_content[1])}</code>")
            elif i_type == 'Image':
                result.append(self._handle_image(i_content))
            elif i_type == 'Note':
                result.append(self._handle_note(i_content))
            elif i_type == 'SmallCaps':
                inner = self._process_inlines(i_content)
                result.append(f'<span style="font-variant: small-caps;">{inner}</span>')
            elif i_type == 'Quoted':
                # Same quoting as _get_plain_text so both views agree.
                result.append('"' + self._process_inlines(i_content[1]) + '"')
            # Any other inline type is skipped.

        return "".join(result)

    # ------------------------------------------------------------------
    # Block handlers
    # ------------------------------------------------------------------

    def _handle_header(self, content):
        """Render a header; ``content[0]`` is the level, ``content[2]`` the inlines."""
        try:
            level = int(content[0])
        except (TypeError, ValueError):
            level = 1
        # HTML only defines h1..h6.
        level = min(max(level, 1), 6)
        text = self._process_inlines(content[2])
        return f"<h{level}>{text}</h{level}>"

    def _handle_para(self, content):
        """Render a paragraph from its inline list."""
        return f"<p>{self._process_inlines(content)}</p>"

    def _handle_plain(self, content):
        """Render inline content without a surrounding element."""
        return self._process_inlines(content)

    def _handle_bullet_list(self, content):
        """Render an unordered list; each item is a list of blocks."""
        items_html = [f"<li>{self._process_blocks(item)}</li>" for item in content]
        return "<ul>\n" + "\n".join(items_html) + "\n</ul>"

    def _handle_ordered_list(self, content):
        """Render an ordered list.

        ``content[1]`` holds the items. The first entry of ``content[0]`` is
        used as the start number when it is an integer other than 1.
        """
        list_attrs = content[0]
        start = list_attrs[0] if isinstance(list_attrs, (list, tuple)) and list_attrs else 1
        use_start = isinstance(start, int) and not isinstance(start, bool) and start != 1
        start_attr = f' start="{start}"' if use_start else ""

        items_html = [f"<li>{self._process_blocks(item)}</li>" for item in content[1]]
        return f"<ol{start_attr}>\n" + "\n".join(items_html) + "\n</ol>"

    def _handle_code_block(self, content):
        """Render a code block; ``content[1]`` is the literal code text."""
        return f'<pre><code>{self._escape_text(content[1])}</code></pre>'

    def _handle_blockquote(self, content):
        """Render a block quote from its nested blocks."""
        inner = self._process_blocks(content)
        return f"<blockquote>{inner}</blockquote>"

    def _handle_table(self, content):
        """Render a table.

        ``content[3]`` is the head (rows at index 1) and ``content[4]`` the
        list of bodies (rows at index 3 of each body).
        """
        table_head = content[3]
        table_bodies = content[4]
        html_parts = ["<table border='1'>"]

        head_rows = table_head[1]
        if head_rows:
            html_parts.append("<thead>")
            html_parts.extend(
                self._process_table_row(row, is_header=True) for row in head_rows
            )
            html_parts.append("</thead>")

        if table_bodies:
            html_parts.append("<tbody>")
            for body in table_bodies:
                html_parts.extend(
                    self._process_table_row(row, is_header=False) for row in body[3]
                )
            html_parts.append("</tbody>")

        html_parts.append("</table>")
        return "\n".join(html_parts)

    def _process_table_row(self, row, is_header=False):
        """Render one table row; cells are ``row[1]``.

        For each cell, index 2 is the row span, index 3 the column span and
        index 4 the list of content blocks.
        """
        tag = "th" if is_header else "td"
        row_html = ["<tr>"]

        for cell in row[1]:
            cell_content = self._process_blocks(cell[4])
            row_span = cell[2]
            col_span = cell[3]

            attrs = ""
            if isinstance(row_span, int) and row_span > 1:
                attrs += f' rowspan="{row_span}"'
            if isinstance(col_span, int) and col_span > 1:
                attrs += f' colspan="{col_span}"'

            row_html.append(f'<{tag}{attrs}>{cell_content}</{tag}>')

        row_html.append("</tr>")
        return "".join(row_html)

    # ------------------------------------------------------------------
    # Inline handlers
    # ------------------------------------------------------------------

    def _handle_link(self, content):
        """Render a link; ``content[1]`` is the label, ``content[2]`` is [url, title]."""
        label = self._process_inlines(content[1])
        target = content[2]
        href = self._escape_attr(target[0])
        title = target[1] if len(target) > 1 else ""
        title_attr = f' title="{self._escape_attr(title)}"' if title else ""
        return f'<a href="{href}"{title_attr}>{label}</a>'

    def _handle_image(self, content):
        """Render an ``<img>`` element and register local images for copying.

        Local images are rewritten to ``images/<filename>``; remote and
        ``data:`` sources are emitted unchanged. Explicit ``width``/``height``
        attributes win; otherwise the natural size is read with Pillow when
        the file can be found. Widths above ``MAX_IMAGE_WIDTH_PX`` are scaled
        down together with the height.
        """
        # content = [attr, caption, [target, title]]
        attr = content[0]
        attr_dict = dict(attr[2]) if attr and len(attr) > 2 else {}

        # alt is an attribute, so it must be plain text, not rendered markup.
        alt_text = self._get_plain_text(content[1])
        target = content[2]
        src_path = target[0]
        title = target[1] if len(target) > 1 else ""

        is_local = not self._is_remote(src_path)
        if is_local:
            # Update src to point to images/ folder
            new_src = f"images/{self._register_image(src_path)}"
        else:
            new_src = src_path

        # Calculate Dimensions
        w_int = self._parse_to_px(attr_dict.get('width'))
        h_int = self._parse_to_px(attr_dict.get('height'))

        # Pillow Auto-Sizing
        if w_int is None and is_local:
            natural = self._probe_image_size(src_path)
            if natural:
                nat_w, nat_h = natural
                if h_int and nat_h:
                    # Keep the requested height and derive a matching width.
                    w_int = max(1, round(nat_w * h_int / nat_h))
                else:
                    w_int, h_int = nat_w, nat_h

        # Max Width Logic
        max_width = self.MAX_IMAGE_WIDTH_PX
        if w_int and w_int > max_width:
            ratio = max_width / w_int
            w_int = max_width
            if h_int:
                h_int = int(h_int * ratio)

        width_attr_val = f' width="{w_int}"' if w_int else ""
        height_attr_val = f' height="{h_int}"' if h_int else ""
        title_attr = f' title="{self._escape_attr(title)}"' if title else ""

        return (
            f'<img src="{self._escape_attr(new_src)}" '
            f'alt="{self._escape_attr(alt_text)}"'
            f'{title_attr}{width_attr_val}{height_attr_val} />'
        )

    def _handle_note(self, content):
        """Queue a footnote's blocks and return its numbered reference link."""
        self.footnotes.append(content)
        fn_num = len(self.footnotes)
        return f'<sup><a href="#fn{fn_num}">[{fn_num}]</a></sup>'