hos-m2f 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hos_m2f/cli/__init__.py +1 -1
- hos_m2f/converters/md_to_docx.py +172 -41
- hos_m2f/converters/md_to_epub.py +37 -77
- hos_m2f/converters/md_to_html.py +2 -14
- hos_m2f/converters/md_to_json.py +40 -20
- hos_m2f/converters/md_to_latex.py +63 -0
- hos_m2f/converters/md_to_xml.py +40 -20
- hos_m2f/converters/pdf_to_md.py +120 -0
- {hos_m2f-0.5.3.dist-info → hos_m2f-0.5.5.dist-info}/METADATA +1 -1
- hos_m2f-0.5.5.dist-info/RECORD +26 -0
- {hos_m2f-0.5.3.dist-info → hos_m2f-0.5.5.dist-info}/entry_points.txt +1 -0
- {hos_m2f-0.5.3.dist-info → hos_m2f-0.5.5.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/test_converters.py +179 -0
- tests/test_latex.py +182 -0
- tests/test_modes.py +202 -0
- hos_m2f-0.5.3.dist-info/RECORD +0 -20
- {hos_m2f-0.5.3.dist-info → hos_m2f-0.5.5.dist-info}/WHEEL +0 -0
hos_m2f/cli/__init__.py
CHANGED
hos_m2f/converters/md_to_docx.py
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
from typing import Any, Optional, Dict
|
|
4
4
|
from docx import Document
|
|
5
|
-
from docx.shared import Inches, Pt
|
|
6
|
-
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
5
|
+
from docx.shared import Inches, Pt, RGBColor
|
|
6
|
+
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE
|
|
7
7
|
from docx.enum.style import WD_STYLE_TYPE
|
|
8
8
|
from hos_m2f.converters.base_converter import BaseConverter
|
|
9
9
|
import mistune
|
|
@@ -31,22 +31,6 @@ class MDToDOCXConverter(BaseConverter):
|
|
|
31
31
|
# 设置默认样式
|
|
32
32
|
self._setup_styles(doc)
|
|
33
33
|
|
|
34
|
-
# 解析Markdown
|
|
35
|
-
markdown = mistune.create_markdown(
|
|
36
|
-
plugins=[
|
|
37
|
-
'url',
|
|
38
|
-
'abbr',
|
|
39
|
-
'def_list',
|
|
40
|
-
'footnotes',
|
|
41
|
-
'tables',
|
|
42
|
-
'task_lists',
|
|
43
|
-
'strikethrough',
|
|
44
|
-
'highlight',
|
|
45
|
-
'superscript',
|
|
46
|
-
'subscript'
|
|
47
|
-
]
|
|
48
|
-
)
|
|
49
|
-
|
|
50
34
|
# 自定义渲染器
|
|
51
35
|
class DOCXRenderer(mistune.HTMLRenderer):
|
|
52
36
|
def __init__(self, doc):
|
|
@@ -92,35 +76,130 @@ class MDToDOCXConverter(BaseConverter):
|
|
|
92
76
|
return ''
|
|
93
77
|
|
|
94
78
|
def table(self, text):
|
|
95
|
-
#
|
|
96
|
-
|
|
79
|
+
# 解析Markdown表格并转换为DOCX表格
|
|
80
|
+
try:
|
|
81
|
+
# 分割表格行
|
|
82
|
+
rows = text.strip().split('\n')
|
|
83
|
+
if not rows:
|
|
84
|
+
return ''
|
|
85
|
+
|
|
86
|
+
# 解析表头
|
|
87
|
+
header_cells = [cell.strip() for cell in rows[0].split('|') if cell.strip()]
|
|
88
|
+
if not header_cells:
|
|
89
|
+
return ''
|
|
90
|
+
|
|
91
|
+
# 创建表格
|
|
92
|
+
table = self.doc.add_table(rows=1, cols=len(header_cells))
|
|
93
|
+
table.style = 'Table Grid'
|
|
94
|
+
|
|
95
|
+
# 填充表头
|
|
96
|
+
header_row = table.rows[0]
|
|
97
|
+
for i, cell_text in enumerate(header_cells):
|
|
98
|
+
cell = header_row.cells[i]
|
|
99
|
+
cell.text = cell_text
|
|
100
|
+
# 设置表头样式
|
|
101
|
+
for paragraph in cell.paragraphs:
|
|
102
|
+
for run in paragraph.runs:
|
|
103
|
+
run.bold = True
|
|
104
|
+
run.font.size = Pt(11)
|
|
105
|
+
paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
|
106
|
+
|
|
107
|
+
# 跳过分隔线行,并解析对齐方式
|
|
108
|
+
alignments = []
|
|
109
|
+
if len(rows) > 1 and '---' in rows[1]:
|
|
110
|
+
# 解析对齐方式
|
|
111
|
+
alignment_row = rows[1]
|
|
112
|
+
alignment_cells = [cell.strip() for cell in alignment_row.split('|') if cell.strip()]
|
|
113
|
+
for cell in alignment_cells:
|
|
114
|
+
if cell.startswith(':') and cell.endswith(':'):
|
|
115
|
+
alignments.append(WD_ALIGN_PARAGRAPH.CENTER)
|
|
116
|
+
elif cell.endswith(':'):
|
|
117
|
+
alignments.append(WD_ALIGN_PARAGRAPH.RIGHT)
|
|
118
|
+
else:
|
|
119
|
+
alignments.append(WD_ALIGN_PARAGRAPH.LEFT)
|
|
120
|
+
data_rows = rows[2:]
|
|
121
|
+
else:
|
|
122
|
+
data_rows = rows[1:]
|
|
123
|
+
|
|
124
|
+
# 填充数据行
|
|
125
|
+
for row in data_rows:
|
|
126
|
+
cells = [cell.strip() for cell in row.split('|') if cell.strip()]
|
|
127
|
+
if cells:
|
|
128
|
+
new_row = table.add_row()
|
|
129
|
+
for i, cell_text in enumerate(cells):
|
|
130
|
+
if i < len(new_row.cells):
|
|
131
|
+
cell = new_row.cells[i]
|
|
132
|
+
cell.text = cell_text
|
|
133
|
+
# 设置对齐方式
|
|
134
|
+
if i < len(alignments):
|
|
135
|
+
for paragraph in cell.paragraphs:
|
|
136
|
+
paragraph.alignment = alignments[i]
|
|
137
|
+
except Exception as e:
|
|
138
|
+
# 如果解析失败,回退到简单处理
|
|
139
|
+
self.doc.add_paragraph('Table: ' + text[:100] + '...')
|
|
97
140
|
return ''
|
|
98
141
|
|
|
99
|
-
def image(self,
|
|
142
|
+
def image(self, text, url=None, title=None, alt=None):
|
|
100
143
|
try:
|
|
101
|
-
#
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
144
|
+
# 尝试处理本地和远程图片
|
|
145
|
+
import os
|
|
146
|
+
import requests
|
|
147
|
+
from io import BytesIO
|
|
148
|
+
|
|
149
|
+
# 使用alt作为替代文本
|
|
150
|
+
if alt is None:
|
|
151
|
+
alt = text
|
|
152
|
+
|
|
153
|
+
# 检查是否有图片URL
|
|
154
|
+
if not url:
|
|
155
|
+
self.doc.add_paragraph(f'Image: {alt}')
|
|
156
|
+
return ''
|
|
157
|
+
|
|
158
|
+
# 检查是否是本地图片
|
|
159
|
+
if os.path.exists(url):
|
|
160
|
+
# 添加本地图片
|
|
161
|
+
self.doc.add_picture(url)
|
|
162
|
+
else:
|
|
163
|
+
# 尝试从网络获取图片
|
|
164
|
+
response = requests.get(url, timeout=5)
|
|
165
|
+
if response.status_code == 200:
|
|
166
|
+
# 添加远程图片
|
|
167
|
+
image_stream = BytesIO(response.content)
|
|
168
|
+
self.doc.add_picture(image_stream)
|
|
169
|
+
else:
|
|
170
|
+
# 如果获取失败,添加图片描述
|
|
171
|
+
self.doc.add_paragraph(f'Image: {alt} ({url})')
|
|
172
|
+
except Exception as e:
|
|
173
|
+
# 如果处理失败,添加图片描述
|
|
174
|
+
self.doc.add_paragraph(f'Image: {alt or text} ({url or ""})')
|
|
105
175
|
return ''
|
|
106
176
|
|
|
107
|
-
def link(self,
|
|
108
|
-
if text:
|
|
177
|
+
def link(self, text, url=None, title=None):
|
|
178
|
+
if text and url:
|
|
179
|
+
# 简化处理,直接添加文本和链接
|
|
109
180
|
p = self.doc.add_paragraph()
|
|
110
181
|
run = p.add_run(text)
|
|
111
|
-
#
|
|
182
|
+
run.font.color.rgb = RGBColor(0, 0, 255) # 蓝色
|
|
183
|
+
run.underline = WD_UNDERLINE.SINGLE
|
|
184
|
+
p.add_run(f' ({url})')
|
|
185
|
+
elif text:
|
|
186
|
+
p = self.doc.add_paragraph(text)
|
|
187
|
+
elif url:
|
|
188
|
+
p = self.doc.add_paragraph(url)
|
|
112
189
|
return ''
|
|
113
190
|
|
|
114
191
|
def emphasis(self, text):
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
192
|
+
# 直接添加斜体文本
|
|
193
|
+
p = self.doc.add_paragraph()
|
|
194
|
+
run = p.add_run(text)
|
|
195
|
+
run.italic = True
|
|
118
196
|
return ''
|
|
119
197
|
|
|
120
198
|
def strong(self, text):
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
199
|
+
# 直接添加粗体文本
|
|
200
|
+
p = self.doc.add_paragraph()
|
|
201
|
+
run = p.add_run(text)
|
|
202
|
+
run.bold = True
|
|
124
203
|
return ''
|
|
125
204
|
|
|
126
205
|
def codespan(self, text):
|
|
@@ -129,16 +208,68 @@ class MDToDOCXConverter(BaseConverter):
|
|
|
129
208
|
run.font.name = 'Courier New'
|
|
130
209
|
return ''
|
|
131
210
|
|
|
132
|
-
def block_code(self, code,
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
211
|
+
def block_code(self, code, info=None):
|
|
212
|
+
# 处理Mermaid图表
|
|
213
|
+
if info == 'mermaid':
|
|
214
|
+
try:
|
|
215
|
+
# 尝试渲染Mermaid图表为图片
|
|
216
|
+
mermaid_image = self._render_mermaid(code)
|
|
217
|
+
if mermaid_image:
|
|
218
|
+
# 添加图片
|
|
219
|
+
self.doc.add_picture(mermaid_image)
|
|
220
|
+
return ''
|
|
221
|
+
else:
|
|
222
|
+
# 如果渲染失败,添加代码块
|
|
223
|
+
p = self.doc.add_paragraph('Mermaid Chart:')
|
|
224
|
+
p = self.doc.add_paragraph(code)
|
|
225
|
+
p.paragraph_format.left_indent = Inches(0.5)
|
|
226
|
+
return ''
|
|
227
|
+
except Exception as e:
|
|
228
|
+
# 如果处理失败,添加代码块
|
|
229
|
+
p = self.doc.add_paragraph('Mermaid Chart:')
|
|
230
|
+
p = self.doc.add_paragraph(code)
|
|
231
|
+
p.paragraph_format.left_indent = Inches(0.5)
|
|
232
|
+
return ''
|
|
233
|
+
else:
|
|
234
|
+
# 处理普通代码块
|
|
235
|
+
p = self.doc.add_paragraph()
|
|
236
|
+
run = p.add_run(code)
|
|
237
|
+
run.font.name = 'Courier New'
|
|
238
|
+
p.paragraph_format.left_indent = Inches(0.5)
|
|
239
|
+
return ''
|
|
240
|
+
|
|
241
|
+
def _render_mermaid(self, mermaid_code):
|
|
242
|
+
"""渲染Mermaid图表为图片"""
|
|
243
|
+
# 使用mermaid.ink API渲染Mermaid图表
|
|
244
|
+
try:
|
|
245
|
+
import requests
|
|
246
|
+
from io import BytesIO
|
|
247
|
+
import urllib.parse
|
|
248
|
+
|
|
249
|
+
# 编码Mermaid代码
|
|
250
|
+
encoded_code = urllib.parse.quote(mermaid_code)
|
|
251
|
+
|
|
252
|
+
# 构建API URL
|
|
253
|
+
url = f"https://mermaid.ink/img/{encoded_code}"
|
|
254
|
+
|
|
255
|
+
# 发送请求
|
|
256
|
+
response = requests.get(url, timeout=10)
|
|
257
|
+
|
|
258
|
+
if response.status_code == 200:
|
|
259
|
+
# 返回图片数据流
|
|
260
|
+
return BytesIO(response.content)
|
|
261
|
+
else:
|
|
262
|
+
# 如果API调用失败,返回None
|
|
263
|
+
return None
|
|
264
|
+
except Exception as e:
|
|
265
|
+
# 如果处理失败,返回None
|
|
266
|
+
print(f"Error rendering Mermaid chart: {e}")
|
|
267
|
+
return None
|
|
138
268
|
|
|
139
269
|
# 渲染Markdown
|
|
140
270
|
renderer = DOCXRenderer(doc)
|
|
141
|
-
markdown(
|
|
271
|
+
markdown = mistune.create_markdown(renderer=renderer)
|
|
272
|
+
markdown(input_content)
|
|
142
273
|
|
|
143
274
|
# 保存为二进制数据
|
|
144
275
|
import io
|
hos_m2f/converters/md_to_epub.py
CHANGED
|
@@ -2,14 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
from typing import Any, Optional, Dict
|
|
4
4
|
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
-
import
|
|
6
|
-
from
|
|
7
|
-
import mistune
|
|
5
|
+
from hos_m2f.renderers.epub_renderer import EPUBRenderer
|
|
6
|
+
from hos_m2f.structure.book_parser import BookParser
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
class MDToEPUBConverter(BaseConverter):
|
|
11
10
|
"""Markdown到EPUB格式转换器"""
|
|
12
11
|
|
|
12
|
+
def __init__(self):
|
|
13
|
+
"""初始化转换器"""
|
|
14
|
+
self.renderer = EPUBRenderer()
|
|
15
|
+
self.book_parser = BookParser()
|
|
16
|
+
|
|
13
17
|
def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
|
|
14
18
|
"""将Markdown转换为EPUB
|
|
15
19
|
|
|
@@ -23,85 +27,41 @@ class MDToEPUBConverter(BaseConverter):
|
|
|
23
27
|
if options is None:
|
|
24
28
|
options = {}
|
|
25
29
|
|
|
26
|
-
#
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# 设置元数据
|
|
30
|
-
book.set_identifier('id12345')
|
|
31
|
-
book.set_title(options.get('title', 'Untitled'))
|
|
32
|
-
book.set_language(options.get('language', 'zh'))
|
|
33
|
-
book.add_author(options.get('author', 'Unknown'))
|
|
34
|
-
|
|
35
|
-
# 添加封面
|
|
36
|
-
if 'cover' in options:
|
|
37
|
-
cover_image = epub.EpubItem(
|
|
38
|
-
uid='cover-image',
|
|
39
|
-
file_name='images/cover.jpg',
|
|
40
|
-
media_type='image/jpeg',
|
|
41
|
-
content=options['cover']
|
|
42
|
-
)
|
|
43
|
-
book.add_item(cover_image)
|
|
44
|
-
book.set_cover('images/cover.jpg', cover_image)
|
|
45
|
-
|
|
46
|
-
# 解析Markdown
|
|
47
|
-
markdown = mistune.create_markdown(
|
|
48
|
-
plugins=[
|
|
49
|
-
'url',
|
|
50
|
-
'abbr',
|
|
51
|
-
'def_list',
|
|
52
|
-
'footnotes',
|
|
53
|
-
'tables',
|
|
54
|
-
'task_lists',
|
|
55
|
-
'strikethrough',
|
|
56
|
-
'highlight',
|
|
57
|
-
'superscript',
|
|
58
|
-
'subscript'
|
|
59
|
-
]
|
|
60
|
-
)
|
|
30
|
+
# 使用BookParser解析Markdown内容
|
|
31
|
+
parsed_content = self.book_parser.parse(input_content, options)
|
|
61
32
|
|
|
62
|
-
#
|
|
63
|
-
|
|
33
|
+
# 增强解析结果
|
|
34
|
+
parsed_content = self._enhance_parsed_content(parsed_content, options)
|
|
64
35
|
|
|
65
|
-
#
|
|
66
|
-
|
|
67
|
-
title=options.get('title', 'Chapter 1'),
|
|
68
|
-
file_name='chapter1.xhtml',
|
|
69
|
-
lang='zh'
|
|
70
|
-
)
|
|
71
|
-
chapter.content = f'''
|
|
72
|
-
<!DOCTYPE html>
|
|
73
|
-
<html>
|
|
74
|
-
<head>
|
|
75
|
-
<title>{options.get('title', 'Untitled')}</title>
|
|
76
|
-
<meta charset="utf-8" />
|
|
77
|
-
</head>
|
|
78
|
-
<body>
|
|
79
|
-
<h1>{options.get('title', 'Untitled')}</h1>
|
|
80
|
-
{html_content}
|
|
81
|
-
</body>
|
|
82
|
-
</html>
|
|
83
|
-
'''
|
|
36
|
+
# 使用EPUBRenderer渲染EPUB文件
|
|
37
|
+
epub_content = self.renderer.render(parsed_content, options)
|
|
84
38
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
39
|
+
return epub_content
|
|
40
|
+
|
|
41
|
+
def _enhance_parsed_content(self, parsed_content: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
|
|
42
|
+
"""增强解析结果"""
|
|
43
|
+
# 添加选项中的元数据
|
|
44
|
+
if 'title' in options:
|
|
45
|
+
parsed_content.setdefault('book_metadata', {})['title'] = options['title']
|
|
46
|
+
if 'author' in options:
|
|
47
|
+
parsed_content.setdefault('book_metadata', {})['author'] = options['author']
|
|
48
|
+
if 'language' in options:
|
|
49
|
+
parsed_content.setdefault('book_metadata', {})['language'] = options['language']
|
|
50
|
+
if 'publisher' in options:
|
|
51
|
+
parsed_content.setdefault('book_metadata', {})['publisher'] = options['publisher']
|
|
52
|
+
if 'publish_date' in options:
|
|
53
|
+
parsed_content.setdefault('book_metadata', {})['publish_date'] = options['publish_date']
|
|
54
|
+
if 'description' in options:
|
|
55
|
+
parsed_content.setdefault('book_metadata', {})['description'] = options['description']
|
|
97
56
|
|
|
98
|
-
#
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
57
|
+
# 添加封面信息
|
|
58
|
+
if 'cover' in options:
|
|
59
|
+
parsed_content['cover'] = {
|
|
60
|
+
'src': options['cover'],
|
|
61
|
+
'type': 'image'
|
|
62
|
+
}
|
|
103
63
|
|
|
104
|
-
return
|
|
64
|
+
return parsed_content
|
|
105
65
|
|
|
106
66
|
def get_supported_formats(self) -> tuple:
|
|
107
67
|
"""获取支持的格式"""
|
hos_m2f/converters/md_to_html.py
CHANGED
|
@@ -22,20 +22,8 @@ class MDToHTMLConverter(BaseConverter):
|
|
|
22
22
|
options = {}
|
|
23
23
|
|
|
24
24
|
# 解析Markdown
|
|
25
|
-
markdown = mistune.create_markdown(
|
|
26
|
-
|
|
27
|
-
'url',
|
|
28
|
-
'abbr',
|
|
29
|
-
'def_list',
|
|
30
|
-
'footnotes',
|
|
31
|
-
'tables',
|
|
32
|
-
'task_lists',
|
|
33
|
-
'strikethrough',
|
|
34
|
-
'highlight',
|
|
35
|
-
'superscript',
|
|
36
|
-
'subscript'
|
|
37
|
-
]
|
|
38
|
-
)
|
|
25
|
+
markdown = mistune.create_markdown()
|
|
26
|
+
|
|
39
27
|
|
|
40
28
|
# 转换为HTML
|
|
41
29
|
html_content = markdown(input_content)
|
hos_m2f/converters/md_to_json.py
CHANGED
|
@@ -193,19 +193,30 @@ class MDToJSONConverter(BaseConverter):
|
|
|
193
193
|
language = line[3:].strip()
|
|
194
194
|
|
|
195
195
|
# 读取代码内容
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
196
|
+
try:
|
|
197
|
+
line_idx = lines.index(line)
|
|
198
|
+
code_end_idx = line_idx + 1
|
|
199
|
+
for i, code_line in enumerate(lines[line_idx+1:]):
|
|
200
|
+
if code_line.startswith('```'):
|
|
201
|
+
code_end_idx = line_idx + i + 1
|
|
202
|
+
break
|
|
203
|
+
code_lines.append(code_line)
|
|
204
|
+
code_end_idx = line_idx + i + 1
|
|
205
|
+
|
|
206
|
+
structure['children'].append({
|
|
207
|
+
'type': 'code_block',
|
|
208
|
+
'language': language,
|
|
209
|
+
'content': '\n'.join(code_lines)
|
|
210
|
+
})
|
|
211
|
+
|
|
212
|
+
# 跳过已处理的代码行
|
|
213
|
+
if code_end_idx < len(lines):
|
|
214
|
+
lines = lines[:line_idx] + lines[code_end_idx+1:]
|
|
215
|
+
else:
|
|
216
|
+
lines = lines[:line_idx]
|
|
217
|
+
except ValueError:
|
|
218
|
+
# 如果找不到行,跳过代码块解析
|
|
219
|
+
continue
|
|
209
220
|
|
|
210
221
|
# 处理表格
|
|
211
222
|
elif line.startswith('|') and '|' in line[1:]:
|
|
@@ -231,19 +242,28 @@ class MDToJSONConverter(BaseConverter):
|
|
|
231
242
|
table_lines = [line]
|
|
232
243
|
|
|
233
244
|
# 读取表格内容
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
245
|
+
try:
|
|
246
|
+
line_idx = lines.index(line)
|
|
247
|
+
for i, table_line in enumerate(lines[line_idx+1:]):
|
|
248
|
+
if table_line.startswith('|'):
|
|
249
|
+
table_lines.append(table_line)
|
|
250
|
+
else:
|
|
251
|
+
break
|
|
252
|
+
except ValueError:
|
|
253
|
+
# 如果找不到行,跳过表格解析
|
|
254
|
+
continue
|
|
239
255
|
|
|
240
256
|
# 解析表格结构
|
|
241
257
|
if len(table_lines) >= 2:
|
|
242
258
|
headers = [h.strip() for h in table_lines[0].split('|') if h.strip()]
|
|
243
259
|
rows = []
|
|
244
260
|
|
|
245
|
-
#
|
|
246
|
-
|
|
261
|
+
# 跳过分隔线(如果存在)
|
|
262
|
+
start_idx = 1
|
|
263
|
+
if len(table_lines) > 1 and any('---' in cell for cell in table_lines[1].split('|')):
|
|
264
|
+
start_idx = 2
|
|
265
|
+
|
|
266
|
+
for table_line in table_lines[start_idx:]:
|
|
247
267
|
cells = [c.strip() for c in table_line.split('|') if c.strip()]
|
|
248
268
|
if cells:
|
|
249
269
|
rows.append(dict(zip(headers, cells)))
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Markdown到LaTeX格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
from hos_m2f.renderers.latex_renderer import LaTeXRenderer
|
|
6
|
+
from hos_m2f.structure.semantic_parser import SemanticParser
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MDToLaTeXConverter(BaseConverter):
|
|
10
|
+
"""Markdown到LaTeX格式转换器"""
|
|
11
|
+
|
|
12
|
+
def __init__(self):
|
|
13
|
+
"""初始化转换器"""
|
|
14
|
+
self.renderer = LaTeXRenderer()
|
|
15
|
+
self.parser = SemanticParser()
|
|
16
|
+
|
|
17
|
+
def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
|
|
18
|
+
"""将Markdown转换为LaTeX
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
input_content: Markdown内容
|
|
22
|
+
options: 转换选项
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
bytes: LaTeX文件的二进制数据
|
|
26
|
+
"""
|
|
27
|
+
if options is None:
|
|
28
|
+
options = {}
|
|
29
|
+
|
|
30
|
+
# 使用SemanticParser解析Markdown内容
|
|
31
|
+
parsed_content = self.parser.parse(input_content)
|
|
32
|
+
|
|
33
|
+
# 增强解析结果
|
|
34
|
+
parsed_content = self._enhance_parsed_content(parsed_content, options)
|
|
35
|
+
|
|
36
|
+
# 使用LaTeXRenderer渲染LaTeX文件
|
|
37
|
+
latex_content = self.renderer.render(parsed_content, options)
|
|
38
|
+
|
|
39
|
+
return latex_content
|
|
40
|
+
|
|
41
|
+
def _enhance_parsed_content(self, parsed_content: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
|
|
42
|
+
"""增强解析结果"""
|
|
43
|
+
# 添加选项中的元数据
|
|
44
|
+
if 'title' in options:
|
|
45
|
+
parsed_content.setdefault('metadata', {})['title'] = options['title']
|
|
46
|
+
if 'author' in options:
|
|
47
|
+
parsed_content.setdefault('metadata', {})['author'] = options['author']
|
|
48
|
+
if 'date' in options:
|
|
49
|
+
parsed_content.setdefault('metadata', {})['date'] = options['date']
|
|
50
|
+
if 'abstract' in options:
|
|
51
|
+
parsed_content.setdefault('metadata', {})['abstract'] = options['abstract']
|
|
52
|
+
if 'keywords' in options:
|
|
53
|
+
parsed_content.setdefault('metadata', {})['keywords'] = options['keywords']
|
|
54
|
+
|
|
55
|
+
# 添加文档类型
|
|
56
|
+
if 'document_class' in options:
|
|
57
|
+
parsed_content['document_class'] = options['document_class']
|
|
58
|
+
|
|
59
|
+
return parsed_content
|
|
60
|
+
|
|
61
|
+
def get_supported_formats(self) -> tuple:
|
|
62
|
+
"""获取支持的格式"""
|
|
63
|
+
return ('markdown', 'latex')
|
hos_m2f/converters/md_to_xml.py
CHANGED
|
@@ -200,19 +200,30 @@ class MDToXMLConverter(BaseConverter):
|
|
|
200
200
|
language = line[3:].strip()
|
|
201
201
|
|
|
202
202
|
# 读取代码内容
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
203
|
+
try:
|
|
204
|
+
line_idx = lines.index(line)
|
|
205
|
+
code_end_idx = line_idx + 1
|
|
206
|
+
for i, code_line in enumerate(lines[line_idx+1:]):
|
|
207
|
+
if code_line.startswith('```'):
|
|
208
|
+
code_end_idx = line_idx + i + 1
|
|
209
|
+
break
|
|
210
|
+
code_lines.append(code_line)
|
|
211
|
+
code_end_idx = line_idx + i + 1
|
|
212
|
+
|
|
213
|
+
structure['children'].append({
|
|
214
|
+
'type': 'code_block',
|
|
215
|
+
'language': language,
|
|
216
|
+
'content': '\n'.join(code_lines)
|
|
217
|
+
})
|
|
218
|
+
|
|
219
|
+
# 跳过已处理的代码行
|
|
220
|
+
if code_end_idx < len(lines):
|
|
221
|
+
lines = lines[:line_idx] + lines[code_end_idx+1:]
|
|
222
|
+
else:
|
|
223
|
+
lines = lines[:line_idx]
|
|
224
|
+
except ValueError:
|
|
225
|
+
# 如果找不到行,跳过代码块解析
|
|
226
|
+
continue
|
|
216
227
|
|
|
217
228
|
# 处理表格
|
|
218
229
|
elif line.startswith('|') and '|' in line[1:]:
|
|
@@ -238,19 +249,28 @@ class MDToXMLConverter(BaseConverter):
|
|
|
238
249
|
table_lines = [line]
|
|
239
250
|
|
|
240
251
|
# 读取表格内容
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
252
|
+
try:
|
|
253
|
+
line_idx = lines.index(line)
|
|
254
|
+
for i, table_line in enumerate(lines[line_idx+1:]):
|
|
255
|
+
if table_line.startswith('|'):
|
|
256
|
+
table_lines.append(table_line)
|
|
257
|
+
else:
|
|
258
|
+
break
|
|
259
|
+
except ValueError:
|
|
260
|
+
# 如果找不到行,跳过表格解析
|
|
261
|
+
continue
|
|
246
262
|
|
|
247
263
|
# 解析表格结构
|
|
248
264
|
if len(table_lines) >= 2:
|
|
249
265
|
headers = [h.strip() for h in table_lines[0].split('|') if h.strip()]
|
|
250
266
|
rows = []
|
|
251
267
|
|
|
252
|
-
#
|
|
253
|
-
|
|
268
|
+
# 跳过分隔线(如果存在)
|
|
269
|
+
start_idx = 1
|
|
270
|
+
if len(table_lines) > 1 and any('---' in cell for cell in table_lines[1].split('|')):
|
|
271
|
+
start_idx = 2
|
|
272
|
+
|
|
273
|
+
for table_line in table_lines[start_idx:]:
|
|
254
274
|
cells = [c.strip() for c in table_line.split('|') if c.strip()]
|
|
255
275
|
if cells:
|
|
256
276
|
rows.append(dict(zip(headers, cells)))
|