hos-m2f 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hos_m2f/cli/__init__.py CHANGED
@@ -1,5 +1,5 @@
1
1
  """CLI模块"""
2
2
 
3
- from hos_m2f.cli.cli import CLI
3
+ from .cli import CLI
4
4
 
5
5
  __all__ = ['CLI']
@@ -2,8 +2,8 @@
2
2
 
3
3
  from typing import Any, Optional, Dict
4
4
  from docx import Document
5
- from docx.shared import Inches, Pt
6
- from docx.enum.text import WD_ALIGN_PARAGRAPH
5
+ from docx.shared import Inches, Pt, RGBColor
6
+ from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE
7
7
  from docx.enum.style import WD_STYLE_TYPE
8
8
  from hos_m2f.converters.base_converter import BaseConverter
9
9
  import mistune
@@ -31,22 +31,6 @@ class MDToDOCXConverter(BaseConverter):
31
31
  # 设置默认样式
32
32
  self._setup_styles(doc)
33
33
 
34
- # 解析Markdown
35
- markdown = mistune.create_markdown(
36
- plugins=[
37
- 'url',
38
- 'abbr',
39
- 'def_list',
40
- 'footnotes',
41
- 'tables',
42
- 'task_lists',
43
- 'strikethrough',
44
- 'highlight',
45
- 'superscript',
46
- 'subscript'
47
- ]
48
- )
49
-
50
34
  # 自定义渲染器
51
35
  class DOCXRenderer(mistune.HTMLRenderer):
52
36
  def __init__(self, doc):
@@ -92,35 +76,130 @@ class MDToDOCXConverter(BaseConverter):
92
76
  return ''
93
77
 
94
78
  def table(self, text):
95
- # 简化处理,实际项目中需要更复杂的表格解析
96
- self.doc.add_paragraph('Table: ' + text[:100] + '...')
79
+ # 解析Markdown表格并转换为DOCX表格
80
+ try:
81
+ # 分割表格行
82
+ rows = text.strip().split('\n')
83
+ if not rows:
84
+ return ''
85
+
86
+ # 解析表头
87
+ header_cells = [cell.strip() for cell in rows[0].split('|') if cell.strip()]
88
+ if not header_cells:
89
+ return ''
90
+
91
+ # 创建表格
92
+ table = self.doc.add_table(rows=1, cols=len(header_cells))
93
+ table.style = 'Table Grid'
94
+
95
+ # 填充表头
96
+ header_row = table.rows[0]
97
+ for i, cell_text in enumerate(header_cells):
98
+ cell = header_row.cells[i]
99
+ cell.text = cell_text
100
+ # 设置表头样式
101
+ for paragraph in cell.paragraphs:
102
+ for run in paragraph.runs:
103
+ run.bold = True
104
+ run.font.size = Pt(11)
105
+ paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
106
+
107
+ # 跳过分隔线行,并解析对齐方式
108
+ alignments = []
109
+ if len(rows) > 1 and '---' in rows[1]:
110
+ # 解析对齐方式
111
+ alignment_row = rows[1]
112
+ alignment_cells = [cell.strip() for cell in alignment_row.split('|') if cell.strip()]
113
+ for cell in alignment_cells:
114
+ if cell.startswith(':') and cell.endswith(':'):
115
+ alignments.append(WD_ALIGN_PARAGRAPH.CENTER)
116
+ elif cell.endswith(':'):
117
+ alignments.append(WD_ALIGN_PARAGRAPH.RIGHT)
118
+ else:
119
+ alignments.append(WD_ALIGN_PARAGRAPH.LEFT)
120
+ data_rows = rows[2:]
121
+ else:
122
+ data_rows = rows[1:]
123
+
124
+ # 填充数据行
125
+ for row in data_rows:
126
+ cells = [cell.strip() for cell in row.split('|') if cell.strip()]
127
+ if cells:
128
+ new_row = table.add_row()
129
+ for i, cell_text in enumerate(cells):
130
+ if i < len(new_row.cells):
131
+ cell = new_row.cells[i]
132
+ cell.text = cell_text
133
+ # 设置对齐方式
134
+ if i < len(alignments):
135
+ for paragraph in cell.paragraphs:
136
+ paragraph.alignment = alignments[i]
137
+ except Exception as e:
138
+ # 如果解析失败,回退到简单处理
139
+ self.doc.add_paragraph('Table: ' + text[:100] + '...')
97
140
  return ''
98
141
 
99
- def image(self, src, alt='', title=None):
142
+ def image(self, text, url=None, title=None, alt=None):
100
143
  try:
101
- # 简化处理,实际项目中需要处理本地和远程图片
102
- self.doc.add_paragraph(f'Image: {alt} ({src})')
103
- except Exception:
104
- pass
144
+ # 尝试处理本地和远程图片
145
+ import os
146
+ import requests
147
+ from io import BytesIO
148
+
149
+ # 使用alt作为替代文本
150
+ if alt is None:
151
+ alt = text
152
+
153
+ # 检查是否有图片URL
154
+ if not url:
155
+ self.doc.add_paragraph(f'Image: {alt}')
156
+ return ''
157
+
158
+ # 检查是否是本地图片
159
+ if os.path.exists(url):
160
+ # 添加本地图片
161
+ self.doc.add_picture(url)
162
+ else:
163
+ # 尝试从网络获取图片
164
+ response = requests.get(url, timeout=5)
165
+ if response.status_code == 200:
166
+ # 添加远程图片
167
+ image_stream = BytesIO(response.content)
168
+ self.doc.add_picture(image_stream)
169
+ else:
170
+ # 如果获取失败,添加图片描述
171
+ self.doc.add_paragraph(f'Image: {alt} ({url})')
172
+ except Exception as e:
173
+ # 如果处理失败,添加图片描述
174
+ self.doc.add_paragraph(f'Image: {alt or text} ({url or ""})')
105
175
  return ''
106
176
 
107
- def link(self, link, text=None, title=None):
108
- if text:
177
+ def link(self, text, url=None, title=None):
178
+ if text and url:
179
+ # 简化处理,直接添加文本和链接
109
180
  p = self.doc.add_paragraph()
110
181
  run = p.add_run(text)
111
- # 实际项目中需要添加超链接
182
+ run.font.color.rgb = RGBColor(0, 0, 255) # 蓝色
183
+ run.underline = WD_UNDERLINE.SINGLE
184
+ p.add_run(f' ({url})')
185
+ elif text:
186
+ p = self.doc.add_paragraph(text)
187
+ elif url:
188
+ p = self.doc.add_paragraph(url)
112
189
  return ''
113
190
 
114
191
  def emphasis(self, text):
115
- if self.current_paragraph:
116
- run = self.current_paragraph.add_run(text)
117
- run.italic = True
192
+ # 直接添加斜体文本
193
+ p = self.doc.add_paragraph()
194
+ run = p.add_run(text)
195
+ run.italic = True
118
196
  return ''
119
197
 
120
198
  def strong(self, text):
121
- if self.current_paragraph:
122
- run = self.current_paragraph.add_run(text)
123
- run.bold = True
199
+ # 直接添加粗体文本
200
+ p = self.doc.add_paragraph()
201
+ run = p.add_run(text)
202
+ run.bold = True
124
203
  return ''
125
204
 
126
205
  def codespan(self, text):
@@ -129,16 +208,68 @@ class MDToDOCXConverter(BaseConverter):
129
208
  run.font.name = 'Courier New'
130
209
  return ''
131
210
 
132
- def block_code(self, code, lang=None):
133
- p = self.doc.add_paragraph()
134
- run = p.add_run(code)
135
- run.font.name = 'Courier New'
136
- p.paragraph_format.left_indent = Inches(0.5)
137
- return ''
211
+ def block_code(self, code, info=None):
212
+ # 处理Mermaid图表
213
+ if info == 'mermaid':
214
+ try:
215
+ # 尝试渲染Mermaid图表为图片
216
+ mermaid_image = self._render_mermaid(code)
217
+ if mermaid_image:
218
+ # 添加图片
219
+ self.doc.add_picture(mermaid_image)
220
+ return ''
221
+ else:
222
+ # 如果渲染失败,添加代码块
223
+ p = self.doc.add_paragraph('Mermaid Chart:')
224
+ p = self.doc.add_paragraph(code)
225
+ p.paragraph_format.left_indent = Inches(0.5)
226
+ return ''
227
+ except Exception as e:
228
+ # 如果处理失败,添加代码块
229
+ p = self.doc.add_paragraph('Mermaid Chart:')
230
+ p = self.doc.add_paragraph(code)
231
+ p.paragraph_format.left_indent = Inches(0.5)
232
+ return ''
233
+ else:
234
+ # 处理普通代码块
235
+ p = self.doc.add_paragraph()
236
+ run = p.add_run(code)
237
+ run.font.name = 'Courier New'
238
+ p.paragraph_format.left_indent = Inches(0.5)
239
+ return ''
240
+
241
+ def _render_mermaid(self, mermaid_code):
242
+ """渲染Mermaid图表为图片"""
243
+ # 使用mermaid.ink API渲染Mermaid图表
244
+ try:
245
+ import requests
246
+ from io import BytesIO
247
+ import urllib.parse
248
+
249
+ # 编码Mermaid代码
250
+ encoded_code = urllib.parse.quote(mermaid_code)
251
+
252
+ # 构建API URL
253
+ url = f"https://mermaid.ink/img/{encoded_code}"
254
+
255
+ # 发送请求
256
+ response = requests.get(url, timeout=10)
257
+
258
+ if response.status_code == 200:
259
+ # 返回图片数据流
260
+ return BytesIO(response.content)
261
+ else:
262
+ # 如果API调用失败,返回None
263
+ return None
264
+ except Exception as e:
265
+ # 如果处理失败,返回None
266
+ print(f"Error rendering Mermaid chart: {e}")
267
+ return None
138
268
 
139
269
  # 渲染Markdown
140
270
  renderer = DOCXRenderer(doc)
141
- markdown(input_content, renderer)
271
+ markdown = mistune.create_markdown(renderer=renderer)
272
+ markdown(input_content)
142
273
 
143
274
  # 保存为二进制数据
144
275
  import io
@@ -2,14 +2,18 @@
2
2
 
3
3
  from typing import Any, Optional, Dict
4
4
  from hos_m2f.converters.base_converter import BaseConverter
5
- import ebooklib
6
- from ebooklib import epub
7
- import mistune
5
+ from hos_m2f.renderers.epub_renderer import EPUBRenderer
6
+ from hos_m2f.structure.book_parser import BookParser
8
7
 
9
8
 
10
9
  class MDToEPUBConverter(BaseConverter):
11
10
  """Markdown到EPUB格式转换器"""
12
11
 
12
+ def __init__(self):
13
+ """初始化转换器"""
14
+ self.renderer = EPUBRenderer()
15
+ self.book_parser = BookParser()
16
+
13
17
  def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
14
18
  """将Markdown转换为EPUB
15
19
 
@@ -23,85 +27,41 @@ class MDToEPUBConverter(BaseConverter):
23
27
  if options is None:
24
28
  options = {}
25
29
 
26
- # 创建EPUB书籍
27
- book = epub.EpubBook()
28
-
29
- # 设置元数据
30
- book.set_identifier('id12345')
31
- book.set_title(options.get('title', 'Untitled'))
32
- book.set_language(options.get('language', 'zh'))
33
- book.add_author(options.get('author', 'Unknown'))
34
-
35
- # 添加封面
36
- if 'cover' in options:
37
- cover_image = epub.EpubItem(
38
- uid='cover-image',
39
- file_name='images/cover.jpg',
40
- media_type='image/jpeg',
41
- content=options['cover']
42
- )
43
- book.add_item(cover_image)
44
- book.set_cover('images/cover.jpg', cover_image)
45
-
46
- # 解析Markdown
47
- markdown = mistune.create_markdown(
48
- plugins=[
49
- 'url',
50
- 'abbr',
51
- 'def_list',
52
- 'footnotes',
53
- 'tables',
54
- 'task_lists',
55
- 'strikethrough',
56
- 'highlight',
57
- 'superscript',
58
- 'subscript'
59
- ]
60
- )
30
+ # 使用BookParser解析Markdown内容
31
+ parsed_content = self.book_parser.parse(input_content, options)
61
32
 
62
- # 转换为HTML
63
- html_content = markdown(input_content)
33
+ # 增强解析结果
34
+ parsed_content = self._enhance_parsed_content(parsed_content, options)
64
35
 
65
- # 创建章节
66
- chapter = epub.EpubHtml(
67
- title=options.get('title', 'Chapter 1'),
68
- file_name='chapter1.xhtml',
69
- lang='zh'
70
- )
71
- chapter.content = f'''
72
- <!DOCTYPE html>
73
- <html>
74
- <head>
75
- <title>{options.get('title', 'Untitled')}</title>
76
- <meta charset="utf-8" />
77
- </head>
78
- <body>
79
- <h1>{options.get('title', 'Untitled')}</h1>
80
- {html_content}
81
- </body>
82
- </html>
83
- '''
36
+ # 使用EPUBRenderer渲染EPUB文件
37
+ epub_content = self.renderer.render(parsed_content, options)
84
38
 
85
- # 添加章节
86
- book.add_item(chapter)
87
-
88
- # 创建目录
89
- book.toc = [chapter]
90
-
91
- # 添加导航文件
92
- book.add_item(epub.EpubNcx())
93
- book.add_item(epub.EpubNav())
94
-
95
- # 定义spine
96
- book.spine = ['nav', chapter]
39
+ return epub_content
40
+
41
+ def _enhance_parsed_content(self, parsed_content: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
42
+ """增强解析结果"""
43
+ # 添加选项中的元数据
44
+ if 'title' in options:
45
+ parsed_content.setdefault('book_metadata', {})['title'] = options['title']
46
+ if 'author' in options:
47
+ parsed_content.setdefault('book_metadata', {})['author'] = options['author']
48
+ if 'language' in options:
49
+ parsed_content.setdefault('book_metadata', {})['language'] = options['language']
50
+ if 'publisher' in options:
51
+ parsed_content.setdefault('book_metadata', {})['publisher'] = options['publisher']
52
+ if 'publish_date' in options:
53
+ parsed_content.setdefault('book_metadata', {})['publish_date'] = options['publish_date']
54
+ if 'description' in options:
55
+ parsed_content.setdefault('book_metadata', {})['description'] = options['description']
97
56
 
98
- # 保存为二进制数据
99
- import io
100
- output = io.BytesIO()
101
- epub.write_epub(output, book, {})
102
- output.seek(0)
57
+ # 添加封面信息
58
+ if 'cover' in options:
59
+ parsed_content['cover'] = {
60
+ 'src': options['cover'],
61
+ 'type': 'image'
62
+ }
103
63
 
104
- return output.getvalue()
64
+ return parsed_content
105
65
 
106
66
  def get_supported_formats(self) -> tuple:
107
67
  """获取支持的格式"""
@@ -22,20 +22,8 @@ class MDToHTMLConverter(BaseConverter):
22
22
  options = {}
23
23
 
24
24
  # 解析Markdown
25
- markdown = mistune.create_markdown(
26
- plugins=[
27
- 'url',
28
- 'abbr',
29
- 'def_list',
30
- 'footnotes',
31
- 'tables',
32
- 'task_lists',
33
- 'strikethrough',
34
- 'highlight',
35
- 'superscript',
36
- 'subscript'
37
- ]
38
- )
25
+ markdown = mistune.create_markdown()
26
+
39
27
 
40
28
  # 转换为HTML
41
29
  html_content = markdown(input_content)
@@ -193,19 +193,30 @@ class MDToJSONConverter(BaseConverter):
193
193
  language = line[3:].strip()
194
194
 
195
195
  # 读取代码内容
196
- for i, code_line in enumerate(lines[lines.index(line)+1:]):
197
- if code_line.startswith('```'):
198
- break
199
- code_lines.append(code_line)
200
-
201
- structure['children'].append({
202
- 'type': 'code_block',
203
- 'language': language,
204
- 'content': '\n'.join(code_lines)
205
- })
206
-
207
- # 跳过已处理的代码行
208
- lines = lines[:lines.index(line)] + lines[lines.index(line)+i+2:]
196
+ try:
197
+ line_idx = lines.index(line)
198
+ code_end_idx = line_idx + 1
199
+ for i, code_line in enumerate(lines[line_idx+1:]):
200
+ if code_line.startswith('```'):
201
+ code_end_idx = line_idx + i + 1
202
+ break
203
+ code_lines.append(code_line)
204
+ code_end_idx = line_idx + i + 1
205
+
206
+ structure['children'].append({
207
+ 'type': 'code_block',
208
+ 'language': language,
209
+ 'content': '\n'.join(code_lines)
210
+ })
211
+
212
+ # 跳过已处理的代码行
213
+ if code_end_idx < len(lines):
214
+ lines = lines[:line_idx] + lines[code_end_idx+1:]
215
+ else:
216
+ lines = lines[:line_idx]
217
+ except ValueError:
218
+ # 如果找不到行,跳过代码块解析
219
+ continue
209
220
 
210
221
  # 处理表格
211
222
  elif line.startswith('|') and '|' in line[1:]:
@@ -231,19 +242,28 @@ class MDToJSONConverter(BaseConverter):
231
242
  table_lines = [line]
232
243
 
233
244
  # 读取表格内容
234
- for i, table_line in enumerate(lines[lines.index(line)+1:]):
235
- if table_line.startswith('|'):
236
- table_lines.append(table_line)
237
- else:
238
- break
245
+ try:
246
+ line_idx = lines.index(line)
247
+ for i, table_line in enumerate(lines[line_idx+1:]):
248
+ if table_line.startswith('|'):
249
+ table_lines.append(table_line)
250
+ else:
251
+ break
252
+ except ValueError:
253
+ # 如果找不到行,跳过表格解析
254
+ continue
239
255
 
240
256
  # 解析表格结构
241
257
  if len(table_lines) >= 2:
242
258
  headers = [h.strip() for h in table_lines[0].split('|') if h.strip()]
243
259
  rows = []
244
260
 
245
- # 跳过分隔线
246
- for table_line in table_lines[2:]:
261
+ # 跳过分隔线(如果存在)
262
+ start_idx = 1
263
+ if len(table_lines) > 1 and any('---' in cell for cell in table_lines[1].split('|')):
264
+ start_idx = 2
265
+
266
+ for table_line in table_lines[start_idx:]:
247
267
  cells = [c.strip() for c in table_line.split('|') if c.strip()]
248
268
  if cells:
249
269
  rows.append(dict(zip(headers, cells)))
@@ -0,0 +1,63 @@
1
+ """Markdown到LaTeX格式转换器"""
2
+
3
+ from typing import Any, Optional, Dict
4
+ from hos_m2f.converters.base_converter import BaseConverter
5
+ from hos_m2f.renderers.latex_renderer import LaTeXRenderer
6
+ from hos_m2f.structure.semantic_parser import SemanticParser
7
+
8
+
9
+ class MDToLaTeXConverter(BaseConverter):
10
+ """Markdown到LaTeX格式转换器"""
11
+
12
+ def __init__(self):
13
+ """初始化转换器"""
14
+ self.renderer = LaTeXRenderer()
15
+ self.parser = SemanticParser()
16
+
17
+ def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
18
+ """将Markdown转换为LaTeX
19
+
20
+ Args:
21
+ input_content: Markdown内容
22
+ options: 转换选项
23
+
24
+ Returns:
25
+ bytes: LaTeX文件的二进制数据
26
+ """
27
+ if options is None:
28
+ options = {}
29
+
30
+ # 使用SemanticParser解析Markdown内容
31
+ parsed_content = self.parser.parse(input_content)
32
+
33
+ # 增强解析结果
34
+ parsed_content = self._enhance_parsed_content(parsed_content, options)
35
+
36
+ # 使用LaTeXRenderer渲染LaTeX文件
37
+ latex_content = self.renderer.render(parsed_content, options)
38
+
39
+ return latex_content
40
+
41
+ def _enhance_parsed_content(self, parsed_content: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
42
+ """增强解析结果"""
43
+ # 添加选项中的元数据
44
+ if 'title' in options:
45
+ parsed_content.setdefault('metadata', {})['title'] = options['title']
46
+ if 'author' in options:
47
+ parsed_content.setdefault('metadata', {})['author'] = options['author']
48
+ if 'date' in options:
49
+ parsed_content.setdefault('metadata', {})['date'] = options['date']
50
+ if 'abstract' in options:
51
+ parsed_content.setdefault('metadata', {})['abstract'] = options['abstract']
52
+ if 'keywords' in options:
53
+ parsed_content.setdefault('metadata', {})['keywords'] = options['keywords']
54
+
55
+ # 添加文档类型
56
+ if 'document_class' in options:
57
+ parsed_content['document_class'] = options['document_class']
58
+
59
+ return parsed_content
60
+
61
+ def get_supported_formats(self) -> tuple:
62
+ """获取支持的格式"""
63
+ return ('markdown', 'latex')
@@ -200,19 +200,30 @@ class MDToXMLConverter(BaseConverter):
200
200
  language = line[3:].strip()
201
201
 
202
202
  # 读取代码内容
203
- for i, code_line in enumerate(lines[lines.index(line)+1:]):
204
- if code_line.startswith('```'):
205
- break
206
- code_lines.append(code_line)
207
-
208
- structure['children'].append({
209
- 'type': 'code_block',
210
- 'language': language,
211
- 'content': '\n'.join(code_lines)
212
- })
213
-
214
- # 跳过已处理的代码行
215
- lines = lines[:lines.index(line)] + lines[lines.index(line)+i+2:]
203
+ try:
204
+ line_idx = lines.index(line)
205
+ code_end_idx = line_idx + 1
206
+ for i, code_line in enumerate(lines[line_idx+1:]):
207
+ if code_line.startswith('```'):
208
+ code_end_idx = line_idx + i + 1
209
+ break
210
+ code_lines.append(code_line)
211
+ code_end_idx = line_idx + i + 1
212
+
213
+ structure['children'].append({
214
+ 'type': 'code_block',
215
+ 'language': language,
216
+ 'content': '\n'.join(code_lines)
217
+ })
218
+
219
+ # 跳过已处理的代码行
220
+ if code_end_idx < len(lines):
221
+ lines = lines[:line_idx] + lines[code_end_idx+1:]
222
+ else:
223
+ lines = lines[:line_idx]
224
+ except ValueError:
225
+ # 如果找不到行,跳过代码块解析
226
+ continue
216
227
 
217
228
  # 处理表格
218
229
  elif line.startswith('|') and '|' in line[1:]:
@@ -238,19 +249,28 @@ class MDToXMLConverter(BaseConverter):
238
249
  table_lines = [line]
239
250
 
240
251
  # 读取表格内容
241
- for i, table_line in enumerate(lines[lines.index(line)+1:]):
242
- if table_line.startswith('|'):
243
- table_lines.append(table_line)
244
- else:
245
- break
252
+ try:
253
+ line_idx = lines.index(line)
254
+ for i, table_line in enumerate(lines[line_idx+1:]):
255
+ if table_line.startswith('|'):
256
+ table_lines.append(table_line)
257
+ else:
258
+ break
259
+ except ValueError:
260
+ # 如果找不到行,跳过表格解析
261
+ continue
246
262
 
247
263
  # 解析表格结构
248
264
  if len(table_lines) >= 2:
249
265
  headers = [h.strip() for h in table_lines[0].split('|') if h.strip()]
250
266
  rows = []
251
267
 
252
- # 跳过分隔线
253
- for table_line in table_lines[2:]:
268
+ # 跳过分隔线(如果存在)
269
+ start_idx = 1
270
+ if len(table_lines) > 1 and any('---' in cell for cell in table_lines[1].split('|')):
271
+ start_idx = 2
272
+
273
+ for table_line in table_lines[start_idx:]:
254
274
  cells = [c.strip() for c in table_line.split('|') if c.strip()]
255
275
  if cells:
256
276
  rows.append(dict(zip(headers, cells)))