hos-m2f 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hos_m2f/cli/cli.py +34 -7
- hos_m2f/converters/__init__.py +27 -0
- hos_m2f/converters/base_converter.py +30 -0
- hos_m2f/converters/docx_to_md.py +89 -0
- hos_m2f/converters/epub_to_md.py +118 -0
- hos_m2f/converters/html_to_md.py +132 -0
- hos_m2f/converters/json_to_md.py +97 -0
- hos_m2f/converters/md_to_docx.py +171 -0
- hos_m2f/converters/md_to_epub.py +108 -0
- hos_m2f/converters/md_to_html.py +100 -0
- hos_m2f/converters/md_to_json.py +284 -0
- hos_m2f/converters/md_to_xml.py +362 -0
- hos_m2f/converters/xml_to_md.py +109 -0
- {hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/METADATA +1 -1
- hos_m2f-0.5.2.dist-info/RECORD +20 -0
- hos_m2f-0.5.1.dist-info/RECORD +0 -8
- {hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/WHEEL +0 -0
- {hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/entry_points.txt +0 -0
- {hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Markdown到DOCX格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from docx import Document
|
|
5
|
+
from docx.shared import Inches, Pt
|
|
6
|
+
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
7
|
+
from docx.enum.style import WD_STYLE_TYPE
|
|
8
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
9
|
+
import mistune
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MDToDOCXConverter(BaseConverter):
    """Convert Markdown text into a DOCX (Word) document.

    The Markdown is parsed into mistune's token tree and the tree is walked
    top-down. Block order, list nesting and inline formatting are therefore
    mapped straight onto python-docx paragraphs and runs rather than being
    reconstructed from rendered HTML.

    NOTE(review): written against the mistune 3 token layout
    (``type`` / ``children`` / ``raw`` / ``attrs``) - confirm the pinned
    mistune version.
    """

    # Built-in mistune plugin names. The table plugin is registered as
    # 'table' and the ==highlight== plugin as 'mark'.
    _PLUGINS = (
        'url',
        'abbr',
        'def_list',
        'footnotes',
        'table',
        'task_lists',
        'strikethrough',
        'mark',
        'superscript',
        'subscript',
    )

    # Inline token type -> formatting flag switched on for its children.
    _INLINE_FLAGS = {
        'emphasis': 'italic',
        'strong': 'bold',
        'strikethrough': 'strike',
        'superscript': 'superscript',
        'subscript': 'subscript',
    }

    _BODY_FONT = 'Microsoft YaHei'
    _CODE_FONT = 'Courier New'
    _INDENT_STEP = 0.5  # inches added per list nesting level / code block

    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert Markdown to DOCX.

        Args:
            input_content: Markdown source text.
            options: Conversion options. Accepted for interface
                compatibility; no option is read by this converter.

        Returns:
            bytes: The binary content of the generated .docx file.
        """
        import io

        doc = Document()
        self._setup_styles(doc)

        # Ask for the token tree: the renderer has to be chosen when the
        # parser is created, it cannot be passed to the parse call.
        parse = mistune.create_markdown(renderer='ast', plugins=list(self._PLUGINS))
        for token in parse(input_content or ''):
            self._render_block(doc, token)

        output = io.BytesIO()
        doc.save(output)
        return output.getvalue()

    def _render_block(self, doc, token):
        """Append one block-level token (and its descendants) to *doc*."""
        kind = token.get('type')
        attrs = token.get('attrs') or {}

        if kind in ('paragraph', 'block_text'):
            inline = token.get('children') or []
            if inline:
                self._add_runs(doc.add_paragraph(), inline)
        elif kind == 'heading':
            # Markdown h1 maps to the document title (level 0), h2 to
            # 'Heading 1', and so on.
            level = int(attrs.get('level', 1))
            heading = doc.add_heading(level=max(level - 1, 0))
            self._add_runs(heading, token.get('children') or [])
        elif kind == 'block_code':
            paragraph = doc.add_paragraph()
            run = paragraph.add_run(token.get('raw', '').rstrip('\n'))
            run.font.name = self._CODE_FONT
            paragraph.paragraph_format.left_indent = Inches(self._INDENT_STEP)
        elif kind == 'list':
            self._render_list(doc, token, depth=0)
        elif kind == 'table':
            self._render_table(doc, token)
        elif kind == 'block_quote':
            for child in token.get('children') or []:
                self._render_block(doc, child)
        else:
            # Unknown or plugin-specific block (footnotes, definition lists,
            # raw HTML, ...): keep its text, one paragraph per direct child,
            # so that no content is silently dropped.
            parts = token['children'] if 'children' in token else [token]
            for part in parts:
                text = self._plain_text(part)
                if text.strip():
                    doc.add_paragraph(text)

    def _render_list(self, doc, token, depth):
        """Append a (possibly nested) list; *depth* is 0 for the outermost."""
        ordered = bool((token.get('attrs') or {}).get('ordered'))
        style = 'List Number' if ordered else 'List Bullet'

        for item in token.get('children') or []:
            # The task_lists plugin strips the "[ ]"/"[x]" marker from the
            # item text; put an equivalent marker back.
            prefix = ''
            if item.get('type') == 'task_list_item':
                checked = (item.get('attrs') or {}).get('checked')
                prefix = '[x] ' if checked else '[ ] '

            for child in item.get('children') or []:
                kind = child.get('type')
                if kind == 'list':
                    self._render_list(doc, child, depth + 1)
                elif kind in ('paragraph', 'block_text'):
                    paragraph = doc.add_paragraph(style=style)
                    if depth:
                        # left_indent is None until set, so assign the final
                        # value instead of incrementing it.
                        paragraph.paragraph_format.left_indent = Inches(
                            self._INDENT_STEP * depth
                        )
                    if prefix:
                        paragraph.add_run(prefix)
                        prefix = ''
                    self._add_runs(paragraph, child.get('children') or [])
                else:
                    self._render_block(doc, child)

    def _render_table(self, doc, token):
        """Append a Markdown table as a real Word table."""
        rows = []
        for section in token.get('children') or []:
            entries = section.get('children') or []
            if not entries:
                continue
            if entries[0].get('type') == 'table_row':
                rows.extend(row.get('children') or [] for row in entries)
            else:
                # The section holds its cells directly (header section).
                rows.append(entries)
        rows = [cells for cells in rows if cells]
        if not rows:
            return

        table = doc.add_table(rows=len(rows), cols=max(len(cells) for cells in rows))
        table.style = 'Table Grid'
        for row_index, cells in enumerate(rows):
            for col_index, cell in enumerate(cells):
                is_head = bool((cell.get('attrs') or {}).get('head'))
                self._add_runs(
                    table.cell(row_index, col_index).paragraphs[0],
                    cell.get('children') or [],
                    {'bold': True} if is_head else None,
                )

    def _add_runs(self, paragraph, tokens, fmt=None):
        """Append inline *tokens* to *paragraph* as formatted runs.

        *fmt* carries the formatting flags inherited from enclosing inline
        tokens (for example italic text inside bold text).
        """
        fmt = fmt or {}
        for token in tokens:
            kind = token.get('type')
            if kind in self._INLINE_FLAGS:
                nested = dict(fmt)
                nested[self._INLINE_FLAGS[kind]] = True
                self._add_runs(paragraph, token.get('children') or [], nested)
            elif kind == 'codespan':
                self._add_run(paragraph, token.get('raw', ''), dict(fmt, mono=True))
            elif kind == 'image':
                # Images are not embedded; keep a textual placeholder.
                url = (token.get('attrs') or {}).get('url', '')
                alt = self._plain_text(token)
                self._add_run(paragraph, f'Image: {alt} ({url})', fmt)
            elif kind == 'softbreak':
                self._add_run(paragraph, ' ', fmt)
            elif kind == 'linebreak':
                paragraph.add_run().add_break()
            elif 'children' in token:
                # Links, highlighted text, abbreviations, ...: keep the text.
                # Real hyperlinks are not generated.
                self._add_runs(paragraph, token['children'], fmt)
            else:
                self._add_run(paragraph, self._plain_text(token), fmt)

    def _add_run(self, paragraph, text, fmt):
        """Append *text* to *paragraph* with the formatting flags in *fmt*."""
        if not text:
            return
        run = paragraph.add_run(text)
        if fmt.get('bold'):
            run.bold = True
        if fmt.get('italic'):
            run.italic = True
        if fmt.get('strike'):
            run.font.strike = True
        if fmt.get('superscript'):
            run.font.superscript = True
        if fmt.get('subscript'):
            run.font.subscript = True
        if fmt.get('mono'):
            run.font.name = self._CODE_FONT

    @classmethod
    def _plain_text(cls, token):
        """Return the concatenated raw text of *token* and its descendants."""
        if 'children' in token:
            return ''.join(cls._plain_text(child) for child in token['children'])
        return token.get('raw', token.get('text', '')) or ''

    def _setup_styles(self, doc):
        """Apply the body font to 'Normal' and to 'Heading 1'..'Heading 5'."""
        from docx.oxml.ns import qn

        def apply_font(style, size):
            font = style.font
            font.name = self._BODY_FONT
            font.size = size
            # font.name only covers the Latin font slots; CJK text takes its
            # face from the separate East Asian slot.
            style.element.rPr.rFonts.set(qn('w:eastAsia'), self._BODY_FONT)
            return font

        styles = doc.styles
        apply_font(styles['Normal'], Pt(12))

        # Heading sizes step down by 2pt per level: 24, 22, 20, 18, 16.
        for level in range(1, 6):
            font = apply_font(styles[f'Heading {level}'], Pt(14 + (6 - level) * 2))
            font.bold = True

    def get_supported_formats(self) -> tuple:
        """Return the (source, target) format pair handled by this converter."""
        return ('markdown', 'docx')
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Markdown到EPUB格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
import ebooklib
|
|
6
|
+
from ebooklib import epub
|
|
7
|
+
import mistune
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class MDToEPUBConverter(BaseConverter):
    """Convert Markdown text into a single-chapter EPUB book."""

    # Built-in mistune plugin names. The table plugin is registered as
    # 'table' and the ==highlight== plugin as 'mark'.
    _PLUGINS = (
        'url',
        'abbr',
        'def_list',
        'footnotes',
        'table',
        'task_lists',
        'strikethrough',
        'mark',
        'superscript',
        'subscript',
    )

    _COVER_PATH = 'images/cover.jpg'

    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert Markdown to EPUB.

        Args:
            input_content: Markdown source text.
            options: Conversion options. Recognised keys:
                ``title`` (default ``'Untitled'``),
                ``language`` (default ``'zh'``),
                ``author`` (default ``'Unknown'``),
                ``identifier`` (default ``'id12345'``),
                ``cover`` (cover image content, stored as
                ``images/cover.jpg``).

        Returns:
            bytes: The binary content of the generated .epub file.
        """
        import io
        from html import escape

        if options is None:
            options = {}

        title = options.get('title', 'Untitled')
        language = options.get('language', 'zh')

        # Book metadata.
        book = epub.EpubBook()
        book.set_identifier(options.get('identifier', 'id12345'))
        book.set_title(title)
        book.set_language(language)
        book.add_author(options.get('author', 'Unknown'))

        # set_cover() creates and registers the cover item itself, so it is
        # given the image content directly rather than a pre-built item.
        if 'cover' in options:
            book.set_cover(self._COVER_PATH, options['cover'])

        # Markdown -> HTML fragment.
        render = mistune.create_markdown(plugins=list(self._PLUGINS))
        html_content = render(input_content)

        # The title is user-supplied text placed into markup: escape it.
        safe_title = escape(str(title))
        chapter = epub.EpubHtml(
            title=options.get('title', 'Chapter 1'),
            file_name='chapter1.xhtml',
            lang=language,
        )
        chapter.content = (
            '<!DOCTYPE html>\n'
            '<html>\n'
            '<head>\n'
            f'<title>{safe_title}</title>\n'
            '<meta charset="utf-8" />\n'
            '</head>\n'
            '<body>\n'
            f'<h1>{safe_title}</h1>\n'
            f'{html_content}\n'
            '</body>\n'
            '</html>\n'
        )
        book.add_item(chapter)

        # Table of contents, navigation documents and reading order.
        book.toc = [chapter]
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())
        book.spine = ['nav', chapter]

        # Serialise into memory instead of a file on disk.
        output = io.BytesIO()
        epub.write_epub(output, book, {})
        return output.getvalue()

    def get_supported_formats(self) -> tuple:
        """Return the (source, target) format pair handled by this converter."""
        return ('markdown', 'epub')
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Markdown到HTML格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
import mistune
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class MDToHTMLConverter(BaseConverter):
    """Convert Markdown text into a standalone, styled HTML document."""

    # Built-in mistune plugin names. The table plugin is registered as
    # 'table' and the ==highlight== plugin as 'mark'.
    _PLUGINS = (
        'url',
        'abbr',
        'def_list',
        'footnotes',
        'table',
        'task_lists',
        'strikethrough',
        'mark',
        'superscript',
        'subscript',
    )

    # Default stylesheet embedded into every generated page.
    _STYLE = (
        'body {\n'
        '    font-family: Arial, sans-serif;\n'
        '    line-height: 1.6;\n'
        '    margin: 20px;\n'
        '    max-width: 800px;\n'
        '}\n'
        'h1, h2, h3, h4, h5, h6 {\n'
        '    color: #333;\n'
        '}\n'
        'code {\n'
        '    background-color: #f4f4f4;\n'
        '    padding: 2px 4px;\n'
        '    border-radius: 3px;\n'
        '}\n'
        'pre {\n'
        '    background-color: #f4f4f4;\n'
        '    padding: 10px;\n'
        '    border-radius: 3px;\n'
        '    overflow-x: auto;\n'
        '}\n'
        'table {\n'
        '    border-collapse: collapse;\n'
        '    width: 100%;\n'
        '    margin: 20px 0;\n'
        '}\n'
        'th, td {\n'
        '    border: 1px solid #ddd;\n'
        '    padding: 8px;\n'
        '    text-align: left;\n'
        '}\n'
        'th {\n'
        '    background-color: #f2f2f2;\n'
        '}\n'
        'img {\n'
        '    max-width: 100%;\n'
        '    height: auto;\n'
        '}\n'
    )

    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert Markdown to HTML.

        Args:
            input_content: Markdown source text.
            options: Conversion options. Recognised key: ``title``
                (default ``'Untitled'``), used for the page ``<title>``.

        Returns:
            bytes: The complete HTML document, UTF-8 encoded.
        """
        from html import escape

        if options is None:
            options = {}

        render = mistune.create_markdown(plugins=list(self._PLUGINS))
        html_content = render(input_content)

        # The title is user-supplied text placed into markup: escape it.
        safe_title = escape(str(options.get('title', 'Untitled')))

        # The doctype is the very first thing in the output, with no blank
        # line in front of it.
        full_html = (
            '<!DOCTYPE html>\n'
            '<html>\n'
            '<head>\n'
            f'<title>{safe_title}</title>\n'
            '<meta charset="utf-8" />\n'
            f'<style>\n{self._STYLE}</style>\n'
            '</head>\n'
            '<body>\n'
            f'{html_content}\n'
            '</body>\n'
            '</html>\n'
        )
        return full_html.encode('utf-8')

    def get_supported_formats(self) -> tuple:
        """Return the (source, target) format pair handled by this converter."""
        return ('markdown', 'html')
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Markdown到JSON格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict, List
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
import mistune
|
|
6
|
+
import json
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class MDToJSONConverter(BaseConverter):
    """Convert Markdown text into a JSON description of its structure.

    The parser is a lightweight line scanner, not a full Markdown parser.
    It recognises YAML front matter, ATX headings, flat ordered and
    unordered lists, fenced code blocks, pipe tables and paragraphs.
    """

    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert Markdown to JSON.

        Args:
            input_content: Markdown source text.
            options: Conversion options. Accepted for interface
                compatibility; no option is read by this converter.

        Returns:
            bytes: The JSON document, UTF-8 encoded.
        """
        structure = self._parse_markdown(input_content or '')
        json_content = json.dumps(
            structure,
            ensure_ascii=False,
            indent=2,
            default=self._json_default,
        )
        return json_content.encode('utf-8')

    @staticmethod
    def _json_default(value: Any) -> Any:
        """Serialise the date/time values YAML front matter can produce."""
        import datetime

        if isinstance(value, (datetime.date, datetime.time)):
            return value.isoformat()
        raise TypeError(
            f'Object of type {type(value).__name__} is not JSON serializable'
        )

    @staticmethod
    def _read_front_matter(lines: List[str], structure: Dict[str, Any]) -> int:
        """Parse a leading ``---`` ... ``---`` YAML block into the metadata.

        Returns:
            int: Index of the first line after the front matter, or 0 when
            the document has no properly closed front matter.
        """
        if not lines or lines[0].rstrip() != '---':
            return 0

        for end, line in enumerate(lines[1:], start=1):
            if line.rstrip() == '---':
                break
        else:
            # No closing marker: treat the text as ordinary content instead
            # of swallowing the whole document.
            return 0

        block = '\n'.join(lines[1:end])
        if block.strip():
            import yaml

            try:
                loaded = yaml.safe_load(block)
            except (yaml.YAMLError, ValueError):
                # Best effort: malformed metadata is ignored, the body is
                # still converted.
                loaded = None
            if isinstance(loaded, dict):
                structure['metadata'] = loaded
        return end + 1

    @staticmethod
    def _split_table_row(line: str) -> List[str]:
        """Split a pipe-table line into stripped cells, keeping empty ones."""
        text = line.strip()
        if text.startswith('|'):
            text = text[1:]
        if text.endswith('|'):
            text = text[:-1]
        return [cell.strip() for cell in text.split('|')]

    def _parse_markdown(self, content: str) -> Dict[str, Any]:
        """Parse Markdown text into structured data.

        Args:
            content: Markdown source text.

        Returns:
            Dict[str, Any]: ``{'type': 'document', 'children': [...],
            'metadata': {...}}`` where each child is a ``heading``,
            ``paragraph``, ``list``, ``code_block`` or ``table`` node, in
            document order.
        """
        import re

        heading_re = re.compile(r'^(#{1,6})(?:\s+(.*))?$')
        ordered_re = re.compile(r'^\s*\d+[.)]\s+(.*)$')
        bullet_re = re.compile(r'^\s*[-*+]\s+(.*)$')
        separator_re = re.compile(r'^\|?\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|?$')

        structure: Dict[str, Any] = {
            'type': 'document',
            'children': [],
            'metadata': {},
        }
        children = structure['children']
        lines = content.splitlines()

        paragraph: List[str] = []
        list_items: List[Dict[str, Any]] = []
        list_ordered: Optional[bool] = None  # None means no list is open

        def flush_paragraph() -> None:
            if paragraph:
                children.append({
                    'type': 'paragraph',
                    'content': '\n'.join(paragraph),
                })
                paragraph.clear()

        def flush_list() -> None:
            nonlocal list_ordered, list_items
            if list_ordered is not None:
                children.append({
                    'type': 'list',
                    'ordered': list_ordered,
                    'items': list_items,
                })
                list_ordered = None
                list_items = []

        def add_item(ordered: bool, text: str) -> None:
            nonlocal list_ordered
            flush_paragraph()
            # Switching between ordered and unordered starts a new list.
            if list_ordered is not None and list_ordered != ordered:
                flush_list()
            list_ordered = ordered
            list_items.append({'type': 'list_item', 'content': text})

        # An explicit index lets multi-line constructs (code fences, tables)
        # consume their lines so they are not parsed a second time.
        index = self._read_front_matter(lines, structure)
        total = len(lines)
        while index < total:
            line = lines[index].rstrip()
            index += 1

            # A blank line ends the paragraph; an open list stays open so
            # that loosely spaced items still form one list.
            if not line:
                flush_paragraph()
                continue

            heading = heading_re.match(line)
            if heading:
                flush_paragraph()
                flush_list()
                children.append({
                    'type': 'heading',
                    'level': len(heading.group(1)),
                    'content': (heading.group(2) or '').strip(),
                })
                continue

            ordered_item = ordered_re.match(line)
            if ordered_item:
                add_item(True, ordered_item.group(1))
                continue

            bullet_item = bullet_re.match(line)
            if bullet_item:
                add_item(False, bullet_item.group(1))
                continue

            if line.startswith('```'):
                flush_paragraph()
                flush_list()
                language = line.lstrip('`').strip()
                code_lines = []
                # Everything up to the closing fence (or end of input) is
                # code and is kept verbatim.
                while index < total:
                    code_line = lines[index]
                    index += 1
                    if code_line.startswith('```'):
                        break
                    code_lines.append(code_line)
                children.append({
                    'type': 'code_block',
                    'language': language,
                    'content': '\n'.join(code_lines),
                })
                continue

            if line.startswith('|') and '|' in line[1:]:
                table_lines = [line]
                while index < total and lines[index].startswith('|'):
                    table_lines.append(lines[index].rstrip())
                    index += 1

                if len(table_lines) >= 2 and separator_re.match(table_lines[1]):
                    flush_paragraph()
                    flush_list()
                    headers = self._split_table_row(table_lines[0])
                    rows = []
                    for row_line in table_lines[2:]:
                        cells = self._split_table_row(row_line)
                        if any(cells):
                            rows.append(dict(zip(headers, cells)))
                    children.append({
                        'type': 'table',
                        'headers': headers,
                        'rows': rows,
                    })
                else:
                    # Not a table (no header separator): keep the text
                    # instead of dropping it.
                    flush_list()
                    paragraph.extend(table_lines)
                continue

            # Plain text: it closes any open list and extends the paragraph.
            flush_list()
            paragraph.append(line)

        flush_paragraph()
        flush_list()
        return structure

    def get_supported_formats(self) -> tuple:
        """Return the (source, target) format pair handled by this converter."""
        return ('markdown', 'json')
|