hos-m2f 0.5.0__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hos_m2f-0.5.2/PKG-INFO +47 -0
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f/cli/cli.py +34 -7
- hos_m2f-0.5.2/hos_m2f/converters/__init__.py +27 -0
- hos_m2f-0.5.2/hos_m2f/converters/base_converter.py +30 -0
- hos_m2f-0.5.2/hos_m2f/converters/docx_to_md.py +89 -0
- hos_m2f-0.5.2/hos_m2f/converters/epub_to_md.py +118 -0
- hos_m2f-0.5.2/hos_m2f/converters/html_to_md.py +132 -0
- hos_m2f-0.5.2/hos_m2f/converters/json_to_md.py +97 -0
- hos_m2f-0.5.2/hos_m2f/converters/md_to_docx.py +171 -0
- hos_m2f-0.5.2/hos_m2f/converters/md_to_epub.py +108 -0
- hos_m2f-0.5.2/hos_m2f/converters/md_to_html.py +100 -0
- hos_m2f-0.5.2/hos_m2f/converters/md_to_json.py +284 -0
- hos_m2f-0.5.2/hos_m2f/converters/md_to_xml.py +362 -0
- hos_m2f-0.5.2/hos_m2f/converters/xml_to_md.py +109 -0
- hos_m2f-0.5.2/hos_m2f.egg-info/PKG-INFO +47 -0
- hos_m2f-0.5.2/hos_m2f.egg-info/SOURCES.txt +23 -0
- hos_m2f-0.5.2/setup.py +52 -0
- hos_m2f-0.5.0/PKG-INFO +0 -22
- hos_m2f-0.5.0/hos_m2f.egg-info/PKG-INFO +0 -22
- hos_m2f-0.5.0/hos_m2f.egg-info/SOURCES.txt +0 -11
- hos_m2f-0.5.0/setup.py +0 -30
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/README.md +0 -0
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f/__init__.py +0 -0
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f/cli/__init__.py +0 -0
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f.egg-info/dependency_links.txt +0 -0
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f.egg-info/entry_points.txt +0 -0
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f.egg-info/requires.txt +0 -0
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f.egg-info/top_level.txt +0 -0
- {hos_m2f-0.5.0 → hos_m2f-0.5.2}/setup.cfg +0 -0
hos_m2f-0.5.2/PKG-INFO
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hos-m2f
|
|
3
|
+
Version: 0.5.2
|
|
4
|
+
Summary: HOS-M2F: Markdown to Industry Standard Format Compiler Engine
|
|
5
|
+
Author: HOS Team
|
|
6
|
+
Author-email: team@hos-m2f.com
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: mistune>=2.0.0
|
|
13
|
+
Requires-Dist: pyyaml>=6.0
|
|
14
|
+
Requires-Dist: click>=8.0.0
|
|
15
|
+
Requires-Dist: ebooklib>=0.17.0
|
|
16
|
+
Requires-Dist: weasyprint>=54.0
|
|
17
|
+
Requires-Dist: python-docx>=0.8.11
|
|
18
|
+
Dynamic: author
|
|
19
|
+
Dynamic: author-email
|
|
20
|
+
Dynamic: classifier
|
|
21
|
+
Dynamic: description
|
|
22
|
+
Dynamic: description-content-type
|
|
23
|
+
Dynamic: requires-dist
|
|
24
|
+
Dynamic: requires-python
|
|
25
|
+
Dynamic: summary
|
|
26
|
+
|
|
27
|
+
HOS-M2F is a powerful compiler engine that converts Markdown files to various industry standard formats.
|
|
28
|
+
|
|
29
|
+
Key Features:
|
|
30
|
+
- Multiple output formats: PDF, DOCX, EPUB, JSON, HTML, and XML
|
|
31
|
+
- Specialized modes for different document types:
|
|
32
|
+
- Book mode for book-length documents
|
|
33
|
+
- Paper mode for academic papers
|
|
34
|
+
- Patent mode for patent applications
|
|
35
|
+
- SOP (Standard Operating Procedure) mode for technical procedures
|
|
36
|
+
- Command-line interface for easy automation
|
|
37
|
+
- Semantic parsing for intelligent document structure
|
|
38
|
+
- Extensible architecture for custom renderers and modes
|
|
39
|
+
|
|
40
|
+
Use Cases:
|
|
41
|
+
- Convert technical documentation to professional formats
|
|
42
|
+
- Generate academic papers from Markdown sources
|
|
43
|
+
- Create standardized operating procedures
|
|
44
|
+
- Prepare patent applications with proper formatting
|
|
45
|
+
- Build eBooks from Markdown content
|
|
46
|
+
|
|
47
|
+
HOS-M2F simplifies the process of creating professionally formatted documents from plain Markdown, saving time and ensuring consistency across document types.
|
|
@@ -27,23 +27,31 @@ class CLI:
|
|
|
27
27
|
build_parser = self.subparsers.add_parser('build', help='Build document from Markdown')
|
|
28
28
|
build_parser.add_argument('input', help='Input Markdown file')
|
|
29
29
|
build_parser.add_argument('output', help='Output file path')
|
|
30
|
-
build_parser.add_argument('--mode',
|
|
31
|
-
build_parser.add_argument('--format', choices=['epub', 'pdf', 'docx', 'json'], default='epub', help='Output format')
|
|
30
|
+
build_parser.add_argument('--mode', default='paper', help='Document mode')
|
|
31
|
+
build_parser.add_argument('--format', choices=['epub', 'pdf', 'docx', 'json', 'html', 'xml'], default='epub', help='Output format')
|
|
32
32
|
build_parser.add_argument('--options', type=str, default='{}', help='Additional options as JSON string')
|
|
33
33
|
|
|
34
34
|
# check命令
|
|
35
35
|
check_parser = self.subparsers.add_parser('check', help='Check document structure and compliance')
|
|
36
36
|
check_parser.add_argument('input', help='Input Markdown file')
|
|
37
|
-
check_parser.add_argument('--mode',
|
|
37
|
+
check_parser.add_argument('--mode', default='paper', help='Document mode')
|
|
38
38
|
check_parser.add_argument('--options', type=str, default='{}', help='Additional options as JSON string')
|
|
39
39
|
|
|
40
40
|
# parse命令
|
|
41
41
|
parse_parser = self.subparsers.add_parser('parse', help='Parse Markdown content and output structured data')
|
|
42
42
|
parse_parser.add_argument('input', help='Input Markdown file')
|
|
43
|
-
parse_parser.add_argument('--mode',
|
|
43
|
+
parse_parser.add_argument('--mode', default='paper', help='Document mode')
|
|
44
44
|
parse_parser.add_argument('--output', help='Output JSON file (default: stdout)')
|
|
45
45
|
parse_parser.add_argument('--options', type=str, default='{}', help='Additional options as JSON string')
|
|
46
46
|
|
|
47
|
+
# convert命令
|
|
48
|
+
convert_parser = self.subparsers.add_parser('convert', help='Convert between different formats')
|
|
49
|
+
convert_parser.add_argument('input', help='Input file')
|
|
50
|
+
convert_parser.add_argument('output', help='Output file path')
|
|
51
|
+
convert_parser.add_argument('--from', dest='from_format', required=True, choices=['md', 'docx', 'json', 'epub', 'html', 'xml'], help='Input format')
|
|
52
|
+
convert_parser.add_argument('--to', dest='to_format', required=True, choices=['md', 'docx', 'json', 'epub', 'html', 'xml'], help='Output format')
|
|
53
|
+
convert_parser.add_argument('--options', type=str, default='{}', help='Additional options as JSON string')
|
|
54
|
+
|
|
47
55
|
# preview命令
|
|
48
56
|
preview_parser = self.subparsers.add_parser('preview', help='Start preview server')
|
|
49
57
|
preview_parser.add_argument('--port', type=int, default=8000, help='Port to run the server on')
|
|
@@ -54,7 +62,7 @@ class CLI:
|
|
|
54
62
|
|
|
55
63
|
# validate命令
|
|
56
64
|
validate_parser = self.subparsers.add_parser('validate', help='Validate options for a specific mode')
|
|
57
|
-
validate_parser.add_argument('--mode',
|
|
65
|
+
validate_parser.add_argument('--mode', required=True, help='Document mode')
|
|
58
66
|
validate_parser.add_argument('--options', type=str, required=True, help='Options to validate as JSON string')
|
|
59
67
|
|
|
60
68
|
def run(self, args=None):
|
|
@@ -82,6 +90,8 @@ class CLI:
|
|
|
82
90
|
return self._run_check(parsed_args)
|
|
83
91
|
elif parsed_args.command == 'parse':
|
|
84
92
|
return self._run_parse(parsed_args)
|
|
93
|
+
elif parsed_args.command == 'convert':
|
|
94
|
+
return self._run_convert(parsed_args)
|
|
85
95
|
elif parsed_args.command == 'preview':
|
|
86
96
|
return self._run_preview(parsed_args)
|
|
87
97
|
elif parsed_args.command == 'info':
|
|
@@ -266,12 +276,29 @@ class CLI:
|
|
|
266
276
|
|
|
267
277
|
print('\nValidation passed!')
|
|
268
278
|
return 0
|
|
279
|
+
|
|
280
|
+
def _run_convert(self, args):
    """Run the ``convert`` sub-command.

    Parses ``args.options`` as JSON, delegates the conversion to
    ``self.engine.convert`` and prints a short report to stdout.

    Args:
        args: Parsed CLI namespace providing ``input``, ``output``,
            ``from_format``, ``to_format`` and ``options`` (a JSON string).

    Returns:
        int: ``0`` once the conversion has been reported.

    Raises:
        json.JSONDecodeError: If ``args.options`` is not valid JSON.
    """
    options = json.loads(args.options)

    result = self.engine.convert(
        args.input, args.output, args.from_format, args.to_format, options
    )

    print(f'Successfully converted {args.input} to {args.output}')
    print(f'From format: {args.from_format}')
    print(f'To format: {args.to_format}')

    # The conversion has already succeeded here, so a result that carries no
    # ``metadata`` entry (or is not a mapping at all) must not turn the
    # success report into a KeyError/TypeError.
    metadata = result.get('metadata') if isinstance(result, dict) else None
    if metadata:
        print('Metadata:', json.dumps(metadata, ensure_ascii=False, indent=2))

    return 0
|
|
269
296
|
|
|
270
297
|
|
|
271
|
-
def main():
|
|
298
|
+
def main(args=None):
    """CLI entry point: run the CLI with ``args`` and exit with its status.

    Args:
        args: Argument list forwarded to ``CLI.run``; the default ``None`` is
            forwarded unchanged.
    """
    exit_status = CLI().run(args)
    sys.exit(exit_status)
|
|
275
302
|
|
|
276
303
|
# 命令行入口
|
|
277
304
|
if __name__ == '__main__':
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""基础格式互转模块"""
|
|
2
|
+
|
|
3
|
+
import importlib
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by type checkers and IDEs only; at runtime the names are resolved
    # lazily by ``__getattr__`` below.
    from hos_m2f.converters.base_converter import BaseConverter
    from hos_m2f.converters.md_to_docx import MDToDOCXConverter
    from hos_m2f.converters.md_to_json import MDToJSONConverter
    from hos_m2f.converters.md_to_epub import MDToEPUBConverter
    from hos_m2f.converters.md_to_html import MDToHTMLConverter
    from hos_m2f.converters.md_to_xml import MDToXMLConverter
    from hos_m2f.converters.docx_to_md import DOCXToMDConverter
    from hos_m2f.converters.json_to_md import JSONToMDConverter
    from hos_m2f.converters.epub_to_md import EPUBToMDConverter
    from hos_m2f.converters.html_to_md import HTMLToMDConverter
    from hos_m2f.converters.xml_to_md import XMLToMDConverter

# Public class name -> submodule that defines it.
_EXPORTS = {
    "BaseConverter": "base_converter",
    "MDToDOCXConverter": "md_to_docx",
    "MDToJSONConverter": "md_to_json",
    "MDToEPUBConverter": "md_to_epub",
    "MDToHTMLConverter": "md_to_html",
    "MDToXMLConverter": "md_to_xml",
    "DOCXToMDConverter": "docx_to_md",
    "JSONToMDConverter": "json_to_md",
    "EPUBToMDConverter": "epub_to_md",
    "HTMLToMDConverter": "html_to_md",
    "XMLToMDConverter": "xml_to_md",
}
_SUBMODULES = frozenset(_EXPORTS.values())

__all__ = [
    "BaseConverter",
    "MDToDOCXConverter",
    "MDToJSONConverter",
    "MDToEPUBConverter",
    "MDToHTMLConverter",
    "MDToXMLConverter",
    "DOCXToMDConverter",
    "JSONToMDConverter",
    "EPUBToMDConverter",
    "HTMLToMDConverter",
    "XMLToMDConverter",
]


def __getattr__(name):
    """Import a converter (or converter submodule) on first access (PEP 562).

    Importing every converter eagerly makes the whole package unusable as
    soon as one converter's third-party dependency is missing. Resolving the
    names lazily confines such an ImportError to the converter that actually
    needs the missing dependency.

    Args:
        name: Attribute requested on this package.

    Returns:
        The converter class or submodule called ``name``.

    Raises:
        AttributeError: If ``name`` is neither an exported class nor one of
            the converter submodules.
    """
    module_name = _EXPORTS.get(name)
    if module_name is not None:
        module = importlib.import_module(f"{__name__}.{module_name}")
        value = getattr(module, name)
    elif name in _SUBMODULES:
        # Keeps ``hos_m2f.converters.<submodule>`` attribute access working,
        # which the eager imports used to provide as a side effect.
        value = importlib.import_module(f"{__name__}.{name}")
    else:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # Cache so later lookups bypass __getattr__ entirely.
    globals()[name] = value
    return value


def __dir__():
    """List the lazily exported names alongside the module globals."""
    return sorted(set(globals()) | set(__all__))
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""基础转换器类"""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from typing import Any, Optional, Dict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class BaseConverter(ABC):
    """Abstract interface shared by every format converter."""

    @abstractmethod
    def convert(self, input_content: Any, options: Optional[Dict[str, Any]] = None) -> Any:
        """Convert ``input_content`` and return the converted content.

        Args:
            input_content: Content to convert.
            options: Conversion options.

        Returns:
            Any: The converted content.
        """
        return None

    @abstractmethod
    def get_supported_formats(self) -> tuple:
        """Return the supported formats.

        Returns:
            tuple: ``(input format, output format)``.
        """
        return None
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""DOCX到Markdown格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
from docx import Document
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DOCXToMDConverter(BaseConverter):
    """Convert DOCX documents to Markdown.

    Paragraphs are emitted first (headings, bullet/numbered list items and
    plain text), followed by every table of the document.
    """

    # Four spaces nest correctly under both "- " and "1. " markers; a single
    # space per level leaves the item a sibling of its parent in Markdown.
    _LIST_INDENT = '    '
    # Width of one nesting level when derived from the paragraph's left indent.
    _INDENT_STEP_INCHES = 0.5
    # Markdown has six heading levels; deeper Word headings are clamped.
    _MAX_HEADING_LEVEL = 6

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert a DOCX file to Markdown.

        Args:
            input_content: Binary content of the DOCX file.
            options: Conversion options (currently unused).

        Returns:
            bytes: UTF-8 encoded Markdown.
        """
        import io

        if options is None:
            options = {}

        doc = Document(io.BytesIO(input_content))
        md_content = []

        for paragraph in doc.paragraphs:
            style_name = paragraph.style.name or ''
            text = paragraph.text
            heading_level = self._heading_level(style_name)

            if heading_level is not None:
                md_content.append('#' * heading_level + ' ' + text)
            elif style_name in ('List Bullet', 'List Number'):
                prefix = self._LIST_INDENT * self._indent_level(paragraph)
                # Numbering is simplified: every numbered item is written as
                # "1." and left for the Markdown renderer to renumber.
                marker = '1. ' if style_name == 'List Number' else '- '
                md_content.append(prefix + marker + text)
            elif text:
                md_content.append(text)

            # Blank line after every paragraph keeps blocks separated.
            md_content.append('')

        for table in doc.tables:
            md_content.extend(self._table_lines(table))

        return '\n'.join(md_content).encode('utf-8')

    @classmethod
    def _heading_level(cls, style_name: str) -> Optional[int]:
        """Return the Markdown heading level for a style name, or ``None``.

        Only names of the form ``"Heading <number>"`` count as headings; a
        ``Heading``-prefixed style without a usable number is not treated as
        a heading instead of raising.
        """
        if not style_name.startswith('Heading'):
            return None
        parts = style_name.split()
        if len(parts) < 2:
            return None
        try:
            level = int(parts[1])
        except ValueError:
            return None
        return max(1, min(level, cls._MAX_HEADING_LEVEL))

    @classmethod
    def _indent_level(cls, paragraph) -> int:
        """Derive a list nesting level from the paragraph's left indent.

        ``left_indent`` is ``None`` when the paragraph does not set it
        directly; that case is treated as level 0.
        """
        left_indent = paragraph.paragraph_format.left_indent
        if left_indent is None:
            return 0
        return max(0, int(left_indent.inches // cls._INDENT_STEP_INCHES))

    @staticmethod
    def _cell_text(cell) -> str:
        """Return cell text that is safe inside a one-line Markdown table row."""
        # Collapse line breaks/whitespace runs and escape the column separator.
        return ' '.join(cell.text.split()).replace('|', '\\|')

    @classmethod
    def _table_lines(cls, table) -> list:
        """Render a table as Markdown lines, using its first row as header."""
        rendered_rows = [
            [cls._cell_text(cell) for cell in row.cells] for row in table.rows
        ]
        # A table without rows (or with an empty first row) has no header to
        # build a Markdown table from.
        if not rendered_rows or not rendered_rows[0]:
            return []

        headers, *data_rows = rendered_rows
        lines = [
            '| ' + ' | '.join(headers) + ' |',
            '| ' + ' | '.join(['---'] * len(headers)) + ' |',
        ]
        lines.extend('| ' + ' | '.join(row) + ' |' for row in data_rows)
        lines.append('')
        return lines

    def get_supported_formats(self) -> tuple:
        """Return the supported ``(input format, output format)`` pair."""
        return ('docx', 'markdown')
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""EPUB到Markdown格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
import ebooklib
|
|
6
|
+
from ebooklib import epub
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class EPUBToMDConverter(BaseConverter):
    """Convert EPUB books to Markdown with a front-matter metadata header.

    For every document item of the book the first ``<h1>``, then all
    paragraphs, bullet lists, numbered lists and tables are emitted, in that
    order.
    NOTE(review): content is grouped by element type per chapter, not kept in
    document order, and headings other than the first ``<h1>`` are not
    extracted.
    """

    # Front-matter key -> Dublin Core metadata name, in output order.
    _METADATA_FIELDS = (
        ('title', 'title'),
        ('author', 'creator'),
        ('language', 'language'),
    )

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert an EPUB file to Markdown.

        Args:
            input_content: Binary content of the EPUB file.
            options: Conversion options (currently unused).

        Returns:
            bytes: UTF-8 encoded Markdown.
        """
        if options is None:
            options = {}

        book = self._read_book(input_content)

        md_content = ['---']
        for key, dc_name in self._METADATA_FIELDS:
            values = book.get_metadata('DC', dc_name)
            if values:
                # Each metadata entry is indexed as entry[0] for its value.
                md_content.append(f'{key}: {values[0][0]}')
        md_content.extend(['---', ''])

        for item in book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            soup = BeautifulSoup(item.get_content(), 'html.parser')
            md_content.extend(self._chapter_lines(soup))

        return '\n'.join(md_content).encode('utf-8')

    @staticmethod
    def _read_book(input_content: bytes):
        """Load the EPUB through a temporary file and return the parsed book.

        ``epub.read_epub`` is documented to take a file name, so the bytes are
        written to disk instead of being passed as an in-memory stream.
        NOTE(review): assumes ``read_epub`` has read all item contents by the
        time it returns, so the temporary file can be removed — confirm.
        """
        import os
        import tempfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            epub_path = os.path.join(tmp_dir, 'input.epub')
            with open(epub_path, 'wb') as handle:
                handle.write(input_content)
            return epub.read_epub(epub_path)

    def _chapter_lines(self, soup) -> list:
        """Render one parsed chapter document as a list of Markdown lines."""
        lines = []

        h1 = soup.find('h1')
        if h1:
            lines.extend([f'# {h1.get_text()}', ''])

        for p in soup.find_all('p'):
            text = p.get_text().strip()
            if text:
                lines.extend([text, ''])

        for ul in soup.find_all('ul'):
            for li in ul.find_all('li'):
                text = li.get_text().strip()
                if text:
                    lines.append(f'- {text}')
            lines.append('')

        for ol in soup.find_all('ol'):
            for i, li in enumerate(ol.find_all('li'), 1):
                text = li.get_text().strip()
                if text:
                    lines.append(f'{i}. {text}')
            lines.append('')

        for table in soup.find_all('table'):
            lines.extend(self._table_lines(table))

        return lines

    @staticmethod
    def _cell_text(cell) -> str:
        """Return cell text that is safe inside a one-line Markdown table row."""
        # Collapse line breaks/whitespace runs and escape the column separator.
        return ' '.join(cell.get_text().split()).replace('|', '\\|')

    @classmethod
    def _table_lines(cls, table) -> list:
        """Render an HTML table as Markdown lines.

        The header comes from ``<thead>`` when present, otherwise from the
        first row. Rows used for the header are excluded from the body by
        identity, so a table with ``<tbody>`` but no ``<thead>`` does not
        repeat its first row as data.
        """
        all_rows = table.find_all('tr')
        thead = table.find('thead')
        if thead:
            header_rows = thead.find_all('tr')
            header_cells = thead.find_all(['th', 'td'])
        else:
            header_rows = all_rows[:1]
            header_cells = header_rows[0].find_all(['th', 'td']) if header_rows else []

        headers = [cls._cell_text(cell) for cell in header_cells]
        if not headers:
            return []

        lines = [
            '| ' + ' | '.join(headers) + ' |',
            '| ' + ' | '.join(['---'] * len(headers)) + ' |',
        ]

        # Tag equality compares content, so identical rows would compare equal
        # to the header row; object identity is what is meant here.
        header_ids = {id(row) for row in header_rows}
        for row in all_rows:
            if id(row) in header_ids:
                continue
            cells = [cls._cell_text(cell) for cell in row.find_all(['th', 'td'])]
            if cells:
                lines.append('| ' + ' | '.join(cells) + ' |')

        lines.append('')
        return lines

    def get_supported_formats(self) -> tuple:
        """Return the supported ``(input format, output format)`` pair."""
        return ('epub', 'markdown')
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""HTML到Markdown格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class HTMLToMDConverter(BaseConverter):
    """Convert HTML documents to Markdown.

    NOTE(review): elements are emitted grouped by type (headings, paragraphs,
    lists, tables, code blocks, links, images) rather than interleaved in
    document order, and nested lists are flattened.
    """

    _HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    _LANGUAGE_PREFIX = 'language-'

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert an HTML file to Markdown.

        Args:
            input_content: Binary content of the HTML file.
            options: Conversion options (currently unused).

        Returns:
            bytes: UTF-8 encoded Markdown.
        """
        if options is None:
            options = {}

        soup = BeautifulSoup(input_content, 'html.parser')
        return self._html_to_md(soup).encode('utf-8')

    def _html_to_md(self, soup: BeautifulSoup) -> str:
        """Render a parsed HTML document as a Markdown string.

        Args:
            soup: Parsed document.

        Returns:
            str: Markdown text.
        """
        md_parts = []

        # One pass over all heading tags keeps headings in document order
        # relative to each other instead of grouping them by level.
        for heading in soup.find_all(self._HEADING_TAGS):
            level = int(heading.name[1])
            title = ' '.join(heading.get_text().split())
            md_parts.extend(['#' * level + ' ' + title, ''])

        for paragraph in soup.find_all('p'):
            text = paragraph.get_text().strip()
            if text:
                md_parts.extend([text, ''])

        for ul in soup.find_all('ul'):
            for li in ul.find_all('li'):
                text = li.get_text().strip()
                if text:
                    md_parts.append('- ' + text)
            md_parts.append('')

        for ol in soup.find_all('ol'):
            for i, li in enumerate(ol.find_all('li'), 1):
                text = li.get_text().strip()
                if text:
                    md_parts.append(f'{i}. ' + text)
            md_parts.append('')

        for table in soup.find_all('table'):
            md_parts.extend(self._table_lines(table))

        for pre in soup.find_all('pre'):
            code = pre.find('code')
            if code:
                md_parts.append('```' + self._code_language(code))
                md_parts.append(code.get_text())
                md_parts.append('```')
                md_parts.append('')

        for a in soup.find_all('a', href=True):
            text = a.get_text().strip()
            if text:
                md_parts.append(f'[{text}]({a["href"]})')
                md_parts.append('')

        for img in soup.find_all('img', src=True):
            alt = img.get('alt', '')
            src = img['src']
            md_parts.append(f'![{alt}]({src})')
            md_parts.append('')

        return '\n'.join(md_parts)

    @classmethod
    def _code_language(cls, code) -> str:
        """Return the fence language declared on a ``<code>`` tag, or ``''``.

        The first ``language-<name>`` class wins. A bare ``python`` first
        class is still recognised, as the earlier implementation did.
        """
        classes = code.get('class') or []
        if isinstance(classes, str):
            # Some parsers hand back the raw attribute string, not a list.
            classes = classes.split()
        for class_name in classes:
            if class_name.startswith(cls._LANGUAGE_PREFIX):
                language = class_name[len(cls._LANGUAGE_PREFIX):]
                if language:
                    return language
        if classes and classes[0] == 'python':
            return 'python'
        return ''

    @staticmethod
    def _cell_text(cell) -> str:
        """Return cell text that is safe inside a one-line Markdown table row."""
        # Collapse line breaks/whitespace runs and escape the column separator.
        return ' '.join(cell.get_text().split()).replace('|', '\\|')

    @classmethod
    def _table_lines(cls, table) -> list:
        """Render an HTML table as Markdown lines.

        The header comes from ``<thead>`` when present, otherwise from the
        first row. Rows used for the header are excluded from the body by
        identity, so a table with ``<tbody>`` but no ``<thead>`` does not
        repeat its first row as data.
        """
        all_rows = table.find_all('tr')
        thead = table.find('thead')
        if thead:
            header_rows = thead.find_all('tr')
            header_cells = thead.find_all(['th', 'td'])
        else:
            header_rows = all_rows[:1]
            header_cells = header_rows[0].find_all(['th', 'td']) if header_rows else []

        headers = [cls._cell_text(cell) for cell in header_cells]
        if not headers:
            return []

        lines = [
            '| ' + ' | '.join(headers) + ' |',
            '| ' + ' | '.join(['---'] * len(headers)) + ' |',
        ]

        # Tag equality compares content, so identical rows would compare equal
        # to the header row; object identity is what is meant here.
        header_ids = {id(row) for row in header_rows}
        for row in all_rows:
            if id(row) in header_ids:
                continue
            cells = [cls._cell_text(cell) for cell in row.find_all(['th', 'td'])]
            if cells:
                lines.append('| ' + ' | '.join(cells) + ' |')

        lines.append('')
        return lines

    def get_supported_formats(self) -> tuple:
        """Return the supported ``(input format, output format)`` pair."""
        return ('html', 'markdown')
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""JSON到Markdown格式转换器"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Optional, Dict, List
|
|
4
|
+
from hos_m2f.converters.base_converter import BaseConverter
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class JSONToMDConverter(BaseConverter):
    """Convert JSON documents to Markdown.

    Some keys get special rendering: heading keys become Markdown headings,
    list keys become bullet lists and table keys (a mapping with ``headers``
    and ``rows``) become Markdown tables. Everything else is written as
    indented ``key: value`` lines.
    """

    _HEADING_KEYS = ('title', 'heading', 'Header')
    _LIST_KEYS = ('items', 'list', 'List')
    _TABLE_KEYS = ('table', 'Table')
    # Markdown has six heading levels; deeper nesting is clamped.
    _MAX_HEADING_LEVEL = 6

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert a JSON file to Markdown.

        Args:
            input_content: Binary content of the JSON file.
            options: Conversion options (currently unused).

        Returns:
            bytes: UTF-8 encoded Markdown.

        Raises:
            json.JSONDecodeError: If the input is not valid JSON.
        """
        if options is None:
            options = {}

        # json.loads detects the encoding of bytes input itself, which also
        # copes with a leading byte-order mark.
        json_content = json.loads(input_content)
        return self._json_to_md(json_content).encode('utf-8')

    def _json_to_md(self, data: Any, indent: int = 0) -> str:
        """Render decoded JSON data as Markdown.

        Args:
            data: Decoded JSON value (mapping, list or scalar).
            indent: Nesting level; controls line prefix and heading depth.

        Returns:
            str: Markdown text (no trailing newline).
        """
        md_parts = []
        prefix = ' ' * indent

        if isinstance(data, dict):
            for key, value in data.items():
                if key in self._HEADING_KEYS:
                    level = min(indent + 1, self._MAX_HEADING_LEVEL)
                    md_parts.append('#' * level + ' ' + str(value))
                elif key in self._LIST_KEYS and isinstance(value, list):
                    md_parts.append(f'{prefix}{key}:')
                    # Bullets stay at the key's own indent level.
                    self._append_rendered(md_parts, value, indent)
                elif key in self._TABLE_KEYS and self._is_table(value):
                    md_parts.extend(self._table_lines(value))
                # List/table keys whose value has another shape fall through
                # to the generic rendering instead of being dropped.
                elif isinstance(value, (dict, list)):
                    md_parts.append(f'{prefix}{key}:')
                    self._append_rendered(md_parts, value, indent + 1)
                else:
                    md_parts.append(f'{prefix}{key}: {value}')

        elif isinstance(data, list):
            for item in data:
                if isinstance(item, (dict, list)):
                    md_parts.append(f'{prefix}-')
                    self._append_rendered(md_parts, item, indent + 1)
                else:
                    md_parts.append(f'{prefix}- {item}')

        else:
            # Any scalar (string, number, boolean, null) is written as-is so
            # non-string values are no longer silently lost.
            md_parts.append(f'{prefix}{data}')

        return '\n'.join(md_parts)

    def _append_rendered(self, md_parts: List[str], value: Any, indent: int) -> None:
        """Append the rendering of ``value`` unless it is empty."""
        rendered = self._json_to_md(value, indent)
        if rendered:
            md_parts.append(rendered)

    @staticmethod
    def _is_table(value: Any) -> bool:
        """Return True when ``value`` is a mapping with list ``headers`` and ``rows``."""
        return (
            isinstance(value, dict)
            and isinstance(value.get('headers'), list)
            and isinstance(value.get('rows'), list)
        )

    @staticmethod
    def _table_lines(table: Dict[str, Any]) -> List[str]:
        """Render a ``{'headers': [...], 'rows': [...]}`` mapping as a table."""
        headers = table['headers']
        lines = [
            '| ' + ' | '.join(str(header) for header in headers) + ' |',
            '| ' + ' | '.join(['---'] * len(headers)) + ' |',
        ]
        for row in table['rows']:
            if isinstance(row, dict):
                # Mapping rows are looked up by header name.
                cells = [str(row.get(header, '')) for header in headers]
            elif isinstance(row, list):
                cells = [str(cell) for cell in row]
            else:
                # A scalar row becomes a single-cell row instead of raising.
                cells = [str(row)]
            lines.append('| ' + ' | '.join(cells) + ' |')
        return lines

    def get_supported_formats(self) -> tuple:
        """Return the supported ``(input format, output format)`` pair."""
        return ('json', 'markdown')
|