hos-m2f 0.5.0__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. hos_m2f-0.5.2/PKG-INFO +47 -0
  2. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f/cli/cli.py +34 -7
  3. hos_m2f-0.5.2/hos_m2f/converters/__init__.py +27 -0
  4. hos_m2f-0.5.2/hos_m2f/converters/base_converter.py +30 -0
  5. hos_m2f-0.5.2/hos_m2f/converters/docx_to_md.py +89 -0
  6. hos_m2f-0.5.2/hos_m2f/converters/epub_to_md.py +118 -0
  7. hos_m2f-0.5.2/hos_m2f/converters/html_to_md.py +132 -0
  8. hos_m2f-0.5.2/hos_m2f/converters/json_to_md.py +97 -0
  9. hos_m2f-0.5.2/hos_m2f/converters/md_to_docx.py +171 -0
  10. hos_m2f-0.5.2/hos_m2f/converters/md_to_epub.py +108 -0
  11. hos_m2f-0.5.2/hos_m2f/converters/md_to_html.py +100 -0
  12. hos_m2f-0.5.2/hos_m2f/converters/md_to_json.py +284 -0
  13. hos_m2f-0.5.2/hos_m2f/converters/md_to_xml.py +362 -0
  14. hos_m2f-0.5.2/hos_m2f/converters/xml_to_md.py +109 -0
  15. hos_m2f-0.5.2/hos_m2f.egg-info/PKG-INFO +47 -0
  16. hos_m2f-0.5.2/hos_m2f.egg-info/SOURCES.txt +23 -0
  17. hos_m2f-0.5.2/setup.py +52 -0
  18. hos_m2f-0.5.0/PKG-INFO +0 -22
  19. hos_m2f-0.5.0/hos_m2f.egg-info/PKG-INFO +0 -22
  20. hos_m2f-0.5.0/hos_m2f.egg-info/SOURCES.txt +0 -11
  21. hos_m2f-0.5.0/setup.py +0 -30
  22. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/README.md +0 -0
  23. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f/__init__.py +0 -0
  24. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f/cli/__init__.py +0 -0
  25. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f.egg-info/dependency_links.txt +0 -0
  26. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f.egg-info/entry_points.txt +0 -0
  27. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f.egg-info/requires.txt +0 -0
  28. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/hos_m2f.egg-info/top_level.txt +0 -0
  29. {hos_m2f-0.5.0 → hos_m2f-0.5.2}/setup.cfg +0 -0
hos_m2f-0.5.2/PKG-INFO ADDED
@@ -0,0 +1,47 @@
1
+ Metadata-Version: 2.4
2
+ Name: hos-m2f
3
+ Version: 0.5.2
4
+ Summary: HOS-M2F: Markdown to Industry Standard Format Compiler Engine
5
+ Author: HOS Team
6
+ Author-email: team@hos-m2f.com
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.8
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: mistune>=2.0.0
13
+ Requires-Dist: pyyaml>=6.0
14
+ Requires-Dist: click>=8.0.0
15
+ Requires-Dist: ebooklib>=0.17.0
16
+ Requires-Dist: weasyprint>=54.0
17
+ Requires-Dist: python-docx>=0.8.11
18
+ Dynamic: author
19
+ Dynamic: author-email
20
+ Dynamic: classifier
21
+ Dynamic: description
22
+ Dynamic: description-content-type
23
+ Dynamic: requires-dist
24
+ Dynamic: requires-python
25
+ Dynamic: summary
26
+
27
+ HOS-M2F is a powerful compiler engine that converts Markdown files to various industry standard formats.
28
+
29
+ Key Features:
30
+ - Multiple output formats: PDF, DOCX, EPUB, JSON, HTML, and XML
31
+ - Specialized modes for different document types:
32
+   - Book mode for book-length documents
33
+   - Paper mode for academic papers
34
+   - Patent mode for patent applications
35
+   - SOP (Standard Operating Procedure) mode for technical procedures
36
+ - Command-line interface for easy automation
37
+ - Semantic parsing for intelligent document structure
38
+ - Extensible architecture for custom renderers and modes
39
+
40
+ Use Cases:
41
+ - Convert technical documentation to professional formats
42
+ - Generate academic papers from Markdown sources
43
+ - Create standardized operating procedures
44
+ - Prepare patent applications with proper formatting
45
+ - Build eBooks from Markdown content
46
+
47
+ HOS-M2F simplifies the process of creating professionally formatted documents from plain Markdown, saving time and ensuring consistency across document types.
@@ -27,23 +27,31 @@ class CLI:
27
27
  build_parser = self.subparsers.add_parser('build', help='Build document from Markdown')
28
28
  build_parser.add_argument('input', help='Input Markdown file')
29
29
  build_parser.add_argument('output', help='Output file path')
30
- build_parser.add_argument('--mode', choices=['book', 'patent', 'sop', 'paper'], default='paper', help='Document mode')
31
- build_parser.add_argument('--format', choices=['epub', 'pdf', 'docx', 'json'], default='epub', help='Output format')
30
+ build_parser.add_argument('--mode', default='paper', help='Document mode')
31
+ build_parser.add_argument('--format', choices=['epub', 'pdf', 'docx', 'json', 'html', 'xml'], default='epub', help='Output format')
32
32
  build_parser.add_argument('--options', type=str, default='{}', help='Additional options as JSON string')
33
33
 
34
34
  # check命令
35
35
  check_parser = self.subparsers.add_parser('check', help='Check document structure and compliance')
36
36
  check_parser.add_argument('input', help='Input Markdown file')
37
- check_parser.add_argument('--mode', choices=['book', 'patent', 'sop', 'paper'], default='paper', help='Document mode')
37
+ check_parser.add_argument('--mode', default='paper', help='Document mode')
38
38
  check_parser.add_argument('--options', type=str, default='{}', help='Additional options as JSON string')
39
39
 
40
40
  # parse命令
41
41
  parse_parser = self.subparsers.add_parser('parse', help='Parse Markdown content and output structured data')
42
42
  parse_parser.add_argument('input', help='Input Markdown file')
43
- parse_parser.add_argument('--mode', choices=['book', 'patent', 'sop', 'paper'], default='paper', help='Document mode')
43
+ parse_parser.add_argument('--mode', default='paper', help='Document mode')
44
44
  parse_parser.add_argument('--output', help='Output JSON file (default: stdout)')
45
45
  parse_parser.add_argument('--options', type=str, default='{}', help='Additional options as JSON string')
46
46
 
47
+ # convert命令
48
+ convert_parser = self.subparsers.add_parser('convert', help='Convert between different formats')
49
+ convert_parser.add_argument('input', help='Input file')
50
+ convert_parser.add_argument('output', help='Output file path')
51
+ convert_parser.add_argument('--from', dest='from_format', required=True, choices=['md', 'docx', 'json', 'epub', 'html', 'xml'], help='Input format')
52
+ convert_parser.add_argument('--to', dest='to_format', required=True, choices=['md', 'docx', 'json', 'epub', 'html', 'xml'], help='Output format')
53
+ convert_parser.add_argument('--options', type=str, default='{}', help='Additional options as JSON string')
54
+
47
55
  # preview命令
48
56
  preview_parser = self.subparsers.add_parser('preview', help='Start preview server')
49
57
  preview_parser.add_argument('--port', type=int, default=8000, help='Port to run the server on')
@@ -54,7 +62,7 @@ class CLI:
54
62
 
55
63
  # validate命令
56
64
  validate_parser = self.subparsers.add_parser('validate', help='Validate options for a specific mode')
57
- validate_parser.add_argument('--mode', choices=['book', 'patent', 'sop', 'paper'], required=True, help='Document mode')
65
+ validate_parser.add_argument('--mode', required=True, help='Document mode')
58
66
  validate_parser.add_argument('--options', type=str, required=True, help='Options to validate as JSON string')
59
67
 
60
68
  def run(self, args=None):
@@ -82,6 +90,8 @@ class CLI:
82
90
  return self._run_check(parsed_args)
83
91
  elif parsed_args.command == 'parse':
84
92
  return self._run_parse(parsed_args)
93
+ elif parsed_args.command == 'convert':
94
+ return self._run_convert(parsed_args)
85
95
  elif parsed_args.command == 'preview':
86
96
  return self._run_preview(parsed_args)
87
97
  elif parsed_args.command == 'info':
@@ -266,12 +276,29 @@ class CLI:
266
276
 
267
277
  print('\nValidation passed!')
268
278
  return 0
279
+
280
def _run_convert(self, args):
    """Run the ``convert`` sub-command.

    Args:
        args: Parsed command-line arguments. Reads ``input``, ``output``,
            ``from_format``, ``to_format`` and ``options`` (a JSON string).

    Returns:
        int: Always ``0``; failures surface as exceptions.
    """
    # --options arrives as a JSON string and is decoded before use.
    conversion_options = json.loads(args.options)

    outcome = self.engine.convert(
        args.input,
        args.output,
        args.from_format,
        args.to_format,
        conversion_options,
    )

    # Report what was done.
    summary_lines = (
        f'Successfully converted {args.input} to {args.output}',
        f'From format: {args.from_format}',
        f'To format: {args.to_format}',
    )
    for line in summary_lines:
        print(line)

    # NOTE(review): assumes the engine result is a mapping with a 'metadata'
    # key -- confirm against the engine implementation.
    metadata = outcome['metadata']
    if metadata:
        print('Metadata:', json.dumps(metadata, ensure_ascii=False, indent=2))

    return 0
269
296
 
270
297
 
271
- def main():
298
+ def main(args=None):
272
299
  """CLI命令入口点"""
273
300
  cli = CLI()
274
- sys.exit(cli.run())
301
+ sys.exit(cli.run(args))
275
302
 
276
303
  # 命令行入口
277
304
  if __name__ == '__main__':
@@ -0,0 +1,27 @@
1
+ """基础格式互转模块"""
2
+
3
+ from hos_m2f.converters.base_converter import BaseConverter
4
+ from hos_m2f.converters.md_to_docx import MDToDOCXConverter
5
+ from hos_m2f.converters.md_to_json import MDToJSONConverter
6
+ from hos_m2f.converters.md_to_epub import MDToEPUBConverter
7
+ from hos_m2f.converters.md_to_html import MDToHTMLConverter
8
+ from hos_m2f.converters.md_to_xml import MDToXMLConverter
9
+ from hos_m2f.converters.docx_to_md import DOCXToMDConverter
10
+ from hos_m2f.converters.json_to_md import JSONToMDConverter
11
+ from hos_m2f.converters.epub_to_md import EPUBToMDConverter
12
+ from hos_m2f.converters.html_to_md import HTMLToMDConverter
13
+ from hos_m2f.converters.xml_to_md import XMLToMDConverter
14
+
15
+ __all__ = [
16
+ "BaseConverter",
17
+ "MDToDOCXConverter",
18
+ "MDToJSONConverter",
19
+ "MDToEPUBConverter",
20
+ "MDToHTMLConverter",
21
+ "MDToXMLConverter",
22
+ "DOCXToMDConverter",
23
+ "JSONToMDConverter",
24
+ "EPUBToMDConverter",
25
+ "HTMLToMDConverter",
26
+ "XMLToMDConverter"
27
+ ]
@@ -0,0 +1,30 @@
1
+ """基础转换器类"""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Any, Optional, Dict
5
+
6
+
7
class BaseConverter(ABC):
    """Abstract base class defining the interface of every format converter."""

    @abstractmethod
    def convert(self, input_content: Any, options: Optional[Dict[str, Any]] = None) -> Any:
        """Convert ``input_content`` into the target format.

        Args:
            input_content: The content to convert.
            options: Optional conversion options.

        Returns:
            Any: The converted content.
        """

    @abstractmethod
    def get_supported_formats(self) -> tuple:
        """Report which conversion this converter performs.

        Returns:
            tuple: ``(input_format, output_format)``.
        """
@@ -0,0 +1,89 @@
1
+ """DOCX到Markdown格式转换器"""
2
+
3
+ from typing import Any, Optional, Dict
4
+ from hos_m2f.converters.base_converter import BaseConverter
5
+ from docx import Document
6
+
7
+
8
class DOCXToMDConverter(BaseConverter):
    """Convert a DOCX document to Markdown.

    All paragraphs are emitted first (headings, bullet/numbered list items and
    plain text), followed by all tables. The relative position of tables
    within the body text is therefore not preserved.
    """

    # Markdown only defines heading levels 1 through 6.
    _MAX_HEADING_LEVEL = 6
    # One Markdown nesting level per half inch of left indent.
    _INCHES_PER_INDENT_LEVEL = 0.5

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert DOCX binary data to UTF-8 encoded Markdown.

        Args:
            input_content: Binary content of the DOCX file.
            options: Conversion options (accepted for interface
                compatibility; currently unused).

        Returns:
            bytes: The Markdown text encoded as UTF-8.
        """
        import io

        doc = Document(io.BytesIO(input_content))
        md_content = []

        for paragraph in doc.paragraphs:
            style = paragraph.style
            style_name = (style.name if style is not None else '') or ''

            if style_name.startswith('Heading'):
                level = self._heading_level(style_name)
                md_content.append('#' * level + ' ' + paragraph.text)
            elif style_name in ('List Bullet', 'List Number'):
                prefix = '  ' * self._indent_level(paragraph)
                # Simplified numbering: every numbered item is written as
                # "1." and Markdown renderers renumber the list.
                marker = '1. ' if style_name == 'List Number' else '- '
                md_content.append(prefix + marker + paragraph.text)
            elif paragraph.text:
                md_content.append(paragraph.text)

            # Blank line after every paragraph (including empty ones).
            md_content.append('')

        for table in doc.tables:
            md_content.extend(self._table_to_md(table))

        return '\n'.join(md_content).encode('utf-8')

    def _heading_level(self, style_name: str) -> int:
        """Return the Markdown heading level (1-6) for a ``Heading N`` style name.

        Style names without a numeric second word (e.g. ``Heading``) map to
        level 1 instead of raising.
        """
        words = style_name.split()
        if len(words) >= 2 and words[1].isdecimal():
            return max(1, min(int(words[1]), self._MAX_HEADING_LEVEL))
        return 1

    def _indent_level(self, paragraph) -> int:
        """Return the list nesting level derived from the paragraph's left indent.

        ``left_indent`` is ``None`` when the indent is inherited from the
        style hierarchy; that case is treated as level 0.
        """
        left_indent = paragraph.paragraph_format.left_indent
        if left_indent is None:
            return 0
        return max(0, int(left_indent.inches // self._INCHES_PER_INDENT_LEVEL))

    @staticmethod
    def _cell_text(cell) -> str:
        """Return cell text made safe for a single Markdown table cell."""
        # A literal pipe or newline would otherwise split the row.
        return cell.text.replace('|', '\\|').replace('\n', ' ')

    def _table_to_md(self, table) -> list:
        """Render a DOCX table as Markdown lines; the first row is the header.

        Returns an empty list for a table without rows or header cells.
        """
        rows = [[self._cell_text(cell) for cell in row.cells] for row in table.rows]
        if not rows or not rows[0]:
            return []

        header, body = rows[0], rows[1:]
        lines = [
            '| ' + ' | '.join(header) + ' |',
            '| ' + ' | '.join(['---'] * len(header)) + ' |',
        ]
        lines.extend('| ' + ' | '.join(row) + ' |' for row in body)
        lines.append('')
        return lines

    def get_supported_formats(self) -> tuple:
        """Return the ``(input_format, output_format)`` pair."""
        return ('docx', 'markdown')
@@ -0,0 +1,118 @@
1
+ """EPUB到Markdown格式转换器"""
2
+
3
+ from typing import Any, Optional, Dict
4
+ from hos_m2f.converters.base_converter import BaseConverter
5
+ import ebooklib
6
+ from ebooklib import epub
7
+ from bs4 import BeautifulSoup
8
+
9
+
10
class EPUBToMDConverter(BaseConverter):
    """Convert an EPUB book to Markdown.

    The output starts with a front-matter block (title, author, language when
    present), followed by the content of every document item. Within each
    item, content is grouped by element type: first ``<h1>``, then
    paragraphs, unordered lists, ordered lists and tables.

    NOTE(review): nested lists are visited once per enclosing list, so their
    items can appear more than once in the output.
    """

    # (Dublin Core field, front-matter key)
    _METADATA_FIELDS = (
        ('title', 'title'),
        ('creator', 'author'),
        ('language', 'language'),
    )

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert EPUB binary data to UTF-8 encoded Markdown.

        Args:
            input_content: Binary content of the EPUB file.
            options: Conversion options (accepted for interface
                compatibility; currently unused).

        Returns:
            bytes: The Markdown text encoded as UTF-8.
        """
        import io

        book = epub.read_epub(io.BytesIO(input_content))

        # Front matter from Dublin Core metadata.
        md_content = ['---']
        for dc_field, key in self._METADATA_FIELDS:
            values = book.get_metadata('DC', dc_field)
            if values:
                md_content.append(f'{key}: {values[0][0]}')
        md_content.extend(['---', ''])

        for item in book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue

            soup = BeautifulSoup(item.get_content(), 'html.parser')

            # Chapter title.
            h1 = soup.find('h1')
            if h1:
                md_content.extend([f'# {h1.get_text()}', ''])

            # Paragraphs.
            for p in soup.find_all('p'):
                text = p.get_text().strip()
                if text:
                    md_content.extend([text, ''])

            # Unordered lists.
            for ul in soup.find_all('ul'):
                for li in ul.find_all('li'):
                    text = li.get_text().strip()
                    if text:
                        md_content.append(f'- {text}')
                md_content.append('')

            # Ordered lists.
            for ol in soup.find_all('ol'):
                for i, li in enumerate(ol.find_all('li'), 1):
                    text = li.get_text().strip()
                    if text:
                        md_content.append(f'{i}. {text}')
                md_content.append('')

            # Tables.
            for table in soup.find_all('table'):
                md_content.extend(self._table_to_md(table))

        return '\n'.join(md_content).encode('utf-8')

    @staticmethod
    def _table_to_md(table) -> list:
        """Render an HTML ``<table>`` element as Markdown lines.

        Headers come from ``<thead>`` when present, otherwise from the first
        row. The row used as header is never repeated as a data row, even
        when it lives inside ``<tbody>``. Returns an empty list when no
        header cells are found.
        """
        all_rows = table.find_all('tr')
        thead = table.find('thead')

        if thead:
            header_cells = thead.find_all(['th', 'td'])
            body_rows = [row for row in all_rows if row.find_parent('thead') is None]
        elif all_rows:
            header_cells = all_rows[0].find_all(['th', 'td'])
            body_rows = all_rows[1:]
        else:
            header_cells, body_rows = [], []

        headers = [cell.get_text().strip() for cell in header_cells]
        if not headers:
            return []

        lines = [
            '| ' + ' | '.join(headers) + ' |',
            '| ' + ' | '.join(['---'] * len(headers)) + ' |',
        ]
        for row in body_rows:
            # Include <th> row-header cells so columns stay aligned.
            cells = [cell.get_text().strip() for cell in row.find_all(['td', 'th'])]
            if cells:
                lines.append('| ' + ' | '.join(cells) + ' |')
        lines.append('')
        return lines

    def get_supported_formats(self) -> tuple:
        """Return the ``(input_format, output_format)`` pair."""
        return ('epub', 'markdown')
@@ -0,0 +1,132 @@
1
+ """HTML到Markdown格式转换器"""
2
+
3
+ from typing import Any, Optional, Dict
4
+ from hos_m2f.converters.base_converter import BaseConverter
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
class HTMLToMDConverter(BaseConverter):
    """Convert an HTML document to Markdown.

    Output is grouped by element type rather than document order: headings
    (by level), paragraphs, unordered lists, ordered lists, tables, code
    blocks, links and finally images.

    NOTE(review): nested lists are visited once per enclosing list, so their
    items can appear more than once in the output.
    """

    _LANGUAGE_CLASS_PREFIX = 'language-'

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert HTML binary data to UTF-8 encoded Markdown.

        Args:
            input_content: Binary content of the HTML file.
            options: Conversion options (accepted for interface
                compatibility; currently unused).

        Returns:
            bytes: The Markdown text encoded as UTF-8.
        """
        soup = BeautifulSoup(input_content, 'html.parser')
        return self._html_to_md(soup).encode('utf-8')

    def _html_to_md(self, soup: BeautifulSoup) -> str:
        """Render a parsed HTML tree as a Markdown string.

        Args:
            soup: The parsed document.

        Returns:
            str: The Markdown text.
        """
        md_parts = []

        # Headings, grouped by level.
        for level in range(1, 7):
            for heading in soup.find_all(f'h{level}'):
                md_parts.extend(['#' * level + ' ' + heading.get_text(), ''])

        # Paragraphs.
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text().strip()
            if text:
                md_parts.extend([text, ''])

        # Unordered lists.
        for ul in soup.find_all('ul'):
            for li in ul.find_all('li'):
                text = li.get_text().strip()
                if text:
                    md_parts.append('- ' + text)
            md_parts.append('')

        # Ordered lists.
        for ol in soup.find_all('ol'):
            for i, li in enumerate(ol.find_all('li'), 1):
                text = li.get_text().strip()
                if text:
                    md_parts.append(f'{i}. ' + text)
            md_parts.append('')

        # Tables.
        for table in soup.find_all('table'):
            md_parts.extend(self._table_to_md(table))

        # Code blocks.
        for pre in soup.find_all('pre'):
            code = pre.find('code')
            # A bare <pre> is still preformatted text; keep it, unlabelled.
            source = code if code else pre
            language = self._code_language(code) if code else ''
            md_parts.extend([
                '```' + language,
                source.get_text().rstrip('\n'),
                '```',
                '',
            ])

        # Links.
        for a in soup.find_all('a', href=True):
            text = a.get_text().strip()
            if text:
                md_parts.extend([f'[{text}]({a["href"]})', ''])

        # Images.
        for img in soup.find_all('img', src=True):
            alt = img.get('alt', '')
            md_parts.extend([f'![{alt}]({img["src"]})', ''])

        return '\n'.join(md_parts)

    def _code_language(self, code) -> str:
        """Return the fence language for a ``<code>`` element, or ``''``.

        Prefers a ``language-xxx`` class; otherwise falls back to the first
        class name so that ``<code class="python">`` keeps working.
        """
        classes = code.get('class') or []
        if isinstance(classes, str):
            classes = classes.split()
        prefix = self._LANGUAGE_CLASS_PREFIX
        for name in classes:
            if name.startswith(prefix):
                return name[len(prefix):]
        return classes[0] if classes else ''

    @staticmethod
    def _table_to_md(table) -> list:
        """Render an HTML ``<table>`` element as Markdown lines.

        Headers come from ``<thead>`` when present, otherwise from the first
        row. The row used as header is never repeated as a data row, even
        when it lives inside ``<tbody>``. Returns an empty list when no
        header cells are found.
        """
        all_rows = table.find_all('tr')
        thead = table.find('thead')

        if thead:
            header_cells = thead.find_all(['th', 'td'])
            body_rows = [row for row in all_rows if row.find_parent('thead') is None]
        elif all_rows:
            header_cells = all_rows[0].find_all(['th', 'td'])
            body_rows = all_rows[1:]
        else:
            header_cells, body_rows = [], []

        headers = [cell.get_text().strip() for cell in header_cells]
        if not headers:
            return []

        lines = [
            '| ' + ' | '.join(headers) + ' |',
            '| ' + ' | '.join(['---'] * len(headers)) + ' |',
        ]
        for row in body_rows:
            # Include <th> row-header cells so columns stay aligned.
            cells = [cell.get_text().strip() for cell in row.find_all(['td', 'th'])]
            if cells:
                lines.append('| ' + ' | '.join(cells) + ' |')
        lines.append('')
        return lines

    def get_supported_formats(self) -> tuple:
        """Return the ``(input_format, output_format)`` pair."""
        return ('html', 'markdown')
@@ -0,0 +1,97 @@
1
+ """JSON到Markdown格式转换器"""
2
+
3
+ from typing import Any, Optional, Dict, List
4
+ from hos_m2f.converters.base_converter import BaseConverter
5
+ import json
6
+
7
+
8
class JSONToMDConverter(BaseConverter):
    """Convert a JSON document to Markdown.

    Dictionaries are rendered key by key. A few key names get special
    treatment: heading keys become ``#`` headings, list keys become bullet
    lists, and table keys (a dict with ``headers`` and ``rows`` lists)
    become Markdown tables. Everything else is rendered as indented
    ``key: value`` lines.
    """

    _HEADING_KEYS = ('title', 'heading', 'Header')
    _LIST_KEYS = ('items', 'list', 'List')
    _TABLE_KEYS = ('table', 'Table')
    # Markdown only defines heading levels 1 through 6.
    _MAX_HEADING_LEVEL = 6

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert JSON data to UTF-8 encoded Markdown.

        Args:
            input_content: JSON document as UTF-8 bytes (a leading BOM is
                tolerated); an already decoded ``str`` is accepted as well.
            options: Conversion options (accepted for interface
                compatibility; currently unused).

        Returns:
            bytes: The Markdown text encoded as UTF-8.

        Raises:
            json.JSONDecodeError: If the input is not valid JSON.
        """
        if isinstance(input_content, (bytes, bytearray)):
            # 'utf-8-sig' strips a BOM that json.loads would reject.
            input_content = bytes(input_content).decode('utf-8-sig')

        json_content = json.loads(input_content)
        return self._json_to_md(json_content).encode('utf-8')

    def _json_to_md(self, data: Any, indent: int = 0) -> str:
        """Render decoded JSON data as Markdown.

        Args:
            data: Decoded JSON value (dict, list or scalar).
            indent: Nesting depth; each level adds two spaces of indentation.

        Returns:
            str: The Markdown text.
        """
        md_parts = []
        prefix = '  ' * indent

        if isinstance(data, dict):
            for key, value in data.items():
                if key in self._HEADING_KEYS:
                    level = min(indent + 1, self._MAX_HEADING_LEVEL)
                    md_parts.append('#' * level + ' ' + str(value))
                elif key in self._LIST_KEYS and isinstance(value, list):
                    md_parts.append(f'{prefix}{key}:')
                    for item in value:
                        # The nested rendering carries its own indentation;
                        # strip it from the first line so the text directly
                        # follows the bullet marker.
                        rendered = self._json_to_md(item, indent + 1).lstrip()
                        md_parts.append(f'{prefix}- {rendered}')
                elif key in self._TABLE_KEYS and self._is_table(value):
                    md_parts.extend(self._table_to_md(value))
                elif isinstance(value, (dict, list)):
                    # Also reached for list/table keys whose value does not
                    # have the expected shape, so no data is dropped.
                    md_parts.append(f'{prefix}{key}:')
                    md_parts.append(self._json_to_md(value, indent + 1))
                else:
                    md_parts.append(f'{prefix}{key}: {value}')

        elif isinstance(data, list):
            for item in data:
                if isinstance(item, (dict, list)):
                    md_parts.append(f'{prefix}-')
                    md_parts.append(self._json_to_md(item, indent + 1))
                else:
                    md_parts.append(f'{prefix}- {item}')

        else:
            # Scalars: strings, numbers, booleans and null.
            md_parts.append(f'{prefix}{data}')

        return '\n'.join(md_parts)

    @staticmethod
    def _is_table(value: Any) -> bool:
        """Return True if ``value`` is a dict with list-valued ``headers`` and ``rows``."""
        return (
            isinstance(value, dict)
            and isinstance(value.get('headers'), list)
            and isinstance(value.get('rows'), list)
        )

    @staticmethod
    def _table_to_md(table: Dict[str, Any]) -> List[str]:
        """Render a ``{'headers': [...], 'rows': [...]}`` dict as Markdown table lines.

        Rows may be dicts keyed by header, sequences of cells, or single
        scalar values.
        """
        headers = table['headers']
        lines = [
            '| ' + ' | '.join(str(header) for header in headers) + ' |',
            '| ' + ' | '.join(['---'] * len(headers)) + ' |',
        ]
        for row in table['rows']:
            if isinstance(row, dict):
                cells = [str(row.get(header, '')) for header in headers]
            elif isinstance(row, (list, tuple)):
                cells = [str(cell) for cell in row]
            else:
                cells = [str(row)]
            lines.append('| ' + ' | '.join(cells) + ' |')
        return lines

    def get_supported_formats(self) -> tuple:
        """Return the ``(input_format, output_format)`` pair."""
        return ('json', 'markdown')