hos-m2f 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,362 @@
1
+ """Markdown到XML格式转换器"""
2
+
3
+ from typing import Any, Optional, Dict
4
+ from hos_m2f.converters.base_converter import BaseConverter
5
+ import mistune
6
+
7
+
8
class MDToXMLConverter(BaseConverter):
    """Convert Markdown text into a small XML document.

    The Markdown is scanned line by line into a flat list of block nodes
    (heading, paragraph, list, code_block, table) plus optional YAML front
    matter; that structure is then serialised as XML inside a ``<document>``
    root element.  Inline Markdown (emphasis, links, ...) is not interpreted
    and nested lists are flattened.
    """

    # Translation table so the five XML special characters are escaped in a
    # single pass; this avoids any dependence on replacement order ('&' would
    # otherwise have to be handled first).
    _XML_ESCAPES = str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&apos;',
    })

    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert Markdown to XML.

        Args:
            input_content: Markdown text.  UTF-8 encoded bytes are accepted
                as well and decoded first.
            options: Conversion options.  Accepted for interface
                compatibility; no option is read by this converter.

        Returns:
            bytes: The UTF-8 encoded XML document.
        """
        if isinstance(input_content, (bytes, bytearray)):
            input_content = bytes(input_content).decode('utf-8')

        structure = self._parse_markdown(input_content)
        xml_content = self._structure_to_xml(structure)

        # The XML declaration must be the very first thing in the document;
        # any whitespace in front of it makes the output not well-formed.
        full_xml = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<document>\n'
            f'{xml_content}\n'
            '</document>\n'
        )
        return full_xml.encode('utf-8')

    def _parse_markdown(self, content: str) -> Dict[str, Any]:
        """Parse Markdown text into a structure of block nodes.

        Args:
            content: Markdown text.

        Returns:
            Dict[str, Any]: ``{'type': 'document', 'children': [...],
            'metadata': {...}}`` where each child is a dict whose ``type`` is
            one of ``heading``, ``paragraph``, ``list``, ``code_block`` or
            ``table``.
        """
        # Normalise line endings so '\r' never leaks into comparisons.
        lines = content.replace('\r\n', '\n').replace('\r', '\n').split('\n')
        children: list = []
        structure: Dict[str, Any] = {
            'type': 'document',
            'children': children,
            'metadata': {},
        }

        # YAML front matter: only honoured when a closing '---' exists, so an
        # unterminated header cannot swallow the whole document.
        index = 0
        if lines[0].rstrip() == '---':
            closing = next(
                (pos for pos in range(1, len(lines)) if lines[pos].rstrip() == '---'),
                None,
            )
            if closing is not None:
                structure['metadata'] = self._load_front_matter(lines[1:closing])
                index = closing + 1

        paragraph: list = []
        list_items: list = []
        list_ordered: Optional[bool] = None  # None means "no list is open"

        def flush_paragraph() -> None:
            """Emit the paragraph collected so far, if any."""
            if paragraph:
                children.append({
                    'type': 'paragraph',
                    'content': '\n'.join(paragraph),
                })
                paragraph.clear()

        def flush_list() -> None:
            """Emit the list collected so far, if any."""
            nonlocal list_items, list_ordered
            if list_ordered is not None:
                children.append({
                    'type': 'list',
                    'ordered': list_ordered,
                    'items': list_items,
                })
                list_items = []
                list_ordered = None

        # An explicit index is used (instead of ``for line in lines``) so that
        # multi-line constructs can consume the lines they own.
        total = len(lines)
        while index < total:
            line = lines[index].rstrip()
            index += 1

            # A blank line ends a paragraph; an open list stays open so that
            # items separated by blank lines remain in one list.
            if not line:
                flush_paragraph()
                continue

            # Fenced code block: everything up to the closing fence is taken
            # verbatim and never re-interpreted as Markdown.
            if line.startswith('```'):
                flush_paragraph()
                flush_list()
                language = line[3:].strip()
                code_lines = []
                while index < total and not lines[index].startswith('```'):
                    code_lines.append(lines[index])
                    index += 1
                index += 1  # skip the closing fence (harmless if it is missing)
                children.append({
                    'type': 'code_block',
                    'language': language,
                    'content': '\n'.join(code_lines),
                })
                continue

            heading = self._match_heading(line)
            if heading is not None:
                flush_paragraph()
                flush_list()
                level, title = heading
                children.append({
                    'type': 'heading',
                    'level': level,
                    'content': title,
                })
                continue

            item = self._match_list_item(line)
            if item is not None:
                ordered, text = item
                flush_paragraph()
                # Switching between ordered and unordered starts a new list.
                if list_ordered is not None and list_ordered != ordered:
                    flush_list()
                list_ordered = ordered
                list_items.append({
                    'type': 'list_item',
                    'content': text,
                })
                continue

            if line.startswith('|') and '|' in line[1:]:
                run_end = index
                while run_end < total and lines[run_end].startswith('|'):
                    run_end += 1
                table_lines = [line] + [row.rstrip() for row in lines[index:run_end]]
                # A single pipe line is not a table; it falls through and is
                # kept as paragraph text instead of being dropped.
                if len(table_lines) >= 2:
                    flush_paragraph()
                    flush_list()
                    children.append(self._build_table(table_lines))
                    index = run_end
                    continue

            # Plain text: closes any open list so block order is preserved.
            flush_list()
            paragraph.append(line)

        flush_paragraph()
        flush_list()
        return structure

    @staticmethod
    def _load_front_matter(raw_lines: list) -> Dict[str, Any]:
        """Parse YAML front matter lines into a metadata dict (best effort)."""
        text = '\n'.join(raw_lines)
        if not text.strip():
            return {}

        import yaml
        try:
            loaded = yaml.safe_load(text)
        except Exception:
            # Deliberately best-effort: malformed metadata must not abort the
            # conversion of the document body.
            return {}
        # Front matter may legally be a scalar or a list; only mappings can
        # be rendered as metadata entries.
        return loaded if isinstance(loaded, dict) else {}

    @staticmethod
    def _match_heading(line: str) -> Optional[tuple]:
        """Return ``(level, title)`` for an ATX heading line, else ``None``."""
        hashes = len(line) - len(line.lstrip('#'))
        if not 1 <= hashes <= 6:
            return None
        rest = line[hashes:]
        # '#tag' is ordinary text: the hashes must be followed by whitespace
        # (or by nothing at all, which is an empty heading).
        if rest and not rest[0].isspace():
            return None
        return hashes, rest.strip()

    @staticmethod
    def _match_list_item(line: str) -> Optional[tuple]:
        """Return ``(ordered, text)`` for a list item line, else ``None``."""
        stripped = line.lstrip(' \t')
        if stripped[:1] in ('-', '*', '+') and stripped[1:2] == ' ':
            return False, stripped[2:].strip()
        digits = len(stripped) - len(stripped.lstrip('0123456789'))
        if digits and stripped[digits:digits + 2] == '. ':
            # Only the marker is removed; digits that belong to the item
            # text itself are kept.
            return True, stripped[digits + 2:].strip()
        return None

    @staticmethod
    def _split_table_row(line: str) -> list:
        """Split a pipe-delimited table line into stripped cell strings."""
        inner = line.strip()
        if inner.startswith('|'):
            inner = inner[1:]
        if inner.endswith('|'):
            inner = inner[:-1]
        # Empty cells are kept so that later cells stay in their column.
        return [cell.strip() for cell in inner.split('|')]

    def _is_table_separator(self, line: str) -> bool:
        """Tell whether a line is a header separator such as ``|---|:-:|``."""
        return all(
            cell and '-' in cell and set(cell) <= {'-', ':'}
            for cell in self._split_table_row(line)
        )

    def _build_table(self, table_lines: list) -> Dict[str, Any]:
        """Build a table node from consecutive pipe-delimited lines."""
        headers = self._split_table_row(table_lines[0])
        if self._is_table_separator(table_lines[1]):
            body = table_lines[2:]
        else:
            body = table_lines[1:]

        rows = []
        for row_line in body:
            cells = self._split_table_row(row_line)
            if any(cells):
                # Pad short rows so every header gets a (possibly empty) cell.
                cells += [''] * (len(headers) - len(cells))
                rows.append(dict(zip(headers, cells)))

        return {
            'type': 'table',
            'headers': headers,
            'rows': rows,
        }

    @staticmethod
    def _xml_name(key: str) -> str:
        """Turn an arbitrary metadata key into a usable XML element name.

        This approximates the XML Name rule: characters other than letters,
        digits, ``_``, ``-`` and ``.`` become ``_``, and a name that does not
        start with a letter or ``_`` gets a ``_`` prefix.
        """
        cleaned = ''.join(
            ch if (ch.isalnum() or ch in '_-.') else '_'
            for ch in key
        )
        if not cleaned or not (cleaned[0].isalpha() or cleaned[0] == '_'):
            cleaned = '_' + cleaned
        return cleaned

    def _structure_to_xml(self, structure: Dict[str, Any]) -> str:
        """Serialise a parsed structure as XML elements.

        Args:
            structure: Structure as produced by ``_parse_markdown``.

        Returns:
            str: The XML elements, one per line, without the XML declaration
            and without the surrounding ``<document>`` element.
        """
        escape = self._escape_xml
        xml_parts = []

        metadata = structure.get('metadata')
        if isinstance(metadata, dict) and metadata:
            xml_parts.append('<metadata>')
            for key, value in metadata.items():
                tag = self._xml_name(str(key))
                xml_parts.append(f'<{tag}>{escape(str(value))}</{tag}>')
            xml_parts.append('</metadata>')

        for child in structure.get('children', []):
            kind = child['type']
            if kind == 'heading':
                xml_parts.append(
                    f'<heading level="{child["level"]}">{escape(child["content"])}</heading>'
                )
            elif kind == 'paragraph':
                xml_parts.append(f'<paragraph>{escape(child["content"])}</paragraph>')
            elif kind == 'list':
                list_type = 'ordered' if child['ordered'] else 'unordered'
                xml_parts.append(f'<list type="{list_type}">')
                for item in child['items']:
                    xml_parts.append(f'<list_item>{escape(item["content"])}</list_item>')
                xml_parts.append('</list>')
            elif kind == 'code_block':
                # The info string comes straight from the document, so it is
                # escaped like any other attribute value.
                language = escape(child.get('language', ''))
                xml_parts.append(
                    f'<code_block language="{language}">{escape(child["content"])}</code_block>'
                )
            elif kind == 'table':
                xml_parts.append('<table>')
                xml_parts.append('<headers>')
                for header in child['headers']:
                    xml_parts.append(f'<header>{escape(header)}</header>')
                xml_parts.append('</headers>')
                xml_parts.append('<rows>')
                for row in child['rows']:
                    xml_parts.append('<row>')
                    for key, value in row.items():
                        xml_parts.append(
                            f'<cell column="{escape(key)}">{escape(value)}</cell>'
                        )
                    xml_parts.append('</row>')
                xml_parts.append('</rows>')
                xml_parts.append('</table>')

        return '\n'.join(xml_parts)

    def _escape_xml(self, text: str) -> str:
        """Escape the XML special characters ``& < > " '``.

        Args:
            text: Raw text.

        Returns:
            str: Text that is safe to use as element content or inside a
            quoted attribute value.
        """
        return text.translate(self._XML_ESCAPES)

    def get_supported_formats(self) -> tuple:
        """Return the ``(source, target)`` format pair of this converter."""
        return ('markdown', 'xml')
@@ -0,0 +1,109 @@
1
+ """XML到Markdown格式转换器"""
2
+
3
+ from typing import Any, Optional, Dict
4
+ from hos_m2f.converters.base_converter import BaseConverter
5
+ import xml.etree.ElementTree as ET
6
+
7
+
8
class XMLToMDConverter(BaseConverter):
    """Convert an XML document into Markdown text.

    Well-known element names (heading, paragraph, list, code_block, table,
    and a few aliases) are mapped to the matching Markdown construct; any
    other element that carries text is rendered as ``tag: text``.
    """

    _HEADING_TAGS = ('heading', 'title', 'Header', 'Title')
    _PARAGRAPH_TAGS = ('paragraph', 'p', 'Paragraph')
    _LIST_TAGS = ('list', 'List')
    _LIST_ITEM_TAGS = ('list_item', 'item', 'ListItem')
    _CODE_TAGS = ('code_block', 'code', 'Code')
    _TABLE_TAGS = ('table', 'Table')

    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
        """Convert XML to Markdown.

        Args:
            input_content: The XML document (bytes, or text).
            options: Conversion options.  Accepted for interface
                compatibility; no option is read by this converter.

        Returns:
            bytes: The UTF-8 encoded Markdown text.

        Raises:
            xml.etree.ElementTree.ParseError: If the input is not well-formed
                XML.
        """
        # Whitespace in front of the XML declaration is a parse error, so it
        # is dropped first; this keeps documents readable that were written
        # with a leading newline.
        #
        # NOTE(security): the ElementTree documentation warns that the
        # standard-library XML modules are not secure against maliciously
        # constructed data.  Do not feed untrusted XML to this converter
        # without an additional safeguard.
        root = ET.fromstring(input_content.lstrip())

        md_content = self._xml_to_md(root)
        return md_content.encode('utf-8')

    def _xml_to_md(self, element: ET.Element, indent: int = 0) -> str:
        """Render one XML element, and recursively its children, as Markdown.

        Args:
            element: The XML element to render.
            indent: Nesting depth of ``element`` below the root; used for the
                text prefix and as fallback heading level.

        Returns:
            str: The Markdown for this element, ending with a blank line when
            anything was produced, otherwise an empty string.
        """
        md_parts = []
        prefix = ' ' * indent
        tag = element.tag
        text = element.text.strip() if element.text else ''

        if text:
            if tag in self._HEADING_TAGS:
                md_parts.append('#' * self._heading_level(element, indent) + ' ' + text)
            elif tag in self._PARAGRAPH_TAGS:
                md_parts.append(text)
            elif tag in self._LIST_ITEM_TAGS:
                md_parts.append(f'{prefix}- {text}')
            elif tag in self._CODE_TAGS:
                md_parts.append('```' + element.get('language', ''))
                md_parts.extend(self._code_lines(element.text))
                md_parts.append('```')
            else:
                md_parts.append(f'{prefix}{tag}: {text}')

        for child in element:
            if child.tag in self._LIST_TAGS:
                ordered = child.get('type', 'unordered') == 'ordered'
                number = 0
                for item in child:
                    if item.tag not in self._LIST_ITEM_TAGS:
                        continue
                    number += 1
                    marker = f'{number}.' if ordered else '-'
                    item_text = item.text.strip() if item.text else ''
                    md_parts.append(f'{prefix}{marker} {item_text}')
                if number:
                    # Blank line so that following text is not absorbed into
                    # the last list item.
                    md_parts.append('')
            elif child.tag in self._TABLE_TAGS:
                table_rows = []
                headers = [self._cell_text(header) for header in child.findall('.//header')]
                if headers:
                    table_rows.append('| ' + ' | '.join(headers) + ' |')
                    table_rows.append('| ' + ' | '.join(['---'] * len(headers)) + ' |')
                for row in child.findall('.//row'):
                    cells = [self._cell_text(cell) for cell in row]
                    if cells:
                        table_rows.append('| ' + ' | '.join(cells) + ' |')
                if table_rows:
                    md_parts.extend(table_rows)
                    md_parts.append('')
            else:
                child_md = self._xml_to_md(child, indent + 1)
                if child_md:
                    md_parts.append(child_md)

        # A trailing empty entry separates this block from whatever follows.
        if md_parts:
            md_parts.append('')

        return '\n'.join(md_parts)

    @staticmethod
    def _heading_level(element: ET.Element, indent: int) -> int:
        """Return the Markdown heading level (1-6) for a heading element.

        A valid ``level`` attribute wins; otherwise the level is derived from
        the nesting depth.
        """
        raw_level = element.get('level')
        if raw_level is not None:
            try:
                level = int(raw_level)
            except ValueError:
                level = 0
            if 1 <= level <= 6:
                return level
        # Markdown has no heading deeper than six hashes.
        return max(1, min(indent + 1, 6))

    @staticmethod
    def _code_lines(raw_text: str) -> list:
        """Split code text into lines, dropping blank lines at both ends.

        Indentation inside the code is preserved, unlike a plain ``strip()``
        which would remove the first line's leading whitespace.
        """
        code_lines = raw_text.splitlines()
        while code_lines and not code_lines[0].strip():
            code_lines.pop(0)
        while code_lines and not code_lines[-1].strip():
            code_lines.pop()
        return code_lines

    @staticmethod
    def _cell_text(node: ET.Element) -> str:
        """Return an element's text prepared for use as a table cell."""
        text = node.text.strip() if node.text else ''
        # A cell must stay on one line, and a literal pipe would otherwise
        # start a new column.
        return text.replace('\n', ' ').replace('|', '\\|')

    def get_supported_formats(self) -> tuple:
        """Return the ``(source, target)`` format pair of this converter."""
        return ('xml', 'markdown')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hos-m2f
3
- Version: 0.5.1
3
+ Version: 0.5.2
4
4
  Summary: HOS-M2F: Markdown to Industry Standard Format Compiler Engine
5
5
  Author: HOS Team
6
6
  Author-email: team@hos-m2f.com
@@ -0,0 +1,20 @@
1
+ hos_m2f/__init__.py,sha256=v4k4TbKzPb3nbzgKJHaID3QTSpmTvAsGVHZ-poa870I,178
2
+ hos_m2f/cli/__init__.py,sha256=NqhmK68K2evHjP7qcyT7FUWlDqfb22CUpCOfJhnQzPs,68
3
+ hos_m2f/cli/cli.py,sha256=0bWtYmOoNE8h_rrBlwS-4yJwIRnRTtuBx3DWnMkZ4Qo,11920
4
+ hos_m2f/converters/__init__.py,sha256=d88A1sTrQsoMzrTipg7jKTWfI83GJzlRFVFNibajeag,971
5
+ hos_m2f/converters/base_converter.py,sha256=4xqcAFMT82va6VesgM_HybUPIpP77x0DrQSYzb1jf28,696
6
+ hos_m2f/converters/docx_to_md.py,sha256=_HBp3TOD9ZkTFhHR_f3ObLlpDcv0tnSPjPfeGxuvhjM,3064
7
+ hos_m2f/converters/epub_to_md.py,sha256=cFfHmK4IrJKwzEWVE3ue7Jw8tBfWu1q7wG9o7oMf4Pw,4612
8
+ hos_m2f/converters/html_to_md.py,sha256=26GqdynSxKKO2NTxPKgfFs9bTuisLaEIJdBhz4CJ5Eg,4487
9
+ hos_m2f/converters/json_to_md.py,sha256=jeLBQ3jTkgA5a2Kr2gsOPjZB-D4PZxumciFHbyPKNmc,3670
10
+ hos_m2f/converters/md_to_docx.py,sha256=5l4Q8F0-9dM0gnpZmC9C-QAKZ58LHZ9CTZ4EK9Yv5UU,5573
11
+ hos_m2f/converters/md_to_epub.py,sha256=HF0YJ3efvuG6ts45N7IaLIH_4O9VrWG5aqczF4mGPk0,2993
12
+ hos_m2f/converters/md_to_html.py,sha256=ss6Uud2mPhoIMctQWKeGpRHa0FtUqB9573ZB6cnKucA,2827
13
+ hos_m2f/converters/md_to_json.py,sha256=XqeIqwrubuLOU4dTmveAMmyuGD-lK57GVHh9nzAdtXo,10295
14
+ hos_m2f/converters/md_to_xml.py,sha256=RXNdPlkXtvQXLXBuv_xNAu5UXHQo7dF31bynBy9yDIs,13266
15
+ hos_m2f/converters/xml_to_md.py,sha256=zOkaEaSZdvyHag05kIHiWF4VyGMMjfmWmBllBpzwJ4E,4051
16
+ hos_m2f-0.5.2.dist-info/METADATA,sha256=vDnHHOcme8BeiVGqIk_AcHNtzOPFuNsL3XImjG2Lg3U,1764
17
+ hos_m2f-0.5.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
18
+ hos_m2f-0.5.2.dist-info/entry_points.txt,sha256=NeLjg1hvVt_A2sDUVZAYbfkvnZ1nGMcTqRiDoVQzn0w,49
19
+ hos_m2f-0.5.2.dist-info/top_level.txt,sha256=DMIK2jdfJss-FB_GRZ6iw4gahhZUAvSI0fHamOPL9mE,8
20
+ hos_m2f-0.5.2.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- hos_m2f/__init__.py,sha256=v4k4TbKzPb3nbzgKJHaID3QTSpmTvAsGVHZ-poa870I,178
2
- hos_m2f/cli/__init__.py,sha256=NqhmK68K2evHjP7qcyT7FUWlDqfb22CUpCOfJhnQzPs,68
3
- hos_m2f/cli/cli.py,sha256=61agI2zcmgH5vtBNayueMGD7sVrqhMe5eFpvgd9j7r0,10639
4
- hos_m2f-0.5.1.dist-info/METADATA,sha256=8cdJz_maLr7TK51prDlcqULg2mCr-XyPYz3ZOlDq8cM,1764
5
- hos_m2f-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
6
- hos_m2f-0.5.1.dist-info/entry_points.txt,sha256=NeLjg1hvVt_A2sDUVZAYbfkvnZ1nGMcTqRiDoVQzn0w,49
7
- hos_m2f-0.5.1.dist-info/top_level.txt,sha256=DMIK2jdfJss-FB_GRZ6iw4gahhZUAvSI0fHamOPL9mE,8
8
- hos_m2f-0.5.1.dist-info/RECORD,,