PyPI - hos-m2f - Versions diffs - 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

hos-m2f 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

hos_m2f/cli/cli.py +34 -7
hos_m2f/converters/__init__.py +27 -0
hos_m2f/converters/base_converter.py +30 -0
hos_m2f/converters/docx_to_md.py +89 -0
hos_m2f/converters/epub_to_md.py +118 -0
hos_m2f/converters/html_to_md.py +132 -0
hos_m2f/converters/json_to_md.py +97 -0
hos_m2f/converters/md_to_docx.py +171 -0
hos_m2f/converters/md_to_epub.py +108 -0
hos_m2f/converters/md_to_html.py +100 -0
hos_m2f/converters/md_to_json.py +284 -0
hos_m2f/converters/md_to_xml.py +362 -0
hos_m2f/converters/xml_to_md.py +109 -0
{hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/METADATA +1 -1
hos_m2f-0.5.2.dist-info/RECORD +20 -0
hos_m2f-0.5.1.dist-info/RECORD +0 -8
{hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/WHEEL +0 -0
{hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/entry_points.txt +0 -0
{hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/top_level.txt +0 -0

hos_m2f/converters/md_to_xml.py ADDED Viewed

@@ -0,0 +1,362 @@
+"""Markdown到XML格式转换器"""
+from typing import Any, Optional, Dict
+from hos_m2f.converters.base_converter import BaseConverter
+import mistune
+class MDToXMLConverter(BaseConverter):
+    """Markdown到XML格式转换器"""
+    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
+        """将Markdown转换为XML
+        Args:
+            input_content: Markdown内容
+            options: 转换选项
+        Returns:
+            bytes: XML文件的二进制数据
+        """
+        if options is None:
+            options = {}
+        # 解析Markdown
+        structure = self._parse_markdown(input_content)
+        # 转换为XML
+        xml_content = self._structure_to_xml(structure)
+        # 生成完整的XML文档
+        full_xml = f'''
+        <?xml version="1.0" encoding="UTF-8"?>
+        <document>
+            {xml_content}
+        </document>
+        '''
+        return full_xml.encode('utf-8')
+    def _parse_markdown(self, content: str) -> Dict[str, Any]:
+        """解析Markdown内容为结构化数据
+        Args:
+            content: Markdown内容
+        Returns:
+            Dict[str, Any]: 结构化数据
+        """
+        lines = content.split('\n')
+        structure = {
+            'type': 'document',
+            'children': [],
+            'metadata': {}
+        }
+        current_heading = None
+        current_level = 0
+        current_paragraph = []
+        current_list = None
+        current_list_items = []
+        list_level = 0
+        # 解析YAML头
+        if lines and lines[0] == '---':
+            metadata = []
+            for i, line in enumerate(lines[1:]):
+                if line == '---':
+                    break
+                metadata.append(line)
+            # 解析YAML元数据
+            if metadata:
+                import yaml
+                try:
+                    metadata_content = '\n'.join(metadata)
+                    structure['metadata'] = yaml.safe_load(metadata_content)
+                except Exception:
+                    pass
+            # 跳过YAML头
+            lines = lines[i+2:]
+        # 解析内容
+        for line in lines:
+            line = line.rstrip()
+            # 处理标题
+            if line.startswith('#'):
+                # 保存当前段落
+                if current_paragraph:
+                    structure['children'].append({
+                        'type': 'paragraph',
+                        'content': '\n'.join(current_paragraph)
+                    })
+                    current_paragraph = []
+                # 保存当前列表
+                if current_list is not None:
+                    structure['children'].append({
+                        'type': 'list',
+                        'ordered': current_list,
+                        'items': current_list_items
+                    })
+                    current_list = None
+                    current_list_items = []
+                # 解析标题
+                level = len(line.split(' ')[0])
+                title = line[level:].strip()
+                structure['children'].append({
+                    'type': 'heading',
+                    'level': level,
+                    'content': title
+                })
+                current_heading = title
+                current_level = level
+            # 处理有序列表
+            elif line.startswith('1. ') or line.startswith('\t1. ') or line.startswith('  1. '):
+                # 保存当前段落
+                if current_paragraph:
+                    structure['children'].append({
+                        'type': 'paragraph',
+                        'content': '\n'.join(current_paragraph)
+                    })
+                    current_paragraph = []
+                # 开始新列表
+                if current_list is None:
+                    current_list = True
+                elif current_list != True:
+                    structure['children'].append({
+                        'type': 'list',
+                        'ordered': current_list,
+                        'items': current_list_items
+                    })
+                    current_list = True
+                    current_list_items = []
+                # 解析列表项
+                content = line.lstrip('1234567890. \t')
+                current_list_items.append({
+                    'type': 'list_item',
+                    'content': content
+                })
+            # 处理无序列表
+            elif line.startswith('- ') or line.startswith('* ') or line.startswith('+ ') or \
+                 line.startswith('\t- ') or line.startswith('\t* ') or line.startswith('\t+ ') or \
+                 line.startswith('  - ') or line.startswith('  * ') or line.startswith('  + '):
+                # 保存当前段落
+                if current_paragraph:
+                    structure['children'].append({
+                        'type': 'paragraph',
+                        'content': '\n'.join(current_paragraph)
+                    })
+                    current_paragraph = []
+                # 开始新列表
+                if current_list is None:
+                    current_list = False
+                elif current_list != False:
+                    structure['children'].append({
+                        'type': 'list',
+                        'ordered': current_list,
+                        'items': current_list_items
+                    })
+                    current_list = False
+                    current_list_items = []
+                # 解析列表项
+                content = line.lstrip('-*+ \t')
+                current_list_items.append({
+                    'type': 'list_item',
+                    'content': content
+                })
+            # 处理代码块
+            elif line.startswith('```'):
+                # 保存当前段落
+                if current_paragraph:
+                    structure['children'].append({
+                        'type': 'paragraph',
+                        'content': '\n'.join(current_paragraph)
+                    })
+                    current_paragraph = []
+                # 保存当前列表
+                if current_list is not None:
+                    structure['children'].append({
+                        'type': 'list',
+                        'ordered': current_list,
+                        'items': current_list_items
+                    })
+                    current_list = None
+                    current_list_items = []
+                # 解析代码块
+                code_lines = []
+                language = line[3:].strip()
+                # 读取代码内容
+                for i, code_line in enumerate(lines[lines.index(line)+1:]):
+                    if code_line.startswith('```'):
+                        break
+                    code_lines.append(code_line)
+                structure['children'].append({
+                    'type': 'code_block',
+                    'language': language,
+                    'content': '\n'.join(code_lines)
+                })
+                # 跳过已处理的代码行
+                lines = lines[:lines.index(line)] + lines[lines.index(line)+i+2:]
+            # 处理表格
+            elif line.startswith('|') and '|' in line[1:]:
+                # 保存当前段落
+                if current_paragraph:
+                    structure['children'].append({
+                        'type': 'paragraph',
+                        'content': '\n'.join(current_paragraph)
+                    })
+                    current_paragraph = []
+                # 保存当前列表
+                if current_list is not None:
+                    structure['children'].append({
+                        'type': 'list',
+                        'ordered': current_list,
+                        'items': current_list_items
+                    })
+                    current_list = None
+                    current_list_items = []
+                # 解析表格
+                table_lines = [line]
+                # 读取表格内容
+                for i, table_line in enumerate(lines[lines.index(line)+1:]):
+                    if table_line.startswith('|'):
+                        table_lines.append(table_line)
+                    else:
+                        break
+                # 解析表格结构
+                if len(table_lines) >= 2:
+                    headers = [h.strip() for h in table_lines[0].split('|') if h.strip()]
+                    rows = []
+                    # 跳过分隔线
+                    for table_line in table_lines[2:]:
+                        cells = [c.strip() for c in table_line.split('|') if c.strip()]
+                        if cells:
+                            rows.append(dict(zip(headers, cells)))
+                    structure['children'].append({
+                        'type': 'table',
+                        'headers': headers,
+                        'rows': rows
+                    })
+                # 跳过已处理的表格行
+                lines = lines[:lines.index(line)] + lines[lines.index(line)+i+1:]
+            # 处理段落
+            else:
+                if line or current_paragraph:
+                    current_paragraph.append(line)
+        # 保存最后一个段落
+        if current_paragraph:
+            structure['children'].append({
+                'type': 'paragraph',
+                'content': '\n'.join(current_paragraph)
+            })
+        # 保存最后一个列表
+        if current_list is not None:
+            structure['children'].append({
+                'type': 'list',
+                'ordered': current_list,
+                'items': current_list_items
+            })
+        return structure
+    def _structure_to_xml(self, structure: Dict[str, Any]) -> str:
+        """将结构化数据转换为XML
+        Args:
+            structure: 结构化数据
+        Returns:
+            str: XML字符串
+        """
+        xml_parts = []
+        # 处理元数据
+        if structure.get('metadata'):
+            xml_parts.append('<metadata>')
+            for key, value in structure['metadata'].items():
+                xml_parts.append(f'<{key}>{self._escape_xml(str(value))}</{key}>')
+            xml_parts.append('</metadata>')
+        # 处理子元素
+        for child in structure.get('children', []):
+            if child['type'] == 'heading':
+                xml_parts.append(f'<heading level="{child["level"]}">{self._escape_xml(child["content"])}</heading>')
+            elif child['type'] == 'paragraph':
+                xml_parts.append(f'<paragraph>{self._escape_xml(child["content"])}</paragraph>')
+            elif child['type'] == 'list':
+                list_type = 'ordered' if child['ordered'] else 'unordered'
+                xml_parts.append(f'<list type="{list_type}">')
+                for item in child['items']:
+                    xml_parts.append(f'<list_item>{self._escape_xml(item["content"])}</list_item>')
+                xml_parts.append('</list>')
+            elif child['type'] == 'code_block':
+                xml_parts.append(f'<code_block language="{child.get("language", "")}">{self._escape_xml(child["content"])}</code_block>')
+            elif child['type'] == 'table':
+                xml_parts.append('<table>')
+                xml_parts.append('<headers>')
+                for header in child['headers']:
+                    xml_parts.append(f'<header>{self._escape_xml(header)}</header>')
+                xml_parts.append('</headers>')
+                xml_parts.append('<rows>')
+                for row in child['rows']:
+                    xml_parts.append('<row>')
+                    for key, value in row.items():
+                        xml_parts.append(f'<cell column="{self._escape_xml(key)}">{self._escape_xml(value)}</cell>')
+                    xml_parts.append('</row>')
+                xml_parts.append('</rows>')
+                xml_parts.append('</table>')
+        return '\n'.join(xml_parts)
+    def _escape_xml(self, text: str) -> str:
+        """转义XML特殊字符
+        Args:
+            text: 原始文本
+        Returns:
+            str: 转义后的文本
+        """
+        escape_map = {
+            '&': '&amp;',
+            '<': '&lt;',
+            '>': '&gt;',
+            '"': '&quot;',
+            "'": '&apos;'
+        }
+        for char, replacement in escape_map.items():
+            text = text.replace(char, replacement)
+        return text
+    def get_supported_formats(self) -> tuple:
+        """获取支持的格式"""
+        return ('markdown', 'xml')

hos_m2f/converters/xml_to_md.py ADDED Viewed

@@ -0,0 +1,109 @@
+"""XML到Markdown格式转换器"""
+from typing import Any, Optional, Dict
+from hos_m2f.converters.base_converter import BaseConverter
+import xml.etree.ElementTree as ET
+class XMLToMDConverter(BaseConverter):
+    """XML到Markdown格式转换器"""
+    def convert(self, input_content: bytes, options: Optional[Dict[str, Any]] = None) -> bytes:
+        """将XML转换为Markdown
+        Args:
+            input_content: XML文件的二进制数据
+            options: 转换选项
+        Returns:
+            bytes: Markdown文件的二进制数据
+        """
+        if options is None:
+            options = {}
+        # 解析XML
+        root = ET.fromstring(input_content)
+        # 转换为Markdown
+        md_content = self._xml_to_md(root)
+        return md_content.encode('utf-8')
+    def _xml_to_md(self, element: ET.Element, indent: int = 0) -> str:
+        """将XML元素转换为Markdown
+        Args:
+            element: XML元素
+            indent: 缩进级别
+        Returns:
+            str: Markdown字符串
+        """
+        md_parts = []
+        prefix = '  ' * indent
+        # 处理元素内容
+        if element.text and element.text.strip():
+            # 处理标题
+            if element.tag in ['heading', 'title', 'Header', 'Title']:
+                md_parts.append('#' * (indent + 1) + ' ' + element.text.strip())
+            # 处理段落
+            elif element.tag in ['paragraph', 'p', 'Paragraph']:
+                md_parts.append(element.text.strip())
+            # 处理列表项
+            elif element.tag in ['list_item', 'item', 'ListItem']:
+                md_parts.append(f'{prefix}- {element.text.strip()}')
+            # 处理代码块
+            elif element.tag in ['code_block', 'code', 'Code']:
+                language = element.get('language', '')
+                md_parts.append(f'```{language}' if language else '```')
+                md_parts.append(element.text.strip())
+                md_parts.append('```')
+            # 处理普通文本
+            else:
+                md_parts.append(f'{prefix}{element.tag}: {element.text.strip()}')
+        # 处理子元素
+        for child in element:
+            # 处理列表
+            if child.tag in ['list', 'List']:
+                list_type = child.get('type', 'unordered')
+                md_parts.append(f'{prefix}{child.tag}:')
+                for item in child:
+                    if item.tag in ['list_item', 'item', 'ListItem']:
+                        item_text = item.text.strip() if item.text else ''
+                        md_parts.append(f'{prefix}- {item_text}')
+            # 处理表格
+            elif child.tag in ['table', 'Table']:
+                # 提取表头
+                headers = []
+                for header in child.findall('.//header'):
+                    headers.append(header.text.strip() if header.text else '')
+                if headers:
+                    # 生成表格
+                    md_parts.append('| ' + ' | '.join(headers) + ' |')
+                    md_parts.append('| ' + ' | '.join(['---'] * len(headers)) + ' |')
+                    # 提取表格数据
+                    for row in child.findall('.//row'):
+                        cells = []
+                        for cell in row:
+                            cells.append(cell.text.strip() if cell.text else '')
+                        if cells:
+                            md_parts.append('| ' + ' | '.join(cells) + ' |')
+            # 处理其他子元素
+            else:
+                child_md = self._xml_to_md(child, indent + 1)
+                if child_md:
+                    md_parts.append(child_md)
+        # 添加空行
+        if md_parts:
+            md_parts.append('')
+        return '\n'.join(md_parts)
+    def get_supported_formats(self) -> tuple:
+        """获取支持的格式"""
+        return ('xml', 'markdown')

{hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hos-m2f
-Version: 0.5.1
+Version: 0.5.2
 Summary: HOS-M2F: Markdown to Industry Standard Format Compiler Engine
 Author: HOS Team
 Author-email: team@hos-m2f.com

hos_m2f-0.5.2.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,20 @@
+hos_m2f/__init__.py,sha256=v4k4TbKzPb3nbzgKJHaID3QTSpmTvAsGVHZ-poa870I,178
+hos_m2f/cli/__init__.py,sha256=NqhmK68K2evHjP7qcyT7FUWlDqfb22CUpCOfJhnQzPs,68
+hos_m2f/cli/cli.py,sha256=0bWtYmOoNE8h_rrBlwS-4yJwIRnRTtuBx3DWnMkZ4Qo,11920
+hos_m2f/converters/__init__.py,sha256=d88A1sTrQsoMzrTipg7jKTWfI83GJzlRFVFNibajeag,971
+hos_m2f/converters/base_converter.py,sha256=4xqcAFMT82va6VesgM_HybUPIpP77x0DrQSYzb1jf28,696
+hos_m2f/converters/docx_to_md.py,sha256=_HBp3TOD9ZkTFhHR_f3ObLlpDcv0tnSPjPfeGxuvhjM,3064
+hos_m2f/converters/epub_to_md.py,sha256=cFfHmK4IrJKwzEWVE3ue7Jw8tBfWu1q7wG9o7oMf4Pw,4612
+hos_m2f/converters/html_to_md.py,sha256=26GqdynSxKKO2NTxPKgfFs9bTuisLaEIJdBhz4CJ5Eg,4487
+hos_m2f/converters/json_to_md.py,sha256=jeLBQ3jTkgA5a2Kr2gsOPjZB-D4PZxumciFHbyPKNmc,3670
+hos_m2f/converters/md_to_docx.py,sha256=5l4Q8F0-9dM0gnpZmC9C-QAKZ58LHZ9CTZ4EK9Yv5UU,5573
+hos_m2f/converters/md_to_epub.py,sha256=HF0YJ3efvuG6ts45N7IaLIH_4O9VrWG5aqczF4mGPk0,2993
+hos_m2f/converters/md_to_html.py,sha256=ss6Uud2mPhoIMctQWKeGpRHa0FtUqB9573ZB6cnKucA,2827
+hos_m2f/converters/md_to_json.py,sha256=XqeIqwrubuLOU4dTmveAMmyuGD-lK57GVHh9nzAdtXo,10295
+hos_m2f/converters/md_to_xml.py,sha256=RXNdPlkXtvQXLXBuv_xNAu5UXHQo7dF31bynBy9yDIs,13266
+hos_m2f/converters/xml_to_md.py,sha256=zOkaEaSZdvyHag05kIHiWF4VyGMMjfmWmBllBpzwJ4E,4051
+hos_m2f-0.5.2.dist-info/METADATA,sha256=vDnHHOcme8BeiVGqIk_AcHNtzOPFuNsL3XImjG2Lg3U,1764
+hos_m2f-0.5.2.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+hos_m2f-0.5.2.dist-info/entry_points.txt,sha256=NeLjg1hvVt_A2sDUVZAYbfkvnZ1nGMcTqRiDoVQzn0w,49
+hos_m2f-0.5.2.dist-info/top_level.txt,sha256=DMIK2jdfJss-FB_GRZ6iw4gahhZUAvSI0fHamOPL9mE,8
+hos_m2f-0.5.2.dist-info/RECORD,,

hos_m2f-0.5.1.dist-info/RECORD DELETED Viewed

@@ -1,8 +0,0 @@
-hos_m2f/__init__.py,sha256=v4k4TbKzPb3nbzgKJHaID3QTSpmTvAsGVHZ-poa870I,178
-hos_m2f/cli/__init__.py,sha256=NqhmK68K2evHjP7qcyT7FUWlDqfb22CUpCOfJhnQzPs,68
-hos_m2f/cli/cli.py,sha256=61agI2zcmgH5vtBNayueMGD7sVrqhMe5eFpvgd9j7r0,10639
-hos_m2f-0.5.1.dist-info/METADATA,sha256=8cdJz_maLr7TK51prDlcqULg2mCr-XyPYz3ZOlDq8cM,1764
-hos_m2f-0.5.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-hos_m2f-0.5.1.dist-info/entry_points.txt,sha256=NeLjg1hvVt_A2sDUVZAYbfkvnZ1nGMcTqRiDoVQzn0w,49
-hos_m2f-0.5.1.dist-info/top_level.txt,sha256=DMIK2jdfJss-FB_GRZ6iw4gahhZUAvSI0fHamOPL9mE,8
-hos_m2f-0.5.1.dist-info/RECORD,,

{hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{hos_m2f-0.5.1.dist-info → hos_m2f-0.5.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

hos-m2f 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

hos-m2f 0.5.1__py3-none-any.whl → 0.5.2__py3-none-any.whl