PyPI - hos-m2f - Versions diffs - 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl - Mend

hos-m2f 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

hos_m2f/cli/__init__.py +1 -1
hos_m2f/converters/md_to_docx.py +172 -41
hos_m2f/converters/md_to_epub.py +37 -77
hos_m2f/converters/md_to_html.py +2 -14
hos_m2f/converters/md_to_json.py +40 -20
hos_m2f/converters/md_to_latex.py +63 -0
hos_m2f/converters/md_to_xml.py +40 -20
hos_m2f/converters/pdf_to_md.py +120 -0
{hos_m2f-0.5.3.dist-info → hos_m2f-0.5.5.dist-info}/METADATA +1 -1
hos_m2f-0.5.5.dist-info/RECORD +26 -0
{hos_m2f-0.5.3.dist-info → hos_m2f-0.5.5.dist-info}/entry_points.txt +1 -0
{hos_m2f-0.5.3.dist-info → hos_m2f-0.5.5.dist-info}/top_level.txt +1 -0
tests/__init__.py +1 -0
tests/test_converters.py +179 -0
tests/test_latex.py +182 -0
tests/test_modes.py +202 -0
hos_m2f-0.5.3.dist-info/RECORD +0 -20
{hos_m2f-0.5.3.dist-info → hos_m2f-0.5.5.dist-info}/WHEEL +0 -0

hos_m2f/cli/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """CLI模块"""
-from hos_m2f.cli.cli import CLI
+from .cli import CLI
 __all__ = ['CLI']

hos_m2f/converters/md_to_docx.py CHANGED Viewed

@@ -2,8 +2,8 @@
 from typing import Any, Optional, Dict
 from docx import Document
-from docx.shared import Inches, Pt
-from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.shared import Inches, Pt, RGBColor
+from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE
 from docx.enum.style import WD_STYLE_TYPE
 from hos_m2f.converters.base_converter import BaseConverter
 import mistune
@@ -31,22 +31,6 @@ class MDToDOCXConverter(BaseConverter):
         # 设置默认样式
         self._setup_styles(doc)
-        # 解析Markdown
-        markdown = mistune.create_markdown(
-            plugins=[
-                'url',
-                'abbr',
-                'def_list',
-                'footnotes',
-                'tables',
-                'task_lists',
-                'strikethrough',
-                'highlight',
-                'superscript',
-                'subscript'
-            ]
-        )
         # 自定义渲染器
         class DOCXRenderer(mistune.HTMLRenderer):
             def __init__(self, doc):
@@ -92,35 +76,130 @@ class MDToDOCXConverter(BaseConverter):
                 return ''
             def table(self, text):
-                # 简化处理，实际项目中需要更复杂的表格解析
-                self.doc.add_paragraph('Table: ' + text[:100] + '...')
+                # 解析Markdown表格并转换为DOCX表格
+                try:
+                    # 分割表格行
+                    rows = text.strip().split('\n')
+                    if not rows:
+                        return ''
+                    # 解析表头
+                    header_cells = [cell.strip() for cell in rows[0].split('|') if cell.strip()]
+                    if not header_cells:
+                        return ''
+                    # 创建表格
+                    table = self.doc.add_table(rows=1, cols=len(header_cells))
+                    table.style = 'Table Grid'
+                    # 填充表头
+                    header_row = table.rows[0]
+                    for i, cell_text in enumerate(header_cells):
+                        cell = header_row.cells[i]
+                        cell.text = cell_text
+                        # 设置表头样式
+                        for paragraph in cell.paragraphs:
+                            for run in paragraph.runs:
+                                run.bold = True
+                                run.font.size = Pt(11)
+                            paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
+                    # 跳过分隔线行，并解析对齐方式
+                    alignments = []
+                    if len(rows) > 1 and '---' in rows[1]:
+                        # 解析对齐方式
+                        alignment_row = rows[1]
+                        alignment_cells = [cell.strip() for cell in alignment_row.split('|') if cell.strip()]
+                        for cell in alignment_cells:
+                            if cell.startswith(':') and cell.endswith(':'):
+                                alignments.append(WD_ALIGN_PARAGRAPH.CENTER)
+                            elif cell.endswith(':'):
+                                alignments.append(WD_ALIGN_PARAGRAPH.RIGHT)
+                            else:
+                                alignments.append(WD_ALIGN_PARAGRAPH.LEFT)
+                        data_rows = rows[2:]
+                    else:
+                        data_rows = rows[1:]
+                    # 填充数据行
+                    for row in data_rows:
+                        cells = [cell.strip() for cell in row.split('|') if cell.strip()]
+                        if cells:
+                            new_row = table.add_row()
+                            for i, cell_text in enumerate(cells):
+                                if i < len(new_row.cells):
+                                    cell = new_row.cells[i]
+                                    cell.text = cell_text
+                                    # 设置对齐方式
+                                    if i < len(alignments):
+                                        for paragraph in cell.paragraphs:
+                                            paragraph.alignment = alignments[i]
+                except Exception as e:
+                    # 如果解析失败，回退到简单处理
+                    self.doc.add_paragraph('Table: ' + text[:100] + '...')
                 return ''
-            def image(self, src, alt='', title=None):
+            def image(self, text, url=None, title=None, alt=None):
                 try:
-                    # 简化处理，实际项目中需要处理本地和远程图片
-                    self.doc.add_paragraph(f'Image: {alt} ({src})')
-                except Exception:
-                    pass
+                    # 尝试处理本地和远程图片
+                    import os
+                    import requests
+                    from io import BytesIO
+                    # 使用alt作为替代文本
+                    if alt is None:
+                        alt = text
+                    # 检查是否有图片URL
+                    if not url:
+                        self.doc.add_paragraph(f'Image: {alt}')
+                        return ''
+                    # 检查是否是本地图片
+                    if os.path.exists(url):
+                        # 添加本地图片
+                        self.doc.add_picture(url)
+                    else:
+                        # 尝试从网络获取图片
+                        response = requests.get(url, timeout=5)
+                        if response.status_code == 200:
+                            # 添加远程图片
+                            image_stream = BytesIO(response.content)
+                            self.doc.add_picture(image_stream)
+                        else:
+                            # 如果获取失败，添加图片描述
+                            self.doc.add_paragraph(f'Image: {alt} ({url})')
+                except Exception as e:
+                    # 如果处理失败，添加图片描述
+                    self.doc.add_paragraph(f'Image: {alt or text} ({url or ""})')
                 return ''
-            def link(self, link, text=None, title=None):
-                if text:
+            def link(self, text, url=None, title=None):
+                if text and url:
+                    # 简化处理，直接添加文本和链接
                     p = self.doc.add_paragraph()
                     run = p.add_run(text)
-                    # 实际项目中需要添加超链接
+                    run.font.color.rgb = RGBColor(0, 0, 255)  # 蓝色
+                    run.underline = WD_UNDERLINE.SINGLE
+                    p.add_run(f' ({url})')
+                elif text:
+                    p = self.doc.add_paragraph(text)
+                elif url:
+                    p = self.doc.add_paragraph(url)
                 return ''
             def emphasis(self, text):
-                if self.current_paragraph:
-                    run = self.current_paragraph.add_run(text)
-                    run.italic = True
+                # 直接添加斜体文本
+                p = self.doc.add_paragraph()
+                run = p.add_run(text)
+                run.italic = True
                 return ''
             def strong(self, text):
-                if self.current_paragraph:
-                    run = self.current_paragraph.add_run(text)
-                    run.bold = True
+                # 直接添加粗体文本
+                p = self.doc.add_paragraph()
+                run = p.add_run(text)
+                run.bold = True
                 return ''
             def codespan(self, text):
@@ -129,16 +208,68 @@ class MDToDOCXConverter(BaseConverter):
                 run.font.name = 'Courier New'
                 return ''
-            def block_code(self, code, lang=None):
-                p = self.doc.add_paragraph()
-                run = p.add_run(code)
-                run.font.name = 'Courier New'
-                p.paragraph_format.left_indent = Inches(0.5)
-                return ''
+            def block_code(self, code, info=None):
+                # 处理Mermaid图表
+                if info == 'mermaid':
+                    try:
+                        # 尝试渲染Mermaid图表为图片
+                        mermaid_image = self._render_mermaid(code)
+                        if mermaid_image:
+                            # 添加图片
+                            self.doc.add_picture(mermaid_image)
+                            return ''
+                        else:
+                            # 如果渲染失败，添加代码块
+                            p = self.doc.add_paragraph('Mermaid Chart:')
+                            p = self.doc.add_paragraph(code)
+                            p.paragraph_format.left_indent = Inches(0.5)
+                            return ''
+                    except Exception as e:
+                        # 如果处理失败，添加代码块
+                        p = self.doc.add_paragraph('Mermaid Chart:')
+                        p = self.doc.add_paragraph(code)
+                        p.paragraph_format.left_indent = Inches(0.5)
+                        return ''
+                else:
+                    # 处理普通代码块
+                    p = self.doc.add_paragraph()
+                    run = p.add_run(code)
+                    run.font.name = 'Courier New'
+                    p.paragraph_format.left_indent = Inches(0.5)
+                    return ''
+            def _render_mermaid(self, mermaid_code):
+                """渲染Mermaid图表为图片"""
+                # 使用mermaid.ink API渲染Mermaid图表
+                try:
+                    import requests
+                    from io import BytesIO
+                    import urllib.parse
+                    # 编码Mermaid代码
+                    encoded_code = urllib.parse.quote(mermaid_code)
+                    # 构建API URL
+                    url = f"https://mermaid.ink/img/{encoded_code}"
+                    # 发送请求
+                    response = requests.get(url, timeout=10)
+                    if response.status_code == 200:
+                        # 返回图片数据流
+                        return BytesIO(response.content)
+                    else:
+                        # 如果API调用失败，返回None
+                        return None
+                except Exception as e:
+                    # 如果处理失败，返回None
+                    print(f"Error rendering Mermaid chart: {e}")
+                    return None
         # 渲染Markdown
         renderer = DOCXRenderer(doc)
-        markdown(input_content, renderer)
+        markdown = mistune.create_markdown(renderer=renderer)
+        markdown(input_content)
         # 保存为二进制数据
         import io

hos_m2f/converters/md_to_epub.py CHANGED Viewed

@@ -2,14 +2,18 @@
 from typing import Any, Optional, Dict
 from hos_m2f.converters.base_converter import BaseConverter
-import ebooklib
-from ebooklib import epub
-import mistune
+from hos_m2f.renderers.epub_renderer import EPUBRenderer
+from hos_m2f.structure.book_parser import BookParser
 class MDToEPUBConverter(BaseConverter):
     """Markdown到EPUB格式转换器"""
+    def __init__(self):
+        """初始化转换器"""
+        self.renderer = EPUBRenderer()
+        self.book_parser = BookParser()
     def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
         """将Markdown转换为EPUB
@@ -23,85 +27,41 @@ class MDToEPUBConverter(BaseConverter):
         if options is None:
             options = {}
-        # 创建EPUB书籍
-        book = epub.EpubBook()
-        # 设置元数据
-        book.set_identifier('id12345')
-        book.set_title(options.get('title', 'Untitled'))
-        book.set_language(options.get('language', 'zh'))
-        book.add_author(options.get('author', 'Unknown'))
-        # 添加封面
-        if 'cover' in options:
-            cover_image = epub.EpubItem(
-                uid='cover-image',
-                file_name='images/cover.jpg',
-                media_type='image/jpeg',
-                content=options['cover']
-            )
-            book.add_item(cover_image)
-            book.set_cover('images/cover.jpg', cover_image)
-        # 解析Markdown
-        markdown = mistune.create_markdown(
-            plugins=[
-                'url',
-                'abbr',
-                'def_list',
-                'footnotes',
-                'tables',
-                'task_lists',
-                'strikethrough',
-                'highlight',
-                'superscript',
-                'subscript'
-            ]
-        )
+        # 使用BookParser解析Markdown内容
+        parsed_content = self.book_parser.parse(input_content, options)
-        # 转换为HTML
-        html_content = markdown(input_content)
+        # 增强解析结果
+        parsed_content = self._enhance_parsed_content(parsed_content, options)
-        # 创建章节
-        chapter = epub.EpubHtml(
-            title=options.get('title', 'Chapter 1'),
-            file_name='chapter1.xhtml',
-            lang='zh'
-        )
-        chapter.content = f'''
-        <!DOCTYPE html>
-        <html>
-        <head>
-            <title>{options.get('title', 'Untitled')}</title>
-            <meta charset="utf-8" />
-        </head>
-        <body>
-            <h1>{options.get('title', 'Untitled')}</h1>
-            {html_content}
-        </body>
-        </html>
-        '''
+        # 使用EPUBRenderer渲染EPUB文件
+        epub_content = self.renderer.render(parsed_content, options)
-        # 添加章节
-        book.add_item(chapter)
-        # 创建目录
-        book.toc = [chapter]
-        # 添加导航文件
-        book.add_item(epub.EpubNcx())
-        book.add_item(epub.EpubNav())
-        # 定义spine
-        book.spine = ['nav', chapter]
+        return epub_content
+    def _enhance_parsed_content(self, parsed_content: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
+        """增强解析结果"""
+        # 添加选项中的元数据
+        if 'title' in options:
+            parsed_content.setdefault('book_metadata', {})['title'] = options['title']
+        if 'author' in options:
+            parsed_content.setdefault('book_metadata', {})['author'] = options['author']
+        if 'language' in options:
+            parsed_content.setdefault('book_metadata', {})['language'] = options['language']
+        if 'publisher' in options:
+            parsed_content.setdefault('book_metadata', {})['publisher'] = options['publisher']
+        if 'publish_date' in options:
+            parsed_content.setdefault('book_metadata', {})['publish_date'] = options['publish_date']
+        if 'description' in options:
+            parsed_content.setdefault('book_metadata', {})['description'] = options['description']
-        # 保存为二进制数据
-        import io
-        output = io.BytesIO()
-        epub.write_epub(output, book, {})
-        output.seek(0)
+        # 添加封面信息
+        if 'cover' in options:
+            parsed_content['cover'] = {
+                'src': options['cover'],
+                'type': 'image'
+            }
-        return output.getvalue()
+        return parsed_content
     def get_supported_formats(self) -> tuple:
         """获取支持的格式"""

hos_m2f/converters/md_to_html.py CHANGED Viewed

@@ -22,20 +22,8 @@ class MDToHTMLConverter(BaseConverter):
             options = {}
         # 解析Markdown
-        markdown = mistune.create_markdown(
-            plugins=[
-                'url',
-                'abbr',
-                'def_list',
-                'footnotes',
-                'tables',
-                'task_lists',
-                'strikethrough',
-                'highlight',
-                'superscript',
-                'subscript'
-            ]
-        )
+        markdown = mistune.create_markdown()
         # 转换为HTML
         html_content = markdown(input_content)

hos_m2f/converters/md_to_json.py CHANGED Viewed

@@ -193,19 +193,30 @@ class MDToJSONConverter(BaseConverter):
                 language = line[3:].strip()
                 # 读取代码内容
-                for i, code_line in enumerate(lines[lines.index(line)+1:]):
-                    if code_line.startswith('```'):
-                        break
-                    code_lines.append(code_line)
-                structure['children'].append({
-                    'type': 'code_block',
-                    'language': language,
-                    'content': '\n'.join(code_lines)
-                })
-                # 跳过已处理的代码行
-                lines = lines[:lines.index(line)] + lines[lines.index(line)+i+2:]
+                try:
+                    line_idx = lines.index(line)
+                    code_end_idx = line_idx + 1
+                    for i, code_line in enumerate(lines[line_idx+1:]):
+                        if code_line.startswith('```'):
+                            code_end_idx = line_idx + i + 1
+                            break
+                        code_lines.append(code_line)
+                        code_end_idx = line_idx + i + 1
+                    structure['children'].append({
+                        'type': 'code_block',
+                        'language': language,
+                        'content': '\n'.join(code_lines)
+                    })
+                    # 跳过已处理的代码行
+                    if code_end_idx < len(lines):
+                        lines = lines[:line_idx] + lines[code_end_idx+1:]
+                    else:
+                        lines = lines[:line_idx]
+                except ValueError:
+                    # 如果找不到行，跳过代码块解析
+                    continue
             # 处理表格
             elif line.startswith('|') and '|' in line[1:]:
@@ -231,19 +242,28 @@ class MDToJSONConverter(BaseConverter):
                 table_lines = [line]
                 # 读取表格内容
-                for i, table_line in enumerate(lines[lines.index(line)+1:]):
-                    if table_line.startswith('|'):
-                        table_lines.append(table_line)
-                    else:
-                        break
+                try:
+                    line_idx = lines.index(line)
+                    for i, table_line in enumerate(lines[line_idx+1:]):
+                        if table_line.startswith('|'):
+                            table_lines.append(table_line)
+                        else:
+                            break
+                except ValueError:
+                    # 如果找不到行，跳过表格解析
+                    continue
                 # 解析表格结构
                 if len(table_lines) >= 2:
                     headers = [h.strip() for h in table_lines[0].split('|') if h.strip()]
                     rows = []
-                    # 跳过分隔线
-                    for table_line in table_lines[2:]:
+                    # 跳过分隔线（如果存在）
+                    start_idx = 1
+                    if len(table_lines) > 1 and any('---' in cell for cell in table_lines[1].split('|')):
+                        start_idx = 2
+                    for table_line in table_lines[start_idx:]:
                         cells = [c.strip() for c in table_line.split('|') if c.strip()]
                         if cells:
                             rows.append(dict(zip(headers, cells)))

hos_m2f/converters/md_to_latex.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Markdown到LaTeX格式转换器"""
+from typing import Any, Optional, Dict
+from hos_m2f.converters.base_converter import BaseConverter
+from hos_m2f.renderers.latex_renderer import LaTeXRenderer
+from hos_m2f.structure.semantic_parser import SemanticParser
+class MDToLaTeXConverter(BaseConverter):
+    """Markdown到LaTeX格式转换器"""
+    def __init__(self):
+        """初始化转换器"""
+        self.renderer = LaTeXRenderer()
+        self.parser = SemanticParser()
+    def convert(self, input_content: str, options: Optional[Dict[str, Any]] = None) -> bytes:
+        """将Markdown转换为LaTeX
+        Args:
+            input_content: Markdown内容
+            options: 转换选项
+        Returns:
+            bytes: LaTeX文件的二进制数据
+        """
+        if options is None:
+            options = {}
+        # 使用SemanticParser解析Markdown内容
+        parsed_content = self.parser.parse(input_content)
+        # 增强解析结果
+        parsed_content = self._enhance_parsed_content(parsed_content, options)
+        # 使用LaTeXRenderer渲染LaTeX文件
+        latex_content = self.renderer.render(parsed_content, options)
+        return latex_content
+    def _enhance_parsed_content(self, parsed_content: Dict[str, Any], options: Dict[str, Any]) -> Dict[str, Any]:
+        """增强解析结果"""
+        # 添加选项中的元数据
+        if 'title' in options:
+            parsed_content.setdefault('metadata', {})['title'] = options['title']
+        if 'author' in options:
+            parsed_content.setdefault('metadata', {})['author'] = options['author']
+        if 'date' in options:
+            parsed_content.setdefault('metadata', {})['date'] = options['date']
+        if 'abstract' in options:
+            parsed_content.setdefault('metadata', {})['abstract'] = options['abstract']
+        if 'keywords' in options:
+            parsed_content.setdefault('metadata', {})['keywords'] = options['keywords']
+        # 添加文档类型
+        if 'document_class' in options:
+            parsed_content['document_class'] = options['document_class']
+        return parsed_content
+    def get_supported_formats(self) -> tuple:
+        """获取支持的格式"""
+        return ('markdown', 'latex')

hos_m2f/converters/md_to_xml.py CHANGED Viewed

@@ -200,19 +200,30 @@ class MDToXMLConverter(BaseConverter):
                 language = line[3:].strip()
                 # 读取代码内容
-                for i, code_line in enumerate(lines[lines.index(line)+1:]):
-                    if code_line.startswith('```'):
-                        break
-                    code_lines.append(code_line)
-                structure['children'].append({
-                    'type': 'code_block',
-                    'language': language,
-                    'content': '\n'.join(code_lines)
-                })
-                # 跳过已处理的代码行
-                lines = lines[:lines.index(line)] + lines[lines.index(line)+i+2:]
+                try:
+                    line_idx = lines.index(line)
+                    code_end_idx = line_idx + 1
+                    for i, code_line in enumerate(lines[line_idx+1:]):
+                        if code_line.startswith('```'):
+                            code_end_idx = line_idx + i + 1
+                            break
+                        code_lines.append(code_line)
+                        code_end_idx = line_idx + i + 1
+                    structure['children'].append({
+                        'type': 'code_block',
+                        'language': language,
+                        'content': '\n'.join(code_lines)
+                    })
+                    # 跳过已处理的代码行
+                    if code_end_idx < len(lines):
+                        lines = lines[:line_idx] + lines[code_end_idx+1:]
+                    else:
+                        lines = lines[:line_idx]
+                except ValueError:
+                    # 如果找不到行，跳过代码块解析
+                    continue
             # 处理表格
             elif line.startswith('|') and '|' in line[1:]:
@@ -238,19 +249,28 @@ class MDToXMLConverter(BaseConverter):
                 table_lines = [line]
                 # 读取表格内容
-                for i, table_line in enumerate(lines[lines.index(line)+1:]):
-                    if table_line.startswith('|'):
-                        table_lines.append(table_line)
-                    else:
-                        break
+                try:
+                    line_idx = lines.index(line)
+                    for i, table_line in enumerate(lines[line_idx+1:]):
+                        if table_line.startswith('|'):
+                            table_lines.append(table_line)
+                        else:
+                            break
+                except ValueError:
+                    # 如果找不到行，跳过表格解析
+                    continue
                 # 解析表格结构
                 if len(table_lines) >= 2:
                     headers = [h.strip() for h in table_lines[0].split('|') if h.strip()]
                     rows = []
-                    # 跳过分隔线
-                    for table_line in table_lines[2:]:
+                    # 跳过分隔线（如果存在）
+                    start_idx = 1
+                    if len(table_lines) > 1 and any('---' in cell for cell in table_lines[1].split('|')):
+                        start_idx = 2
+                    for table_line in table_lines[start_idx:]:
                         cells = [c.strip() for c in table_line.split('|') if c.strip()]
                         if cells:
                             rows.append(dict(zip(headers, cells)))

hos-m2f 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl

hos-m2f 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl