pydatamax 0.1.15.post2.tar.gz → 0.1.16.post1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/PKG-INFO +1 -1
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/core.py +1 -1
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/doc_parser.py +88 -30
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/docx_parser.py +178 -45
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/uno_handler.py +17 -6
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/PKG-INFO +1 -1
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/setup.py +1 -1
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/LICENSE +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/README.md +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/__init__.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/loader/__init__.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/loader/core.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/loader/minio_handler.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/loader/oss_handler.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/__init__.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/base.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/csv_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/epub_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/html_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/image_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/json_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/md_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/pdf_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/ppt_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/pptx_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/txt_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/xls_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/parser/xlsx_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/__init__.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/constants.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/data_cleaner.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/env_setup.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/gotocr_pdf.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/mineru_operator.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/paddleocr_pdf_operator.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/ppt_extract.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/qa_generator.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/datamax/utils/tokenizer.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/SOURCES.txt +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/dependency_links.txt +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/requires.txt +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/pydatamax.egg-info/top_level.txt +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/setup.cfg +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/tests/test_doc_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/tests/test_docx_format_analysis.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/tests/test_docx_parser.py +0 -0
- {pydatamax-0.1.15.post2 → pydatamax-0.1.16.post1}/tests/test_wps_doc.py +0 -0
datamax/parser/core.py:

```diff
@@ -87,7 +87,7 @@ class ParserFactory:
             )
         elif parser_class_name == 'DocxParser' or parser_class_name == 'DocParser':
             return parser_class(
-                file_path=file_path, to_markdown=to_markdown
+                file_path=file_path, to_markdown=to_markdown, use_uno=True
             )
         elif parser_class_name == 'XlsxParser':
             return parser_class(
```
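This one-line change makes UNO-based conversion the default whenever the factory builds a Word-family parser. A hedged sketch of what the factory now constructs for a `.doc` input; the keyword arguments come from the hunk above, everything else about the constructor is an assumption:

```python
# Sketch only: DocParser's constructor is assumed to accept exactly the
# keyword arguments shown in the hunk above.
from datamax.parser.doc_parser import DocParser

# As of 0.1.16.post1 the factory passes use_uno=True, so conversion is routed
# through a persistent LibreOffice UNO service; if the uno module cannot be
# imported, the parser logs a warning and falls back to the command line.
parser = DocParser(file_path="report.doc", to_markdown=False, use_uno=True)
```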
datamax/parser/doc_parser.py:

```diff
@@ -26,6 +26,21 @@ try:
     from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
 except ImportError:
     HAS_UNO = False
+    logger.error(
+        "❌ Failed to import the UNO handler!\n"
+        "🔧 Solutions:\n"
+        "1. Install LibreOffice and python-uno:\n"
+        "   - Ubuntu/Debian: sudo apt-get install libreoffice python3-uno\n"
+        "   - CentOS/RHEL: sudo yum install libreoffice python3-uno\n"
+        "   - macOS: brew install libreoffice\n"
+        "   - Windows: download and install LibreOffice\n"
+        "2. Make sure Python can access the uno module:\n"
+        "   - Linux: export PYTHONPATH=/usr/lib/libreoffice/program:$PYTHONPATH\n"
+        "   - Windows: add LibreOffice\\program to the system PATH\n"
+        "3. Verify the installation: python -c 'import uno'\n"
+        "4. If problems persist, see the full documentation:\n"
+        "   https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
+    )
 
 
 class DocParser(BaseLife):
```
```diff
@@ -46,7 +61,11 @@ class DocParser(BaseLife):
         else:
             self.use_uno = False
             if use_uno and not HAS_UNO:
-                logger.warning(
+                logger.warning(
+                    f"⚠️ UNO is not available; falling back to the legacy command-line approach\n"
+                    f"💡 Tip: UNO conversion is faster and more stable; installing and configuring UNO is strongly recommended\n"
+                    f"📖 See the installation guide in the error message above"
+                )
             else:
                 logger.info(f"🚀 DocParser initialized - using the legacy command-line approach")
 
```
```diff
@@ -265,52 +284,78 @@ class DocParser(BaseLife):
             return ""
 
     def _clean_extracted_text(self, text: str) -> str:
-        """
+        """Clean the extracted text: strip all XML tags and control characters, keeping plain text only"""
         try:
-            #
+            # 1. Decode HTML/XML entities
+            text = html.unescape(text)
+
+            # 2. Remove all XML/HTML tags
+            text = re.sub(r'<[^>]+>', '', text)
+
+            # 3. Remove XML namespace prefixes
+            text = re.sub(r'\b\w+:', '', text)
+
+            # 4. Remove NULL and other control characters
             text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)
 
-            #
-
-            text = re.sub(r'
+            # 5. Remove special XML character sequences
+            text = re.sub(r'&[a-zA-Z]+;', '', text)
+            text = re.sub(r'&#\d+;', '', text)
+            text = re.sub(r'&#x[0-9a-fA-F]+;', '', text)
+
+            # 6. Keep meaningful characters and drop the rest
+            # Kept: Chinese, Japanese, Korean, Latin letters, digits, common punctuation and whitespace
+            allowed_chars = (
+                r'\w\s'  # letters, digits and whitespace
+                r'\u4e00-\u9fff'  # Chinese
+                r'\u3040-\u30ff'  # Japanese
+                r'\uac00-\ud7af'  # Korean
+                r',。!?;:""''()【】《》、·…—'  # Chinese punctuation
+                r'.,!?;:()[\]{}"\'`~@#$%^&*+=\-_/\\'  # Latin punctuation and common symbols
+            )
+
+            # Filter strictly, but keep every meaningful character
+            cleaned_text = ''.join(char for char in text if re.match(f'[{allowed_chars}]', char))
 
-            #
-
+            # 7. Collapse overly long runs of meaningless characters (usually binary junk)
+            cleaned_text = re.sub(r'([^\s\u4e00-\u9fff])\1{5,}', r'\1', cleaned_text)
 
-            #
-
-
+            # 8. Clean up redundant whitespace while preserving paragraph structure
+            cleaned_text = re.sub(r'[ \t]+', ' ', cleaned_text)  # collapse runs of spaces/tabs into a single space
+            cleaned_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned_text)  # collapse runs of blank lines into one blank line
+            cleaned_text = re.sub(r'^\s+|\s+$', '', cleaned_text, flags=re.MULTILINE)  # strip leading/trailing whitespace per line
 
-            #
-            lines =
+            # 9. Further cleanup: drop lines that are nothing but punctuation
+            lines = cleaned_text.split('\n')
             cleaned_lines = []
 
             for line in lines:
                 line = line.strip()
                 if line:
-                    #
-
-
+                    # Check whether the line is mostly meaningful content by
+                    # counting CJK characters, Latin letters and digits
+                    meaningful_chars = sum(1 for c in line if (
+                        c.isalnum() or '\u4e00' <= c <= '\u9fff'
+                    ))
 
-                    #
-                    if
+                    # Keep the line if meaningful characters exceed 30%, or if it is shorter than 5 characters (possibly a heading)
+                    if (len(line) < 5 or
+                            (meaningful_chars > 0 and meaningful_chars / len(line) > 0.3)):
                         cleaned_lines.append(line)
-                    elif cleaned_lines and cleaned_lines[-1]:
-                        cleaned_lines.append('')
+                    elif cleaned_lines and cleaned_lines[-1]:  # preserve paragraph separation
+                        cleaned_lines.append('')
 
             result = '\n'.join(cleaned_lines).strip()
 
-            #
-            if len(result) <
+            # 10. Final check
+            if len(result) < 10:
+                logger.warning("⚠️ Cleaned text is very short; something may be wrong")
                 return ""
 
-            #
-
-
-
-            # Try to keep only the ASCII and CJK parts
-            result = re.sub(r'[^\x00-\x7f\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af,。!?;:""''()【】《》、·…—\s]+', ' ', result)
-            result = re.sub(r'\s+', ' ', result).strip()
+            # Check whether any XML tags remain
+            if re.search(r'<[^>]+>', result):
+                logger.warning("⚠️ XML tags remain after cleaning; running a second pass")
+                result = re.sub(r'<[^>]+>', '', result)
 
             return result
 
```
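The effect of the rewritten pipeline is easiest to see on a small sample. A self-contained sketch of steps 1, 2 and 4, using exactly the patterns from the hunk (the sample string is made up):

```python
import html
import re

sample = '<w:p><w:t>Hello &amp; welcome</w:t></w:p>\x00 &#169; 2024'

text = html.unescape(sample)                                    # step 1: '&amp;' -> '&', '&#169;' -> '©'
text = re.sub(r'<[^>]+>', '', text)                             # step 2: strip every XML/HTML tag
text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f]', '', text)   # step 4: drop control characters

print(text)  # 'Hello & welcome © 2024'
```

Note the ordering: entities are decoded first, so an escaped tag such as `&lt;w:t&gt;` becomes a literal tag in step 1 and is removed in step 2, and step 5 then sweeps up any entities that survive.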
```diff
@@ -355,7 +400,20 @@ class DocParser(BaseLife):
                 return txt_path
 
             except Exception as e:
-                logger.error(
+                logger.error(
+                    f"💥 UNO conversion failed: {str(e)}\n"
+                    f"🔍 Diagnostics:\n"
+                    f"   - Error type: {type(e).__name__}\n"
+                    f"   - Is LibreOffice installed? Try: soffice --version\n"
+                    f"   - Is the Python UNO module available? Try: python -c 'import uno'\n"
+                    f"   - Is another LibreOffice instance running?\n"
+                    f"   - Are the file permissions correct?\n"
+                    f"🔧 Possible fixes:\n"
+                    f"   1. Make sure LibreOffice is installed correctly\n"
+                    f"   2. Close all LibreOffice processes\n"
+                    f"   3. Check file permissions and paths\n"
+                    f"   4. Try running manually: soffice --headless --convert-to txt \"{doc_path}\""
+                )
                 logger.warning("⚠️ Automatically falling back to the legacy command-line approach...")
                 return self._doc_to_txt_subprocess(doc_path, dir_path)
         else:
```
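`_doc_to_txt_subprocess`, the fallback target, lies outside this diff. A minimal sketch of what such a fallback plausibly looks like, built around the same `soffice --headless --convert-to txt` invocation the error message recommends (the function name and the timeout are assumptions):

```python
import subprocess
from pathlib import Path

def doc_to_txt_subprocess(doc_path: str, dir_path: str) -> str:
    """Hedged stand-in for the real _doc_to_txt_subprocess (not shown in this diff)."""
    subprocess.run(
        ["soffice", "--headless", "--convert-to", "txt", "--outdir", dir_path, doc_path],
        check=True,   # raise CalledProcessError on a non-zero exit
        timeout=60,   # assumed guard against a hung soffice process
    )
    # LibreOffice writes <input stem>.txt into the --outdir directory
    return str(Path(dir_path) / (Path(doc_path).stem + ".txt"))
```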
datamax/parser/docx_parser.py:

```diff
@@ -18,7 +18,21 @@ try:
     from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
 except ImportError:
     HAS_UNO = False
-    logger.
+    logger.error(
+        "❌ Failed to import the UNO handler!\n"
+        "🔧 Solutions:\n"
+        "1. Install LibreOffice and python-uno:\n"
+        "   - Ubuntu/Debian: sudo apt-get install libreoffice python3-uno\n"
+        "   - CentOS/RHEL: sudo yum install libreoffice python3-uno\n"
+        "   - macOS: brew install libreoffice\n"
+        "   - Windows: download and install LibreOffice\n"
+        "2. Make sure Python can access the uno module:\n"
+        "   - Linux: export PYTHONPATH=/usr/lib/libreoffice/program:$PYTHONPATH\n"
+        "   - Windows: add LibreOffice\\program to the system PATH\n"
+        "3. Verify the installation: python -c 'import uno'\n"
+        "4. If problems persist, see the full documentation:\n"
+        "   https://wiki.documentfoundation.org/Documentation/DevGuide/Installing_the_SDK"
+    )
 
 
 
```
```diff
@@ -40,7 +54,11 @@ class DocxParser(BaseLife):
         else:
             self.use_uno = False
             if use_uno and not HAS_UNO:
-                logger.warning(
+                logger.warning(
+                    f"⚠️ UNO is not available; falling back to the legacy command-line approach\n"
+                    f"💡 Tip: UNO conversion is faster and more stable; installing and configuring UNO is strongly recommended\n"
+                    f"📖 See the installation guide in the error message above"
+                )
             else:
                 logger.info(f"🚀 DocxParser initialized - using the legacy command-line approach")
 
```
```diff
@@ -64,7 +82,20 @@ class DocxParser(BaseLife):
                 return txt_path
 
             except Exception as e:
-                logger.error(
+                logger.error(
+                    f"💥 UNO conversion failed: {str(e)}\n"
+                    f"🔍 Diagnostics:\n"
+                    f"   - Error type: {type(e).__name__}\n"
+                    f"   - Is LibreOffice installed? Try: soffice --version\n"
+                    f"   - Is the Python UNO module available? Try: python -c 'import uno'\n"
+                    f"   - Is another LibreOffice instance running?\n"
+                    f"   - Are the file permissions correct?\n"
+                    f"🔧 Possible fixes:\n"
+                    f"   1. Make sure LibreOffice is installed correctly\n"
+                    f"   2. Close all LibreOffice processes\n"
+                    f"   3. Check file permissions and paths\n"
+                    f"   4. Try running manually: soffice --headless --convert-to txt \"{docx_path}\""
+                )
                 logger.warning("⚠️ Automatically falling back to the legacy command-line approach...")
                 return self._docx_to_txt_subprocess(docx_path, dir_path)
         else:
```
```diff
@@ -228,26 +259,54 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_standard_document_content(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract the standard document.xml content"""
+        """Extract the standard document.xml content - plain text only"""
         try:
             if 'word/document.xml' in docx_zip.namelist():
                 doc_xml = docx_zip.read('word/document.xml').decode('utf-8', errors='replace')
 
-                #
-
+                # Decode XML entities
+                doc_xml = html.unescape(doc_xml)
+
+                # Extract the text of every <w:t> tag (any namespace prefix)
+                # Use a looser regular expression that matches any namespace prefix
+                text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                text_matches = re.findall(text_pattern, doc_xml)
 
-                #
-
-                doc_xml_clean = re.sub(r'w:', '', doc_xml_clean)
-                doc_xml_clean = re.sub(r'[a-zA-Z0-9]+:', '', doc_xml_clean)
+                # Also pick up <t> tags that carry no namespace prefix
+                text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', doc_xml))
 
-                # Extract the text of every <t> tag
-                text_matches = re.findall(r'<t[^>]*>(.*?)</t>', doc_xml_clean, re.DOTALL)
                 if text_matches:
-
-
-
-
+                    # Clean and combine the text
+                    cleaned_texts = []
+                    for text in text_matches:
+                        # Decode XML entities
+                        text = html.unescape(text)
+                        # Collapse extra whitespace but keep single spaces
+                        text = re.sub(r'\s+', ' ', text.strip())
+                        if text:
+                            cleaned_texts.append(text)
+
+                    # Join the text fragments intelligently
+                    content = ''
+                    for i, text in enumerate(cleaned_texts):
+                        if i == 0:
+                            content = text
+                        else:
+                            # Skip the space when the previous fragment does not end with punctuation and the current one does not start with an uppercase letter
+                            prev_char = content[-1] if content else ''
+                            curr_char = text[0] if text else ''
+
+                            if prev_char in '.!?。!?\n' or curr_char.isupper() or curr_char in ',。!?;:':
+                                content += ' ' + text
+                            else:
+                                content += text
+
+                    # Final cleanup
+                    content = re.sub(r'\s+', ' ', content)
+                    content = content.strip()
+
+                    logger.info(f"📝 Extracted plain text from document.xml: {len(content)} characters")
+                    return content
             return ""
         except Exception as e:
             logger.error(f"💥 Failed to extract standard document content: {str(e)}")
```
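The looser pattern is the substantive fix: the old code rewrote prefixes away with `re.sub(r'w:', '', ...)`, which also corrupted body text containing `w:`, whereas the new pattern matches whatever namespace prefix appears in the tag itself. A quick check of both patterns from the hunk against a contrived fragment:

```python
import re

doc_xml = (
    '<w:p><w:t>standard prefix</w:t></w:p>'
    '<w14:p><w14:t xml:space="preserve">other prefix</w14:t></w14:p>'
    '<t>no prefix</t>'
)

# Prefixed <ns:t> tags, any prefix
text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
text_matches = re.findall(text_pattern, doc_xml)
# Plus bare <t> tags without a namespace
text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', doc_xml))

print(text_matches)  # ['standard prefix', 'other prefix', 'no prefix']
```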
```diff
@@ -271,7 +330,7 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_headers_footers(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract header and footer content"""
+        """Extract header and footer content - plain text only"""
         try:
             header_footer_content = []
 
```
```diff
@@ -280,35 +339,66 @@ class DocxParser(BaseLife):
                     logger.debug(f"📄 Processing header/footer: {filename}")
                     content = docx_zip.read(filename).decode('utf-8', errors='replace')
 
-                    #
-
+                    # Decode XML entities
+                    content = html.unescape(content)
+
+                    # Extract the text content - use the looser pattern
+                    text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                    text_matches = re.findall(text_pattern, content)
+                    text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', content))
+
                     if text_matches:
-
-
-
-
+                        # Clean and combine the text
+                        cleaned_texts = []
+                        for text in text_matches:
+                            text = html.unescape(text)
+                            text = re.sub(r'\s+', ' ', text.strip())
+                            if text:
+                                cleaned_texts.append(text)
+
+                        if cleaned_texts:
+                            # Merge the text fragments
+                            header_footer_text = ' '.join(cleaned_texts)
+                            header_footer_text = re.sub(r'\s+', ' ', header_footer_text.strip())
+                            if header_footer_text:
+                                header_footer_content.append(header_footer_text)
 
             if header_footer_content:
-                logger.info(f"📑
+                logger.info(f"📑 Extracted plain text from headers/footers: {len(header_footer_content)} item(s)")
 
-            return '
+            return '\n'.join(header_footer_content) if header_footer_content else ""
         except Exception as e:
             logger.error(f"💥 Failed to extract headers/footers: {str(e)}")
             return ""
 
     def _extract_comments(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract comments and annotations"""
+        """Extract comments and annotations - plain text only"""
         try:
             if 'word/comments.xml' in docx_zip.namelist():
                 comments_xml = docx_zip.read('word/comments.xml').decode('utf-8', errors='replace')
 
-                #
-
+                # Decode XML entities
+                comments_xml = html.unescape(comments_xml)
+
+                # Extract the comment text - use the looser pattern
+                text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                text_matches = re.findall(text_pattern, comments_xml)
+                text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', comments_xml))
+
                 if text_matches:
-
-
-
-
+                    # Clean and combine the text
+                    cleaned_texts = []
+                    for text in text_matches:
+                        text = html.unescape(text)
+                        text = re.sub(r'\s+', ' ', text.strip())
+                        if text:
+                            cleaned_texts.append(text)
+
+                    if cleaned_texts:
+                        comments_text = ' '.join(cleaned_texts)
+                        comments_text = re.sub(r'\s+', ' ', comments_text.strip())
+                        logger.info(f"💬 Extracted plain text from comments: {len(comments_text)} characters")
+                        return comments_text
 
             return ""
         except Exception as e:
```
```diff
@@ -316,7 +406,7 @@ class DocxParser(BaseLife):
             return ""
 
     def _extract_textbox_content(self, docx_zip: zipfile.ZipFile) -> str:
-        """Extract text from text boxes and drawing objects"""
+        """Extract text from text boxes and drawing objects - plain text only"""
         try:
             textbox_content = []
 
```
```diff
@@ -325,26 +415,43 @@ class DocxParser(BaseLife):
                 if 'word/' in filename and filename.endswith('.xml'):
                     content = docx_zip.read(filename).decode('utf-8', errors='replace')
 
+                    # Decode XML entities
+                    content = html.unescape(content)
+
                     # Look for text-box content (w:txbxContent)
-                    textbox_matches = re.findall(r'<
+                    textbox_matches = re.findall(r'<[^:>]*:txbxContent[^>]*>(.*?)</[^:>]*:txbxContent>', content, re.DOTALL)
+
                     for match in textbox_matches:
-
+                        # Extract the text inside the text box
+                        text_pattern = r'<[^:>]*:t[^>]*>([^<]*)</[^:>]*:t>'
+                        text_matches = re.findall(text_pattern, match)
+                        text_matches.extend(re.findall(r'<t[^>]*>([^<]*)</t>', match))
+
                         if text_matches:
-
-
-
-
+                            # Clean and combine the text
+                            cleaned_texts = []
+                            for text in text_matches:
+                                text = html.unescape(text)
+                                text = re.sub(r'\s+', ' ', text.strip())
+                                if text:
+                                    cleaned_texts.append(text)
+
+                            if cleaned_texts:
+                                textbox_text = ' '.join(cleaned_texts)
+                                textbox_text = re.sub(r'\s+', ' ', textbox_text.strip())
+                                if textbox_text:
+                                    textbox_content.append(textbox_text)
 
             if textbox_content:
-                logger.info(f"📦
+                logger.info(f"📦 Extracted plain text from text boxes: {len(textbox_content)} item(s)")
 
-            return '
+            return '\n'.join(textbox_content) if textbox_content else ""
         except Exception as e:
             logger.error(f"💥 Failed to extract text-box content: {str(e)}")
             return ""
 
     def _combine_extracted_content(self, content_list: list) -> str:
-        """Combine all extracted content"""
+        """Combine all extracted content - output clean plain text"""
         combined = []
 
         # Order the content by importance
```
```diff
@@ -353,14 +460,40 @@ class DocxParser(BaseLife):
         for content_type in priority_order:
             for item_type, content in content_list:
                 if item_type == content_type and content.strip():
-
+                    # Strip redundant whitespace from the content
+                    cleaned_content = re.sub(r'\s+', ' ', content.strip())
+                    cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
+
+                    if cleaned_content:
+                        # Add a simple label per content type (only when several types are present)
+                        if len([1 for t, c in content_list if c.strip()]) > 1:
+                            if item_type == "header_footer":
+                                combined.append(f"[Header/Footer]\n{cleaned_content}")
+                            elif item_type == "comments":
+                                combined.append(f"[Comments]\n{cleaned_content}")
+                            elif item_type == "textboxes":
+                                combined.append(f"[Text Box]\n{cleaned_content}")
+                            else:
+                                combined.append(cleaned_content)
+                        else:
+                            combined.append(cleaned_content)
 
         # Append any remaining uncategorized content
         for item_type, content in content_list:
             if item_type not in priority_order and content.strip():
-
+                cleaned_content = re.sub(r'\s+', ' ', content.strip())
+                cleaned_content = re.sub(r'\n\s*\n', '\n\n', cleaned_content)
+                if cleaned_content:
+                    combined.append(cleaned_content)
+
+        # Join everything, separating the sections with blank lines
+        final_content = '\n\n'.join(combined) if combined else ""
+
+        # Final cleanup: make sure there are no runs of excess blank lines
+        final_content = re.sub(r'\n{3,}', '\n\n', final_content)
+        final_content = final_content.strip()
 
-        return
+        return final_content
 
     def _extract_html_from_mht(self, mht_content: str) -> str:
         """Extract the HTML part from MHT content and convert it to clean text"""
```
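`priority_order` itself is defined outside this hunk, so only the item-type strings visible in the branches above are certain. A trimmed sketch of the labelling behaviour (it drops the priority sort and the only-label-when-multiple-types check, and the `"main_content"` type name is an assumption):

```python
import re

content_list = [
    ("main_content", "Body text of the  document."),   # assumed type name
    ("header_footer", "Confidential\tdraft"),
    ("comments", "Reviewer: re-check section 2."),
]

combined = []
for item_type, content in content_list:
    cleaned_content = re.sub(r'\s+', ' ', content.strip())
    if item_type == "header_footer":
        combined.append(f"[Header/Footer]\n{cleaned_content}")
    elif item_type == "comments":
        combined.append(f"[Comments]\n{cleaned_content}")
    else:
        combined.append(cleaned_content)

print('\n\n'.join(combined))
# Body text of the document.
#
# [Header/Footer]
# Confidential draft
#
# [Comments]
# Reviewer: re-check section 2.
```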
datamax/utils/uno_handler.py:

```diff
@@ -125,12 +125,23 @@ class UnoManager:
                 cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
             )
             logger.info(f"⏳ Waiting for the LibreOffice service to start...")
-
-
-
-
-
-
+
+            # Smart wait: poll the service status, giving machines of different speeds elastic time
+            start_time = time.time()
+            check_interval = 1  # check once per second
+            max_wait_time = 30  # wait at most 30 seconds
+
+            while time.time() - start_time < max_wait_time:
+                if self._check_soffice_running():
+                    elapsed = time.time() - start_time
+                    logger.info(f"✅ LibreOffice service started (took {elapsed:.1f}s)")
+                    return
+
+                logger.debug(f"🔄 Service not ready yet, still waiting... ({time.time() - start_time:.1f}s elapsed)")
+                time.sleep(check_interval)
+
+            # Still not up after the timeout
+            raise Exception(f"LibreOffice service startup timed out (waited {max_wait_time}s)")
 
         except Exception as e:
             logger.error(f"❌ Failed to start the LibreOffice service: {str(e)}")
```