indexdoc-converter 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. indexdoc_converter-0.2.3/PKG-INFO +228 -0
  2. indexdoc_converter-0.2.3/README.md +134 -0
  3. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter/excel_to_md.py +1 -1
  4. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter/html_to_md.py +1 -1
  5. indexdoc_converter-0.2.3/indexdoc_converter/pptx_to_md.py +174 -0
  6. indexdoc_converter-0.2.3/indexdoc_converter.egg-info/PKG-INFO +228 -0
  7. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/setup.py +1 -1
  8. indexdoc_converter-0.2.1/PKG-INFO +0 -92
  9. indexdoc_converter-0.2.1/README.md +0 -0
  10. indexdoc_converter-0.2.1/indexdoc_converter/pptx_to_md.py +0 -79
  11. indexdoc_converter-0.2.1/indexdoc_converter.egg-info/PKG-INFO +0 -92
  12. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter/__init__.py +0 -0
  13. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter/docx_to_md.py +0 -0
  14. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter/utils/FileUtil.py +0 -0
  15. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter/utils/IDUtil.py +0 -0
  16. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter/utils/__init__.py +0 -0
  17. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter/utils/img_to_base64.py +0 -0
  18. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter.egg-info/SOURCES.txt +0 -0
  19. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter.egg-info/dependency_links.txt +0 -0
  20. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter.egg-info/requires.txt +0 -0
  21. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/indexdoc_converter.egg-info/top_level.txt +0 -0
  22. {indexdoc_converter-0.2.1 → indexdoc_converter-0.2.3}/setup.cfg +0 -0
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: indexdoc_converter
3
+ Version: 0.2.3
4
+ Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
+ Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
+ Author: 杭州智予数信息技术有限公司
7
+ Author-email: indexdoc@qq.com
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: annotated-types==0.7.0
11
+ Requires-Dist: asgiref==3.11.0
12
+ Requires-Dist: beautifulsoup4==4.14.3
13
+ Requires-Dist: bottle==0.13.4
14
+ Requires-Dist: captcha==0.7.1
15
+ Requires-Dist: certifi==2026.1.4
16
+ Requires-Dist: cffi==2.0.0
17
+ Requires-Dist: chardet==5.2.0
18
+ Requires-Dist: charset-normalizer==3.4.4
19
+ Requires-Dist: clickhouse-driver==0.2.10
20
+ Requires-Dist: clr_loader==0.2.10
21
+ Requires-Dist: cobble==0.1.4
22
+ Requires-Dist: colorama==0.4.6
23
+ Requires-Dist: cryptography==46.0.3
24
+ Requires-Dist: cssselect==1.4.0
25
+ Requires-Dist: defusedxml==0.7.1
26
+ Requires-Dist: Django==6.0.1
27
+ Requires-Dist: django-ranged-response==0.2.0
28
+ Requires-Dist: django-simple-captcha==0.6.3
29
+ Requires-Dist: duckdb==1.4.3
30
+ Requires-Dist: et_xmlfile==2.0.0
31
+ Requires-Dist: filelock==3.20.3
32
+ Requires-Dist: fonttools==4.61.1
33
+ Requires-Dist: fpdf2==2.8.5
34
+ Requires-Dist: fsspec==2026.1.0
35
+ Requires-Dist: html2text==2025.4.15
36
+ Requires-Dist: idna==3.11
37
+ Requires-Dist: image==1.5.33
38
+ Requires-Dist: Jinja2==3.1.6
39
+ Requires-Dist: lxml==6.0.2
40
+ Requires-Dist: lxml_html_clean==0.4.3
41
+ Requires-Dist: mammoth==1.11.0
42
+ Requires-Dist: markdownify==1.2.2
43
+ Requires-Dist: MarkupSafe==3.0.3
44
+ Requires-Dist: mpmath==1.3.0
45
+ Requires-Dist: networkx==3.6.1
46
+ Requires-Dist: numpy==2.4.1
47
+ Requires-Dist: odfpy==1.4.1
48
+ Requires-Dist: openpyxl==3.1.5
49
+ Requires-Dist: pandas==3.0.0
50
+ Requires-Dist: pdfkit==1.0.0
51
+ Requires-Dist: pillow==12.1.0
52
+ Requires-Dist: pptx2md==2.0.6
53
+ Requires-Dist: proxy_tools==0.1.0
54
+ Requires-Dist: psutil==7.2.1
55
+ Requires-Dist: pycparser==2.23
56
+ Requires-Dist: pydantic==2.12.5
57
+ Requires-Dist: pydantic_core==2.41.5
58
+ Requires-Dist: PyJWT==2.10.1
59
+ Requires-Dist: pyperclip==1.11.0
60
+ Requires-Dist: python-dateutil==2.9.0.post0
61
+ Requires-Dist: python-docx==1.2.0
62
+ Requires-Dist: python-pptx==1.0.2
63
+ Requires-Dist: pythonnet==3.0.5
64
+ Requires-Dist: pytz==2025.2
65
+ Requires-Dist: pywebview==6.1
66
+ Requires-Dist: pywin32==311
67
+ Requires-Dist: RapidFuzz==3.14.3
68
+ Requires-Dist: readability-lxml==0.8.4.1
69
+ Requires-Dist: requests==2.32.5
70
+ Requires-Dist: scipy==1.17.0
71
+ Requires-Dist: setuptools==80.9.0
72
+ Requires-Dist: six==1.17.0
73
+ Requires-Dist: soupsieve==2.8.3
74
+ Requires-Dist: sqlparse==0.5.5
75
+ Requires-Dist: sympy==1.14.0
76
+ Requires-Dist: torch==2.9.1
77
+ Requires-Dist: tornado==6.5.4
78
+ Requires-Dist: tqdm==4.67.1
79
+ Requires-Dist: typing-inspection==0.4.2
80
+ Requires-Dist: typing_extensions==4.15.0
81
+ Requires-Dist: tzdata==2025.3
82
+ Requires-Dist: tzlocal==5.3.1
83
+ Requires-Dist: urllib3==2.6.3
84
+ Requires-Dist: WMI==1.5.1
85
+ Requires-Dist: xlsxwriter==3.2.9
86
+ Dynamic: author
87
+ Dynamic: author-email
88
+ Dynamic: description
89
+ Dynamic: description-content-type
90
+ Dynamic: home-page
91
+ Dynamic: requires-dist
92
+ Dynamic: requires-python
93
+ Dynamic: summary
94
+
95
+ <div align="center">
96
+ <strong>简体中文</strong> | <a href="README_EN.md">English</a>
97
+ </div>
98
+
99
+ ---
100
+ # indexdoc-converter 文档转换工具库
101
+ **indexdoc-converter** 是一款基于 Python 开发的文档转换工具库,核心功能为将主流办公文档、网页文件高效转换为 Markdown 格式。各类型文件支持格式如下:
102
+ - Word 文档支持 **.docx** ;
103
+ - Excel 类表格文档支持 **.xlsx、.xls、.ods、.csv、.tsv** ;
104
+ - 网页文件支持 **.html、.mhtml、.htm 及网页url** ;
105
+ - PPT 演示文档支持 **.pptx** 。
106
+ 该工具库现已发布至 PyPI(Python Package Index),可通过 pip 包管理工具快速安装并投入使用。
107
+
108
+ [![Python Version](https://img.shields.io/badge/python-3.10+-green.svg)](https://www.python.org/) [![GitHub Stars](https://img.shields.io/github/stars/indexdoc/indexdoc-converter?style=social)](https://github.com/indexdoc/indexdoc-converter.git)
109
+
110
+
111
+ ## 库的使用
112
+ ```bash
113
+ #库安装
114
+ pip install -U indexdoc-converter #下载最新版本库
115
+ ```
116
+ - 若使用该库 python版本最小应为 Python3.10
117
+ - 包目录结构
118
+
119
+ ```bash
120
+ indexdoc-converter/ # 项目根目录
121
+ ├── indexdoc_converter/ # 核心包目录
122
+ │ ├── __init__.py # 核心代码
123
+ │ ├── docx_to_md.py # Word转Markdown工具类
124
+ │ ├── excel_to_md.py # Excel转Markdown工具类
125
+ │ ├── html_to_md.py # Html转Markdown工具类
126
+ │ ├── pptx_to_md.py # ppt转Markdown工具类
127
+ │ └── utils/
128
+ │ ├── __init__.py
129
+ │ ├── FileUtil.py
130
+ │ ├── IDUtil.py
131
+ │ └── img_to_base64.py
132
+ ```
133
+
134
+ ### 使用示例
135
+
136
+ ```python
137
+ #引用 注意引用为 indexdoc_converter 而不是 indexdoc-converter
138
+ from indexdoc_converter.docx_to_md import convert_docx_to_md
139
+ from indexdoc_converter.excel_to_md import TableToMarkdown
140
+ from indexdoc_converter.html_to_md import convert_to_md
141
+ from indexdoc_converter.pptx_to_md import pptx_to_md
142
+
143
+ # -------------------------------------------Word转Markdown---------------------------------------------------
144
+ md_text = convert_docx_to_md(r"C:\Users\xxx\测试文档.docx", False)
145
+ with open('./test.md', 'w', encoding='utf-8') as f:
146
+ f.write(md_text)
147
+
148
+ # -------------------------------------------Excel转Markdown-------------------------------------------------
149
+ # 自定义参数示例
150
+ converter = TableToMarkdown(
151
+ file_title_level=2, # 文件标题的Markdown层级,默认1(#),这里设为2(##)
152
+ single_row_value_as_title=True, # 是否将单行唯一值识别为标题,默认True
153
+ max_rows=8000, # 最大处理行数,默认6000(实际处理行数是max_rows+1)
154
+ max_cols=200 # 最大处理列数,默认128(实际处理列数是max_cols+1)
155
+ )
156
+
157
+ # 转换单个文件
158
+ file_path = r"C:\Users\xxx\测试文件.xlsx"
159
+ result = converter.convert(file_path)
160
+
161
+ # blank 模式:保留合并单元格的原始样式(只在合并单元格左上角显示内容,其余位置为空)
162
+ with open("../tmp/测试_blank.md", "w", encoding="utf-8") as f:
163
+ f.write(result['blank'])
164
+
165
+ # fill 模式:将合并单元格的内容填充到所有合并的单元格中;同时还能自动识别表格中的标题行、分割多个表格块、处理空行/空列,兼容各种表格格式的合并单元格解析。
166
+ with open("../tmp/测试_fill.md", "w", encoding="utf-8") as f:
167
+ f.write(result['fill'])
168
+
169
+ # -------------------------------------------ppt转Markdown---------------------------------------------------
170
+ ppt_file = r"C:\Users\xxx\测试文件.pptx"
171
+ md_path = pptx_to_md(ppt_file)
172
+ print(f"单文件转换完成,MD文件路径:{md_path}")
173
+
174
+ # -------------------------------------------网页文件转Markdown-----------------------------------------------
175
+ # html = "https://news.qq.com/rain/a/20260114A01NI000"
176
+ html = "https://www.aituple.com"
177
+ # html = "https://www.indexdoc.com"
178
+ # html = r"C:\Users\xxx\测试文件.html"
179
+ # html = "https://www.indexdoc.com/contact.html"
180
+ md = convert_to_md(html, '../tmp/测试html.md')
181
+ # md = mhtml_to_markdown(mhtml)
182
+ ```
183
+ ### Word文档
184
+ #### 原文档
185
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word1.png)
186
+ #### 转换后文档
187
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word2.png)
188
+
189
+ ### Excel文档
190
+ #### 原文档
191
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel1.png)
192
+ #### 转换后文档
193
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel2.png)
194
+
195
+ ### ppt文档
196
+ #### 原文档
197
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt1.png)
198
+ #### 转换后文档
199
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt2.png)
200
+
201
+ ### 网页文件
202
+ #### 原文档
203
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html1.png)
204
+ #### 转换后文档
205
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html2.png)
206
+
207
+ ## 二次开发
208
+
209
+ - Python 3.10+
210
+
211
+ ```bash
212
+ #源码地址
213
+ https://github.com/indexdoc/indexdoc-converter.git
214
+ ```
215
+ ```bash
216
+ #快速安装依赖库
217
+ pip install -r requirements.txt
218
+
219
+ # 阿里镜像源
220
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
221
+ ```
222
+
223
+
224
+ ## 📞 作者
225
+
226
+ - 作者:杭州智予数信息技术有限公司
227
+
228
+ - 邮箱:indexdoc@qq.com
@@ -0,0 +1,134 @@
1
+ <div align="center">
2
+ <strong>简体中文</strong> | <a href="README_EN.md">English</a>
3
+ </div>
4
+
5
+ ---
6
+ # indexdoc-converter 文档转换工具库
7
+ **indexdoc-converter** 是一款基于 Python 开发的文档转换工具库,核心功能为将主流办公文档、网页文件高效转换为 Markdown 格式。各类型文件支持格式如下:
8
+ - Word 文档支持 **.docx** ;
9
+ - Excel 类表格文档支持 **.xlsx、.xls、.ods、.csv、.tsv** ;
10
+ - 网页文件支持 **.html、.mhtml、.htm 及网页url** ;
11
+ - PPT 演示文档支持 **.pptx** 。
12
+ 该工具库现已发布至 PyPI(Python Package Index),可通过 pip 包管理工具快速安装并投入使用。
13
+
14
+ [![Python Version](https://img.shields.io/badge/python-3.10+-green.svg)](https://www.python.org/) [![GitHub Stars](https://img.shields.io/github/stars/indexdoc/indexdoc-converter?style=social)](https://github.com/indexdoc/indexdoc-converter.git)
15
+
16
+
17
+ ## 库的使用
18
+ ```bash
19
+ #库安装
20
+ pip install -U indexdoc-converter #下载最新版本库
21
+ ```
22
+ - 若使用该库 python版本最小应为 Python3.10
23
+ - 包目录结构
24
+
25
+ ```bash
26
+ indexdoc-converter/ # 项目根目录
27
+ ├── indexdoc_converter/ # 核心包目录
28
+ │ ├── __init__.py # 核心代码
29
+ │ ├── docx_to_md.py # Word转Markdown工具类
30
+ │ ├── excel_to_md.py # Excel转Markdown工具类
31
+ │ ├── html_to_md.py # Html转Markdown工具类
32
+ │ ├── pptx_to_md.py # ppt转Markdown工具类
33
+ │ └── utils/
34
+ │ ├── __init__.py
35
+ │ ├── FileUtil.py
36
+ │ ├── IDUtil.py
37
+ │ └── img_to_base64.py
38
+ ```
39
+
40
+ ### 使用示例
41
+
42
+ ```python
43
+ #引用 注意引用为 indexdoc_converter 而不是 indexdoc-converter
44
+ from indexdoc_converter.docx_to_md import convert_docx_to_md
45
+ from indexdoc_converter.excel_to_md import TableToMarkdown
46
+ from indexdoc_converter.html_to_md import convert_to_md
47
+ from indexdoc_converter.pptx_to_md import pptx_to_md
48
+
49
+ # -------------------------------------------Word转Markdown---------------------------------------------------
50
+ md_text = convert_docx_to_md(r"C:\Users\xxx\测试文档.docx", False)
51
+ with open('./test.md', 'w', encoding='utf-8') as f:
52
+ f.write(md_text)
53
+
54
+ # -------------------------------------------Excel转Markdown-------------------------------------------------
55
+ # 自定义参数示例
56
+ converter = TableToMarkdown(
57
+ file_title_level=2, # 文件标题的Markdown层级,默认1(#),这里设为2(##)
58
+ single_row_value_as_title=True, # 是否将单行唯一值识别为标题,默认True
59
+ max_rows=8000, # 最大处理行数,默认6000(实际处理行数是max_rows+1)
60
+ max_cols=200 # 最大处理列数,默认128(实际处理列数是max_cols+1)
61
+ )
62
+
63
+ # 转换单个文件
64
+ file_path = r"C:\Users\xxx\测试文件.xlsx"
65
+ result = converter.convert(file_path)
66
+
67
+ # blank 模式:保留合并单元格的原始样式(只在合并单元格左上角显示内容,其余位置为空)
68
+ with open("../tmp/测试_blank.md", "w", encoding="utf-8") as f:
69
+ f.write(result['blank'])
70
+
71
+ # fill 模式:将合并单元格的内容填充到所有合并的单元格中;同时还能自动识别表格中的标题行、分割多个表格块、处理空行/空列,兼容各种表格格式的合并单元格解析。
72
+ with open("../tmp/测试_fill.md", "w", encoding="utf-8") as f:
73
+ f.write(result['fill'])
74
+
75
+ # -------------------------------------------ppt转Markdown---------------------------------------------------
76
+ ppt_file = r"C:\Users\xxx\测试文件.pptx"
77
+ md_path = pptx_to_md(ppt_file)
78
+ print(f"单文件转换完成,MD文件路径:{md_path}")
79
+
80
+ # -------------------------------------------网页文件转Markdown-----------------------------------------------
81
+ # html = "https://news.qq.com/rain/a/20260114A01NI000"
82
+ html = "https://www.aituple.com"
83
+ # html = "https://www.indexdoc.com"
84
+ # html = r"C:\Users\xxx\测试文件.html"
85
+ # html = "https://www.indexdoc.com/contact.html"
86
+ md = convert_to_md(html, '../tmp/测试html.md')
87
+ # md = mhtml_to_markdown(mhtml)
88
+ ```
89
+ ### Word文档
90
+ #### 原文档
91
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word1.png)
92
+ #### 转换后文档
93
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word2.png)
94
+
95
+ ### Excel文档
96
+ #### 原文档
97
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel1.png)
98
+ #### 转换后文档
99
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel2.png)
100
+
101
+ ### ppt文档
102
+ #### 原文档
103
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt1.png)
104
+ #### 转换后文档
105
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt2.png)
106
+
107
+ ### 网页文件
108
+ #### 原文档
109
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html1.png)
110
+ #### 转换后文档
111
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html2.png)
112
+
113
+ ## 二次开发
114
+
115
+ - Python 3.10+
116
+
117
+ ```bash
118
+ #源码地址
119
+ https://github.com/indexdoc/indexdoc-converter.git
120
+ ```
121
+ ```bash
122
+ #快速安装依赖库
123
+ pip install -r requirements.txt
124
+
125
+ # 阿里镜像源
126
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
127
+ ```
128
+
129
+
130
+ ## 📞 作者
131
+
132
+ - 作者:杭州智予数信息技术有限公司
133
+
134
+ - 邮箱:indexdoc@qq.com
@@ -7,7 +7,7 @@ from typing import List, Dict, Tuple
7
7
  import numpy as np
8
8
  import pandas as pd
9
9
 
10
- from utils import FileUtil
10
+ from indexdoc_converter.utils import FileUtil
11
11
 
12
12
  pd.set_option('future.no_silent_downcasting', True)
13
13
 
@@ -10,7 +10,7 @@ from PIL import Image
10
10
  from readability import Document
11
11
  import html2text
12
12
 
13
- from utils.img_to_base64 import Image2Base64
13
+ from indexdoc_converter.utils.img_to_base64 import Image2Base64
14
14
 
15
15
  # ============ 图片下载和处理类 ============
16
16
  class ImageDownloader:
@@ -0,0 +1,174 @@
1
+ import pptx2md
2
+ from pathlib import Path
3
+ import tempfile
4
+
5
+
6
+
7
+ from indexdoc_converter.utils.img_to_base64 import OutputFormat
8
+ from indexdoc_converter.utils.img_to_base64 import Image2Base64
9
+ from indexdoc_converter.utils import IDUtil
10
+
11
def _image_link_targets(img_file, img_dir, md_file):
    """Return every spelling under which *img_file* may be referenced in *md_file*."""
    import os
    import urllib.parse

    spellings = {
        str(img_file),                         # absolute path, native separators
        img_file.name,                         # bare file name
        f"{img_dir.name}\\{img_file.name}",    # <img_dir>\<name> (Windows style)
        f"{img_dir.name}/{img_file.name}",     # <img_dir>/<name>
    }
    try:
        # Path relative to the Markdown file's folder; this is the only
        # relative spelling that can match when the .md file does not sit
        # next to img_dir (e.g. batch output into another directory).
        spellings.add(os.path.relpath(img_file, md_file.parent))
    except ValueError:
        # os.path.relpath raises ValueError on Windows when the two paths are
        # on different drives; no relative spelling exists in that case.
        pass
    # Forward-slash twin of every spelling collected so far.
    spellings |= {s.replace("\\", "/") for s in spellings}

    targets = set(spellings)
    for spelling in spellings:
        # Percent-encoded forms: quote() keeps "/" by default, safe='' also
        # encodes it (and always encodes "\\" as %5C and non-ASCII characters).
        targets.add(urllib.parse.quote(spelling))
        targets.add(urllib.parse.quote(spelling, safe=''))
    return targets


def replace_img_with_base64(md_file: Path, img_dir: Path, del_temp_img: bool = True):
    """
    Inline the images referenced by a Markdown file as Base64 snippets.

    Every file in ``img_dir`` is converted with ``Image2Base64.convert_file``
    and each ``![](<target>)`` link in ``md_file`` whose target is a known
    spelling of that file's path is replaced by the converted snippet. The
    Markdown file is rewritten in place (UTF-8).

    Compared with a fixed list of spellings, the lookup also covers targets
    written relative to the Markdown file's own directory and percent-encoded
    forward-slash paths.
    NOTE(review): which spelling pptx2md actually emits is inferred from the
    original comments in this module - confirm against pptx2md's outputter.

    :param md_file: path of the generated Markdown file
    :param img_dir: directory holding the images exported for that file
    :param del_temp_img: remove ``img_dir`` after the rewrite (default True)
    :return: None. When ``md_file`` does not exist or ``img_dir`` is not a
        directory a warning is printed and nothing is modified.
    """
    if not md_file.exists() or not img_dir.is_dir():
        print("⚠ 跳过Base64替换:MD文件或图片目录不存在")
        return

    # 1. Load the Markdown text.
    with open(md_file, 'r', encoding='utf-8') as f:
        md_content = f.read()

    # 2. Map every possible link target to the Base64 snippet of its image.
    target_to_b64 = {}
    for img_file in img_dir.glob("*.*"):
        if not img_file.is_file():
            continue
        try:
            # presumably returns a complete Markdown image snippet, since the
            # whole "![](...)" link is replaced by it below - verify against
            # Image2Base64 / OutputFormat.MARKDOWN_ALT.
            b64_md_str = Image2Base64.convert_file(
                image_path=img_file,
                max_dim=1200,
                max_kb=200,
                force_webp=True,
                out_format=OutputFormat.MARKDOWN_ALT,
                quality=80
            )
            for target in _image_link_targets(img_file, img_dir, md_file):
                target_to_b64[target] = b64_md_str
        except Exception as e:
            # Best effort: one broken image must not abort the whole document.
            print(f"⚠ 单张图片转Base64失败: {img_file.name} | 错误: {e}")
            continue

    # 3. Replace links, longest target first (kept from the original ordering).
    for target in sorted(target_to_b64, key=len, reverse=True):
        md_content = md_content.replace('![](' + target + ')', target_to_b64[target])

    # 4. Write the Markdown back with the images embedded.
    with open(md_file, 'w', encoding='utf-8') as f:
        f.write(md_content)

    # 5. Optionally drop the now-redundant image directory.
    if del_temp_img:
        try:
            import shutil
            shutil.rmtree(img_dir)
            print(f"✅ 清理临时图片目录完成: {img_dir.name}")
        except Exception as e:
            print(f"⚠ 临时图片目录清理失败: {e}")
82
+
83
def pptx_to_md(pptx_file):
    """
    Convert a single PPTX file to Markdown with images embedded as Base64.

    The Markdown file is written to the system temp directory under a
    UUID-based name. Images are first exported by pptx2md into a dedicated
    temp directory named after the same UUID, then inlined by
    ``replace_img_with_base64``.

    :param pptx_file: path of the PPTX file, passed to pptx2md as ``pptx_path``
    :return: ``Path`` of the generated Markdown file
    :raises Exception: whatever pptx2md or the inlining step raises; the temp
        image directory is removed before the exception propagates.
    """
    import shutil

    # One UUID names both the .md file and its private image directory, so
    # concurrent conversions cannot collide.
    uuid_str = IDUtil.get_uuid()
    tmp_root = Path(tempfile.gettempdir())
    md_file_path = tmp_root / f"{uuid_str}.md"
    temp_img_dir = tmp_root / f"{uuid_str}_pptx_imgs"
    temp_img_dir.mkdir(parents=True, exist_ok=True)

    try:
        config = pptx2md.ConversionConfig(
            pptx_path=pptx_file,
            output_path=md_file_path,
            image_dir=temp_img_dir,  # dedicated directory, not the shared temp root
            disable_notes=True       # speaker notes are not converted
        )
        # After this call the Markdown references image files in temp_img_dir.
        pptx2md.convert(config)
        # Swap the image links for embedded Base64 data.
        replace_img_with_base64(md_file_path, temp_img_dir)
    finally:
        # The inlining step removes the directory on success; this also covers
        # a failed conversion or an early return from the inlining step, which
        # previously left the directory behind in the temp folder.
        shutil.rmtree(temp_img_dir, ignore_errors=True)

    return md_file_path
104
+
105
+
106
def batch_pptx_to_md(input_dir: str, output_dir: str = None, image_dir: str = None):
    """
    Convert every ``*.pptx`` file in a folder to Markdown with embedded images.

    Each presentation is converted by pptx2md into ``<stem>.md``; its images
    are exported to a per-file temp directory and then inlined as Base64 by
    ``replace_img_with_base64``. A failure on one file is reported on stdout
    and does not stop the remaining files.

    Args:
        input_dir (str): folder containing the PPTX files (not searched
            recursively).
        output_dir (str, optional): folder for the Markdown files; created if
            missing. When None the files are written next to the PPTX files.
        image_dir (str, optional): kept for backward compatibility only; the
            value is ignored because images are embedded, not exported.

    Returns:
        None.

    Raises:
        ValueError: if ``input_dir`` does not exist or is not a directory.
    """
    import shutil

    input_path = Path(input_dir)
    if not input_path.exists() or not input_path.is_dir():
        raise ValueError(f"输入目录不存在或不是文件夹: {input_dir}")

    # Resolve the output folder.
    if output_dir is None:
        output_path = input_path
    else:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

    # Collect the presentations to convert.
    pptx_files = list(input_path.glob("*.pptx"))
    if not pptx_files:
        print(f"在 {input_dir} 中未找到 .pptx 文件")
        return

    print(f"发现 {len(pptx_files)} 个 PPTX 文件,开始转换(图片自动Base64内嵌)...\n")

    for pptx_file in pptx_files:
        temp_img_dir = None
        try:
            md_file = output_path / f"{pptx_file.stem}.md"
            # Private image directory per presentation so exported images of
            # different files cannot overwrite each other.
            uuid_str = IDUtil.get_uuid()
            temp_img_dir = Path(tempfile.gettempdir()) / f"{uuid_str}_{pptx_file.stem}_imgs"
            temp_img_dir.mkdir(parents=True, exist_ok=True)

            config = pptx2md.ConversionConfig(
                pptx_path=pptx_file,
                output_path=md_file,
                image_dir=temp_img_dir,
                disable_notes=True
            )

            print(f"正在转换: {pptx_file.name}")
            pptx2md.convert(config)
            # Swap the image links for embedded Base64 data.
            replace_img_with_base64(md_file, temp_img_dir)
            print(f"✅ 转换成功: {md_file.name}\n")

        except Exception as e:
            # Best effort: report and move on to the next presentation.
            print(f"❌ 转换失败: {pptx_file.name} | 错误: {e}\n")
        finally:
            # Previously a failed conversion left its image directory in the
            # temp folder; remove it whatever happened above.
            if temp_img_dir is not None:
                shutil.rmtree(temp_img_dir, ignore_errors=True)

    print("📌 批量转换完成!所有MD文件均为图片Base64内嵌格式")
161
+
162
+
163
if __name__ == "__main__":
    # Manual smoke run: convert every PPTX in a local sample folder.
    # No image folder is passed because images are embedded as Base64.
    batch_pptx_to_md(
        r'D:\测试目录_全面\ppt',   # folder containing the PPTX files
        './markdown_out',          # Markdown output folder (None = next to the PPTX files)
    )
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: indexdoc_converter
3
+ Version: 0.2.3
4
+ Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
+ Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
+ Author: 杭州智予数信息技术有限公司
7
+ Author-email: indexdoc@qq.com
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: annotated-types==0.7.0
11
+ Requires-Dist: asgiref==3.11.0
12
+ Requires-Dist: beautifulsoup4==4.14.3
13
+ Requires-Dist: bottle==0.13.4
14
+ Requires-Dist: captcha==0.7.1
15
+ Requires-Dist: certifi==2026.1.4
16
+ Requires-Dist: cffi==2.0.0
17
+ Requires-Dist: chardet==5.2.0
18
+ Requires-Dist: charset-normalizer==3.4.4
19
+ Requires-Dist: clickhouse-driver==0.2.10
20
+ Requires-Dist: clr_loader==0.2.10
21
+ Requires-Dist: cobble==0.1.4
22
+ Requires-Dist: colorama==0.4.6
23
+ Requires-Dist: cryptography==46.0.3
24
+ Requires-Dist: cssselect==1.4.0
25
+ Requires-Dist: defusedxml==0.7.1
26
+ Requires-Dist: Django==6.0.1
27
+ Requires-Dist: django-ranged-response==0.2.0
28
+ Requires-Dist: django-simple-captcha==0.6.3
29
+ Requires-Dist: duckdb==1.4.3
30
+ Requires-Dist: et_xmlfile==2.0.0
31
+ Requires-Dist: filelock==3.20.3
32
+ Requires-Dist: fonttools==4.61.1
33
+ Requires-Dist: fpdf2==2.8.5
34
+ Requires-Dist: fsspec==2026.1.0
35
+ Requires-Dist: html2text==2025.4.15
36
+ Requires-Dist: idna==3.11
37
+ Requires-Dist: image==1.5.33
38
+ Requires-Dist: Jinja2==3.1.6
39
+ Requires-Dist: lxml==6.0.2
40
+ Requires-Dist: lxml_html_clean==0.4.3
41
+ Requires-Dist: mammoth==1.11.0
42
+ Requires-Dist: markdownify==1.2.2
43
+ Requires-Dist: MarkupSafe==3.0.3
44
+ Requires-Dist: mpmath==1.3.0
45
+ Requires-Dist: networkx==3.6.1
46
+ Requires-Dist: numpy==2.4.1
47
+ Requires-Dist: odfpy==1.4.1
48
+ Requires-Dist: openpyxl==3.1.5
49
+ Requires-Dist: pandas==3.0.0
50
+ Requires-Dist: pdfkit==1.0.0
51
+ Requires-Dist: pillow==12.1.0
52
+ Requires-Dist: pptx2md==2.0.6
53
+ Requires-Dist: proxy_tools==0.1.0
54
+ Requires-Dist: psutil==7.2.1
55
+ Requires-Dist: pycparser==2.23
56
+ Requires-Dist: pydantic==2.12.5
57
+ Requires-Dist: pydantic_core==2.41.5
58
+ Requires-Dist: PyJWT==2.10.1
59
+ Requires-Dist: pyperclip==1.11.0
60
+ Requires-Dist: python-dateutil==2.9.0.post0
61
+ Requires-Dist: python-docx==1.2.0
62
+ Requires-Dist: python-pptx==1.0.2
63
+ Requires-Dist: pythonnet==3.0.5
64
+ Requires-Dist: pytz==2025.2
65
+ Requires-Dist: pywebview==6.1
66
+ Requires-Dist: pywin32==311
67
+ Requires-Dist: RapidFuzz==3.14.3
68
+ Requires-Dist: readability-lxml==0.8.4.1
69
+ Requires-Dist: requests==2.32.5
70
+ Requires-Dist: scipy==1.17.0
71
+ Requires-Dist: setuptools==80.9.0
72
+ Requires-Dist: six==1.17.0
73
+ Requires-Dist: soupsieve==2.8.3
74
+ Requires-Dist: sqlparse==0.5.5
75
+ Requires-Dist: sympy==1.14.0
76
+ Requires-Dist: torch==2.9.1
77
+ Requires-Dist: tornado==6.5.4
78
+ Requires-Dist: tqdm==4.67.1
79
+ Requires-Dist: typing-inspection==0.4.2
80
+ Requires-Dist: typing_extensions==4.15.0
81
+ Requires-Dist: tzdata==2025.3
82
+ Requires-Dist: tzlocal==5.3.1
83
+ Requires-Dist: urllib3==2.6.3
84
+ Requires-Dist: WMI==1.5.1
85
+ Requires-Dist: xlsxwriter==3.2.9
86
+ Dynamic: author
87
+ Dynamic: author-email
88
+ Dynamic: description
89
+ Dynamic: description-content-type
90
+ Dynamic: home-page
91
+ Dynamic: requires-dist
92
+ Dynamic: requires-python
93
+ Dynamic: summary
94
+
95
+ <div align="center">
96
+ <strong>简体中文</strong> | <a href="README_EN.md">English</a>
97
+ </div>
98
+
99
+ ---
100
+ # indexdoc-converter 文档转换工具库
101
+ **indexdoc-converter** 是一款基于 Python 开发的文档转换工具库,核心功能为将主流办公文档、网页文件高效转换为 Markdown 格式。各类型文件支持格式如下:
102
+ - Word 文档支持 **.docx** ;
103
+ - Excel 类表格文档支持 **.xlsx、.xls、.ods、.csv、.tsv** ;
104
+ - 网页文件支持 **.html、.mhtml、.htm 及网页url** ;
105
+ - PPT 演示文档支持 **.pptx** 。
106
+ 该工具库现已发布至 PyPI(Python Package Index),可通过 pip 包管理工具快速安装并投入使用。
107
+
108
+ [![Python Version](https://img.shields.io/badge/python-3.10+-green.svg)](https://www.python.org/) [![GitHub Stars](https://img.shields.io/github/stars/indexdoc/indexdoc-converter?style=social)](https://github.com/indexdoc/indexdoc-converter.git)
109
+
110
+
111
+ ## 库的使用
112
+ ```bash
113
+ #库安装
114
+ pip install -U indexdoc-converter #下载最新版本库
115
+ ```
116
+ - 若使用该库 python版本最小应为 Python3.10
117
+ - 包目录结构
118
+
119
+ ```bash
120
+ indexdoc-converter/ # 项目根目录
121
+ ├── indexdoc_converter/ # 核心包目录
122
+ │ ├── __init__.py # 核心代码
123
+ │ ├── docx_to_md.py # Word转Markdown工具类
124
+ │ ├── excel_to_md.py # Excel转Markdown工具类
125
+ │ ├── html_to_md.py # Html转Markdown工具类
126
+ │ ├── pptx_to_md.py # ppt转Markdown工具类
127
+ │ └── utils/
128
+ │ ├── __init__.py
129
+ │ ├── FileUtil.py
130
+ │ ├── IDUtil.py
131
+ │ └── img_to_base64.py
132
+ ```
133
+
134
+ ### 使用示例
135
+
136
+ ```python
137
+ #引用 注意引用为 indexdoc_converter 而不是 indexdoc-converter
138
+ from indexdoc_converter.docx_to_md import convert_docx_to_md
139
+ from indexdoc_converter.excel_to_md import TableToMarkdown
140
+ from indexdoc_converter.html_to_md import convert_to_md
141
+ from indexdoc_converter.pptx_to_md import pptx_to_md
142
+
143
+ # -------------------------------------------Word转Markdown---------------------------------------------------
144
+ md_text = convert_docx_to_md(r"C:\Users\xxx\测试文档.docx", False)
145
+ with open('./test.md', 'w', encoding='utf-8') as f:
146
+ f.write(md_text)
147
+
148
+ # -------------------------------------------Excel转Markdown-------------------------------------------------
149
+ # 自定义参数示例
150
+ converter = TableToMarkdown(
151
+ file_title_level=2, # 文件标题的Markdown层级,默认1(#),这里设为2(##)
152
+ single_row_value_as_title=True, # 是否将单行唯一值识别为标题,默认True
153
+ max_rows=8000, # 最大处理行数,默认6000(实际处理行数是max_rows+1)
154
+ max_cols=200 # 最大处理列数,默认128(实际处理列数是max_cols+1)
155
+ )
156
+
157
+ # 转换单个文件
158
+ file_path = r"C:\Users\xxx\测试文件.xlsx"
159
+ result = converter.convert(file_path)
160
+
161
+ # blank 模式:保留合并单元格的原始样式(只在合并单元格左上角显示内容,其余位置为空)
162
+ with open("../tmp/测试_blank.md", "w", encoding="utf-8") as f:
163
+ f.write(result['blank'])
164
+
165
+ # fill 模式:将合并单元格的内容填充到所有合并的单元格中;同时还能自动识别表格中的标题行、分割多个表格块、处理空行/空列,兼容各种表格格式的合并单元格解析。
166
+ with open("../tmp/测试_fill.md", "w", encoding="utf-8") as f:
167
+ f.write(result['fill'])
168
+
169
+ # -------------------------------------------ppt转Markdown---------------------------------------------------
170
+ ppt_file = r"C:\Users\xxx\测试文件.pptx"
171
+ md_path = pptx_to_md(ppt_file)
172
+ print(f"单文件转换完成,MD文件路径:{md_path}")
173
+
174
+ # -------------------------------------------网页文件转Markdown-----------------------------------------------
175
+ # html = "https://news.qq.com/rain/a/20260114A01NI000"
176
+ html = "https://www.aituple.com"
177
+ # html = "https://www.indexdoc.com"
178
+ # html = r"C:\Users\xxx\测试文件.html"
179
+ # html = "https://www.indexdoc.com/contact.html"
180
+ md = convert_to_md(html, '../tmp/测试html.md')
181
+ # md = mhtml_to_markdown(mhtml)
182
+ ```
183
+ ### Word文档
184
+ #### 原文档
185
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word1.png)
186
+ #### 转换后文档
187
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word2.png)
188
+
189
+ ### Excel文档
190
+ #### 原文档
191
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel1.png)
192
+ #### 转换后文档
193
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel2.png)
194
+
195
+ ### ppt文档
196
+ #### 原文档
197
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt1.png)
198
+ #### 转换后文档
199
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt2.png)
200
+
201
+ ### 网页文件
202
+ #### 原文档
203
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html1.png)
204
+ #### 转换后文档
205
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html2.png)
206
+
207
+ ## 二次开发
208
+
209
+ - Python 3.10+
210
+
211
+ ```bash
212
+ #源码地址
213
+ https://github.com/indexdoc/indexdoc-converter.git
214
+ ```
215
+ ```bash
216
+ #快速安装依赖库
217
+ pip install -r requirements.txt
218
+
219
+ # 阿里镜像源
220
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
221
+ ```
222
+
223
+
224
+ ## 📞 作者
225
+
226
+ - 作者:杭州智予数信息技术有限公司
227
+
228
+ - 邮箱:indexdoc@qq.com
@@ -11,7 +11,7 @@ with open("requirements.txt", "r", encoding="utf-8") as f:
11
11
 
12
12
  setup(
13
13
  name="indexdoc_converter", # 你的工具名称(PyPI上唯一)
14
- version="0.2.1", # 版本号(遵循语义化版本)
14
+ version="0.2.3", # 版本号(遵循语义化版本)
15
15
  description="可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。",
16
16
  long_description=README,
17
17
  long_description_content_type="text/markdown",
@@ -1,92 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: indexdoc_converter
3
- Version: 0.2.1
4
- Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
- Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
- Author: 杭州智予数信息技术有限公司
7
- Author-email: indexdoc@qq.com
8
- Requires-Python: >=3.10
9
- Description-Content-Type: text/markdown
10
- Requires-Dist: annotated-types==0.7.0
11
- Requires-Dist: asgiref==3.11.0
12
- Requires-Dist: beautifulsoup4==4.14.3
13
- Requires-Dist: bottle==0.13.4
14
- Requires-Dist: captcha==0.7.1
15
- Requires-Dist: certifi==2026.1.4
16
- Requires-Dist: cffi==2.0.0
17
- Requires-Dist: chardet==5.2.0
18
- Requires-Dist: charset-normalizer==3.4.4
19
- Requires-Dist: clickhouse-driver==0.2.10
20
- Requires-Dist: clr_loader==0.2.10
21
- Requires-Dist: cobble==0.1.4
22
- Requires-Dist: colorama==0.4.6
23
- Requires-Dist: cryptography==46.0.3
24
- Requires-Dist: cssselect==1.4.0
25
- Requires-Dist: defusedxml==0.7.1
26
- Requires-Dist: Django==6.0.1
27
- Requires-Dist: django-ranged-response==0.2.0
28
- Requires-Dist: django-simple-captcha==0.6.3
29
- Requires-Dist: duckdb==1.4.3
30
- Requires-Dist: et_xmlfile==2.0.0
31
- Requires-Dist: filelock==3.20.3
32
- Requires-Dist: fonttools==4.61.1
33
- Requires-Dist: fpdf2==2.8.5
34
- Requires-Dist: fsspec==2026.1.0
35
- Requires-Dist: html2text==2025.4.15
36
- Requires-Dist: idna==3.11
37
- Requires-Dist: image==1.5.33
38
- Requires-Dist: Jinja2==3.1.6
39
- Requires-Dist: lxml==6.0.2
40
- Requires-Dist: lxml_html_clean==0.4.3
41
- Requires-Dist: mammoth==1.11.0
42
- Requires-Dist: markdownify==1.2.2
43
- Requires-Dist: MarkupSafe==3.0.3
44
- Requires-Dist: mpmath==1.3.0
45
- Requires-Dist: networkx==3.6.1
46
- Requires-Dist: numpy==2.4.1
47
- Requires-Dist: odfpy==1.4.1
48
- Requires-Dist: openpyxl==3.1.5
49
- Requires-Dist: pandas==3.0.0
50
- Requires-Dist: pdfkit==1.0.0
51
- Requires-Dist: pillow==12.1.0
52
- Requires-Dist: pptx2md==2.0.6
53
- Requires-Dist: proxy_tools==0.1.0
54
- Requires-Dist: psutil==7.2.1
55
- Requires-Dist: pycparser==2.23
56
- Requires-Dist: pydantic==2.12.5
57
- Requires-Dist: pydantic_core==2.41.5
58
- Requires-Dist: PyJWT==2.10.1
59
- Requires-Dist: pyperclip==1.11.0
60
- Requires-Dist: python-dateutil==2.9.0.post0
61
- Requires-Dist: python-docx==1.2.0
62
- Requires-Dist: python-pptx==1.0.2
63
- Requires-Dist: pythonnet==3.0.5
64
- Requires-Dist: pytz==2025.2
65
- Requires-Dist: pywebview==6.1
66
- Requires-Dist: pywin32==311
67
- Requires-Dist: RapidFuzz==3.14.3
68
- Requires-Dist: readability-lxml==0.8.4.1
69
- Requires-Dist: requests==2.32.5
70
- Requires-Dist: scipy==1.17.0
71
- Requires-Dist: setuptools==80.9.0
72
- Requires-Dist: six==1.17.0
73
- Requires-Dist: soupsieve==2.8.3
74
- Requires-Dist: sqlparse==0.5.5
75
- Requires-Dist: sympy==1.14.0
76
- Requires-Dist: torch==2.9.1
77
- Requires-Dist: tornado==6.5.4
78
- Requires-Dist: tqdm==4.67.1
79
- Requires-Dist: typing-inspection==0.4.2
80
- Requires-Dist: typing_extensions==4.15.0
81
- Requires-Dist: tzdata==2025.3
82
- Requires-Dist: tzlocal==5.3.1
83
- Requires-Dist: urllib3==2.6.3
84
- Requires-Dist: WMI==1.5.1
85
- Requires-Dist: xlsxwriter==3.2.9
86
- Dynamic: author
87
- Dynamic: author-email
88
- Dynamic: description-content-type
89
- Dynamic: home-page
90
- Dynamic: requires-dist
91
- Dynamic: requires-python
92
- Dynamic: summary
File without changes
@@ -1,79 +0,0 @@
1
- import pptx2md
2
- from pathlib import Path
3
- import tempfile
4
-
5
- from utils import IDUtil
6
-
7
- def pptx_to_md(pptx_file):
8
- md_file_path = Path(tempfile.gettempdir()) / f"{IDUtil.get_uuid()}.md"
9
- config = pptx2md.ConversionConfig(
10
- pptx_path=pptx_file,
11
- output_path=md_file_path,
12
- image_dir=Path(tempfile.gettempdir()),
13
- disable_notes=True # 根据需求选择是否禁用备注页
14
- )
15
- pptx2md.convert(config)
16
- return md_file_path
17
-
18
- def batch_pptx_to_md(input_dir: str, output_dir: str = None, image_dir: str = None):
19
- """
20
- 使用指定的输入目录中的所有 .pptx 文件,并将它们转换为 Markdown 文件。
21
-
22
- Args:
23
- input_dir (str): 包含 PPTX 文件的文件夹路径
24
- output_dir (str, optional): 输出 Markdown 文件的文件夹。若为 None,则输出到与 PPTX 相同位置
25
- image_dir (str, optional): 图片保存目录。如果需要导出图片的话设置此项
26
- """
27
- input_path = Path(input_dir)
28
- if not input_path.exists() or not input_path.is_dir():
29
- raise ValueError(f"输入目录不存在或不是文件夹: {input_dir}")
30
-
31
- # 如果没有指定输出目录,则默认为输入目录
32
- if output_dir is None:
33
- output_path = input_path
34
- else:
35
- output_path = Path(output_dir)
36
- output_path.mkdir(parents=True, exist_ok=True)
37
-
38
- # 如果需要导出图片,创建图片目录
39
- if image_dir:
40
- img_path = Path(image_dir)
41
- img_path.mkdir(parents=True, exist_ok=True)
42
- else:
43
- img_path = None
44
-
45
- pptx_files = list(input_path.glob("*.pptx"))
46
- if not pptx_files:
47
- print(f"在 {input_dir} 中未找到 .pptx 文件")
48
- return
49
-
50
- print(f"发现 {len(pptx_files)} 个 PPTX 文件,开始转换...")
51
-
52
- for pptx_file in pptx_files:
53
- try:
54
- md_file = output_path / f"{pptx_file.stem}.md"
55
-
56
- config = pptx2md.ConversionConfig(
57
- pptx_path=pptx_file,
58
- output_path=md_file,
59
- image_dir=img_path,
60
- disable_notes=True # 根据需求选择是否禁用备注页
61
- )
62
-
63
- print(f"正在转换: {pptx_file.name} → {md_file.name}")
64
-
65
- pptx2md.convert(config)
66
- print(f"✅ 成功: {md_file.name}")
67
- except Exception as e:
68
- print(f"❌ 失败: {pptx_file.name} - {e}")
69
-
70
- print("批量转换完成!")
71
-
72
-
73
- if __name__ == "__main__":
74
- # 你可以在这里直接设置路径,或者通过命令行参数传递
75
- input_folder = r'D:\测试目录_全面\ppt' # 替换为你的PPTX文件夹路径
76
- output_folder = './markdown_out' # 可选,设为None则输出到原目录
77
- image_folder = './markdown_out/images' # 如果你需要导出图片,设置此路径;否则保持None
78
-
79
- batch_pptx_to_md(input_folder, output_folder, image_folder)
@@ -1,92 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: indexdoc_converter
3
- Version: 0.2.1
4
- Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
- Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
- Author: 杭州智予数信息技术有限公司
7
- Author-email: indexdoc@qq.com
8
- Requires-Python: >=3.10
9
- Description-Content-Type: text/markdown
10
- Requires-Dist: annotated-types==0.7.0
11
- Requires-Dist: asgiref==3.11.0
12
- Requires-Dist: beautifulsoup4==4.14.3
13
- Requires-Dist: bottle==0.13.4
14
- Requires-Dist: captcha==0.7.1
15
- Requires-Dist: certifi==2026.1.4
16
- Requires-Dist: cffi==2.0.0
17
- Requires-Dist: chardet==5.2.0
18
- Requires-Dist: charset-normalizer==3.4.4
19
- Requires-Dist: clickhouse-driver==0.2.10
20
- Requires-Dist: clr_loader==0.2.10
21
- Requires-Dist: cobble==0.1.4
22
- Requires-Dist: colorama==0.4.6
23
- Requires-Dist: cryptography==46.0.3
24
- Requires-Dist: cssselect==1.4.0
25
- Requires-Dist: defusedxml==0.7.1
26
- Requires-Dist: Django==6.0.1
27
- Requires-Dist: django-ranged-response==0.2.0
28
- Requires-Dist: django-simple-captcha==0.6.3
29
- Requires-Dist: duckdb==1.4.3
30
- Requires-Dist: et_xmlfile==2.0.0
31
- Requires-Dist: filelock==3.20.3
32
- Requires-Dist: fonttools==4.61.1
33
- Requires-Dist: fpdf2==2.8.5
34
- Requires-Dist: fsspec==2026.1.0
35
- Requires-Dist: html2text==2025.4.15
36
- Requires-Dist: idna==3.11
37
- Requires-Dist: image==1.5.33
38
- Requires-Dist: Jinja2==3.1.6
39
- Requires-Dist: lxml==6.0.2
40
- Requires-Dist: lxml_html_clean==0.4.3
41
- Requires-Dist: mammoth==1.11.0
42
- Requires-Dist: markdownify==1.2.2
43
- Requires-Dist: MarkupSafe==3.0.3
44
- Requires-Dist: mpmath==1.3.0
45
- Requires-Dist: networkx==3.6.1
46
- Requires-Dist: numpy==2.4.1
47
- Requires-Dist: odfpy==1.4.1
48
- Requires-Dist: openpyxl==3.1.5
49
- Requires-Dist: pandas==3.0.0
50
- Requires-Dist: pdfkit==1.0.0
51
- Requires-Dist: pillow==12.1.0
52
- Requires-Dist: pptx2md==2.0.6
53
- Requires-Dist: proxy_tools==0.1.0
54
- Requires-Dist: psutil==7.2.1
55
- Requires-Dist: pycparser==2.23
56
- Requires-Dist: pydantic==2.12.5
57
- Requires-Dist: pydantic_core==2.41.5
58
- Requires-Dist: PyJWT==2.10.1
59
- Requires-Dist: pyperclip==1.11.0
60
- Requires-Dist: python-dateutil==2.9.0.post0
61
- Requires-Dist: python-docx==1.2.0
62
- Requires-Dist: python-pptx==1.0.2
63
- Requires-Dist: pythonnet==3.0.5
64
- Requires-Dist: pytz==2025.2
65
- Requires-Dist: pywebview==6.1
66
- Requires-Dist: pywin32==311
67
- Requires-Dist: RapidFuzz==3.14.3
68
- Requires-Dist: readability-lxml==0.8.4.1
69
- Requires-Dist: requests==2.32.5
70
- Requires-Dist: scipy==1.17.0
71
- Requires-Dist: setuptools==80.9.0
72
- Requires-Dist: six==1.17.0
73
- Requires-Dist: soupsieve==2.8.3
74
- Requires-Dist: sqlparse==0.5.5
75
- Requires-Dist: sympy==1.14.0
76
- Requires-Dist: torch==2.9.1
77
- Requires-Dist: tornado==6.5.4
78
- Requires-Dist: tqdm==4.67.1
79
- Requires-Dist: typing-inspection==0.4.2
80
- Requires-Dist: typing_extensions==4.15.0
81
- Requires-Dist: tzdata==2025.3
82
- Requires-Dist: tzlocal==5.3.1
83
- Requires-Dist: urllib3==2.6.3
84
- Requires-Dist: WMI==1.5.1
85
- Requires-Dist: xlsxwriter==3.2.9
86
- Dynamic: author
87
- Dynamic: author-email
88
- Dynamic: description-content-type
89
- Dynamic: home-page
90
- Dynamic: requires-dist
91
- Dynamic: requires-python
92
- Dynamic: summary