indexdoc-converter 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21) hide show
  1. indexdoc_converter-0.2.3/PKG-INFO +228 -0
  2. indexdoc_converter-0.2.3/README.md +134 -0
  3. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/pptx_to_md.py +0 -2
  4. indexdoc_converter-0.2.3/indexdoc_converter.egg-info/PKG-INFO +228 -0
  5. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/setup.py +1 -1
  6. indexdoc_converter-0.2.2/PKG-INFO +0 -92
  7. indexdoc_converter-0.2.2/README.md +0 -0
  8. indexdoc_converter-0.2.2/indexdoc_converter.egg-info/PKG-INFO +0 -92
  9. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/__init__.py +0 -0
  10. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/docx_to_md.py +0 -0
  11. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/excel_to_md.py +0 -0
  12. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/html_to_md.py +0 -0
  13. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/utils/FileUtil.py +0 -0
  14. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/utils/IDUtil.py +0 -0
  15. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/utils/__init__.py +0 -0
  16. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter/utils/img_to_base64.py +0 -0
  17. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter.egg-info/SOURCES.txt +0 -0
  18. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter.egg-info/dependency_links.txt +0 -0
  19. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter.egg-info/requires.txt +0 -0
  20. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/indexdoc_converter.egg-info/top_level.txt +0 -0
  21. {indexdoc_converter-0.2.2 → indexdoc_converter-0.2.3}/setup.cfg +0 -0
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: indexdoc_converter
3
+ Version: 0.2.3
4
+ Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
+ Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
+ Author: 杭州智予数信息技术有限公司
7
+ Author-email: indexdoc@qq.com
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: annotated-types==0.7.0
11
+ Requires-Dist: asgiref==3.11.0
12
+ Requires-Dist: beautifulsoup4==4.14.3
13
+ Requires-Dist: bottle==0.13.4
14
+ Requires-Dist: captcha==0.7.1
15
+ Requires-Dist: certifi==2026.1.4
16
+ Requires-Dist: cffi==2.0.0
17
+ Requires-Dist: chardet==5.2.0
18
+ Requires-Dist: charset-normalizer==3.4.4
19
+ Requires-Dist: clickhouse-driver==0.2.10
20
+ Requires-Dist: clr_loader==0.2.10
21
+ Requires-Dist: cobble==0.1.4
22
+ Requires-Dist: colorama==0.4.6
23
+ Requires-Dist: cryptography==46.0.3
24
+ Requires-Dist: cssselect==1.4.0
25
+ Requires-Dist: defusedxml==0.7.1
26
+ Requires-Dist: Django==6.0.1
27
+ Requires-Dist: django-ranged-response==0.2.0
28
+ Requires-Dist: django-simple-captcha==0.6.3
29
+ Requires-Dist: duckdb==1.4.3
30
+ Requires-Dist: et_xmlfile==2.0.0
31
+ Requires-Dist: filelock==3.20.3
32
+ Requires-Dist: fonttools==4.61.1
33
+ Requires-Dist: fpdf2==2.8.5
34
+ Requires-Dist: fsspec==2026.1.0
35
+ Requires-Dist: html2text==2025.4.15
36
+ Requires-Dist: idna==3.11
37
+ Requires-Dist: image==1.5.33
38
+ Requires-Dist: Jinja2==3.1.6
39
+ Requires-Dist: lxml==6.0.2
40
+ Requires-Dist: lxml_html_clean==0.4.3
41
+ Requires-Dist: mammoth==1.11.0
42
+ Requires-Dist: markdownify==1.2.2
43
+ Requires-Dist: MarkupSafe==3.0.3
44
+ Requires-Dist: mpmath==1.3.0
45
+ Requires-Dist: networkx==3.6.1
46
+ Requires-Dist: numpy==2.4.1
47
+ Requires-Dist: odfpy==1.4.1
48
+ Requires-Dist: openpyxl==3.1.5
49
+ Requires-Dist: pandas==3.0.0
50
+ Requires-Dist: pdfkit==1.0.0
51
+ Requires-Dist: pillow==12.1.0
52
+ Requires-Dist: pptx2md==2.0.6
53
+ Requires-Dist: proxy_tools==0.1.0
54
+ Requires-Dist: psutil==7.2.1
55
+ Requires-Dist: pycparser==2.23
56
+ Requires-Dist: pydantic==2.12.5
57
+ Requires-Dist: pydantic_core==2.41.5
58
+ Requires-Dist: PyJWT==2.10.1
59
+ Requires-Dist: pyperclip==1.11.0
60
+ Requires-Dist: python-dateutil==2.9.0.post0
61
+ Requires-Dist: python-docx==1.2.0
62
+ Requires-Dist: python-pptx==1.0.2
63
+ Requires-Dist: pythonnet==3.0.5
64
+ Requires-Dist: pytz==2025.2
65
+ Requires-Dist: pywebview==6.1
66
+ Requires-Dist: pywin32==311
67
+ Requires-Dist: RapidFuzz==3.14.3
68
+ Requires-Dist: readability-lxml==0.8.4.1
69
+ Requires-Dist: requests==2.32.5
70
+ Requires-Dist: scipy==1.17.0
71
+ Requires-Dist: setuptools==80.9.0
72
+ Requires-Dist: six==1.17.0
73
+ Requires-Dist: soupsieve==2.8.3
74
+ Requires-Dist: sqlparse==0.5.5
75
+ Requires-Dist: sympy==1.14.0
76
+ Requires-Dist: torch==2.9.1
77
+ Requires-Dist: tornado==6.5.4
78
+ Requires-Dist: tqdm==4.67.1
79
+ Requires-Dist: typing-inspection==0.4.2
80
+ Requires-Dist: typing_extensions==4.15.0
81
+ Requires-Dist: tzdata==2025.3
82
+ Requires-Dist: tzlocal==5.3.1
83
+ Requires-Dist: urllib3==2.6.3
84
+ Requires-Dist: WMI==1.5.1
85
+ Requires-Dist: xlsxwriter==3.2.9
86
+ Dynamic: author
87
+ Dynamic: author-email
88
+ Dynamic: description
89
+ Dynamic: description-content-type
90
+ Dynamic: home-page
91
+ Dynamic: requires-dist
92
+ Dynamic: requires-python
93
+ Dynamic: summary
94
+
95
+ <div align="center">
96
+ <strong>简体中文</strong> | <a href="README_EN.md">English</a>
97
+ </div>
98
+
99
+ ---
100
+ # indexdoc-converter 文档转换工具库
101
+ **indexdoc-converter** 是一款基于 Python 开发的文档转换工具库,核心功能为将主流办公文档、网页文件高效转换为 Markdown 格式。各类型文件支持格式如下:
102
+ - Word 文档支持 **.docx** ;
103
+ - Excel 类表格文档支持 **.xlsx、.xls、.ods、.csv、.tsv** ;
104
+ - 网页文件支持 **.html、.mhtml、.htm 及网页url** ;
105
+ - PPT 演示文档支持 **.pptx** 。
106
+ 该工具库现已发布至 PyPI(Python Package Index),可通过 pip 包管理工具快速安装并投入使用。
107
+
108
+ [![Python Version](https://img.shields.io/badge/python-3.10+-green.svg)](https://www.python.org/) [![GitHub Stars](https://img.shields.io/github/stars/indexdoc/indexdoc-converter?style=social)](https://github.com/indexdoc/indexdoc-converter.git)
109
+
110
+
111
+ ## 库的使用
112
+ ```bash
113
+ #库安装
114
+ pip install -U indexdoc-converter #下载最新版本库
115
+ ```
116
+ - 使用该库要求 Python 版本不低于 3.10
117
+ - 包目录结构
118
+
119
+ ```bash
120
+ indexdoc-converter/ # 项目根目录
121
+ ├── indexdoc_converter/ # 核心包目录
122
+ │ ├── __init__.py # 核心代码
123
+ │ ├── docx_to_md.py # Word转Markdown工具类
124
+ │ ├── excel_to_md.py # Excel转Markdown工具类
125
+ │ ├── html_to_md.py # Html转Markdown工具类
126
+ │ ├── pptx_to_md.py # ppt转Markdown工具类
127
+ │ └── utils/
128
+ │ ├── __init__.py
129
+ │ ├── FileUtil.py
130
+ │ ├── IDUtil.py
131
+ │ └── img_to_base64.py
132
+ ```
133
+
134
+ ### 使用示例
135
+
136
+ ```python
137
+ #引用 注意引用为 indexdoc_converter 而不是 indexdoc-converter
138
+ from indexdoc_converter.docx_to_md import convert_docx_to_md
139
+ from indexdoc_converter.excel_to_md import TableToMarkdown
140
+ from indexdoc_converter.html_to_md import convert_to_md
141
+ from indexdoc_converter.pptx_to_md import pptx_to_md
142
+
143
+ # -------------------------------------------Word转Markdown---------------------------------------------------
144
+ md_text = convert_docx_to_md(r"C:\Users\xxx\测试文档.docx", False)
145
+ with open('./test.md', 'w', encoding='utf-8') as f:
146
+     f.write(md_text)
147
+
148
+ # -------------------------------------------Excel转Markdown-------------------------------------------------
149
+ # 自定义参数示例
150
+ converter = TableToMarkdown(
151
+ file_title_level=2, # 文件标题的Markdown层级,默认1(#),这里设为2(##)
152
+ single_row_value_as_title=True, # 是否将单行唯一值识别为标题,默认True
153
+ max_rows=8000, # 最大处理行数,默认6000(实际处理行数是max_rows+1)
154
+ max_cols=200 # 最大处理列数,默认128(实际处理列数是max_cols+1)
155
+ )
156
+
157
+ # 转换单个文件
158
+ file_path = r"C:\Users\xxx\测试文件.xlsx"
159
+ result = converter.convert(file_path)
160
+
161
+ # blank 模式:保留合并单元格的原始样式(只在合并单元格左上角显示内容,其余位置为空)
162
+ with open("../tmp/测试_blank.md", "w", encoding="utf-8") as f:
163
+     f.write(result['blank'])
164
+
165
+ # fill 模式:将合并单元格的内容填充到所有被合并的单元格中;同时还能自动识别表格中的标题行、分割多个表格块、处理空行/空列,并兼容各种表格格式的合并单元格解析。
166
+ with open("../tmp/测试_fill.md", "w", encoding="utf-8") as f:
167
+     f.write(result['fill'])
168
+
169
+ # -------------------------------------------ppt转Markdown---------------------------------------------------
170
+ ppt_file = r"C:\Users\xxx\测试文件.pptx"
171
+ md_path = pptx_to_md(ppt_file)
172
+ print(f"单文件转换完成,MD文件路径:{md_path}")
173
+
174
+ # -------------------------------------------网页文件转Markdown-----------------------------------------------
175
+ # html = "https://news.qq.com/rain/a/20260114A01NI000"
176
+ html = "https://www.aituple.com"
177
+ # html = "https://www.indexdoc.com"
178
+ # html = r"C:\Users\xxx\测试文件.html"
179
+ # html = "https://www.indexdoc.com/contact.html"
180
+ md = convert_to_md(html, '../tmp/测试html.md')
181
+ # md = mhtml_to_markdown(mhtml)
182
+ ```
183
+ ### Word文档
184
+ #### 原文档
185
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word1.png)
186
+ #### 转换后文档
187
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word2.png)
188
+
189
+ ### Excel文档
190
+ #### 原文档
191
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel1.png)
192
+ #### 转换后文档
193
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel2.png)
194
+
195
+ ### ppt文档
196
+ #### 原文档
197
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt1.png)
198
+ #### 转换后文档
199
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt2.png)
200
+
201
+ ### 网页文件
202
+ #### 原文档
203
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html1.png)
204
+ #### 转换后文档
205
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html2.png)
206
+
207
+ ## 二次开发
208
+
209
+ - Python 3.10+
210
+
211
+ ```bash
212
+ #源码地址
213
+ https://github.com/indexdoc/indexdoc-converter.git
214
+ ```
215
+ ```bash
216
+ #快速安装依赖库
217
+ pip install -r requirements.txt
218
+
219
+ # 阿里镜像源
220
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
221
+ ```
222
+
223
+
224
+ ## 📞 作者
225
+
226
+ - 作者:杭州智予数信息技术有限公司
227
+
228
+ - 邮箱:indexdoc@qq.com
@@ -0,0 +1,134 @@
1
+ <div align="center">
2
+ <strong>简体中文</strong> | <a href="README_EN.md">English</a>
3
+ </div>
4
+
5
+ ---
6
+ # indexdoc-converter 文档转换工具库
7
+ **indexdoc-converter** 是一款基于 Python 开发的文档转换工具库,核心功能为将主流办公文档、网页文件高效转换为 Markdown 格式。各类型文件支持格式如下:
8
+ - Word 文档支持 **.docx** ;
9
+ - Excel 类表格文档支持 **.xlsx、.xls、.ods、.csv、.tsv** ;
10
+ - 网页文件支持 **.html、.mhtml、.htm 及网页url** ;
11
+ - PPT 演示文档支持 **.pptx** 。
12
+ 该工具库现已发布至 PyPI(Python Package Index),可通过 pip 包管理工具快速安装并投入使用。
13
+
14
+ [![Python Version](https://img.shields.io/badge/python-3.10+-green.svg)](https://www.python.org/) [![GitHub Stars](https://img.shields.io/github/stars/indexdoc/indexdoc-converter?style=social)](https://github.com/indexdoc/indexdoc-converter.git)
15
+
16
+
17
+ ## 库的使用
18
+ ```bash
19
+ #库安装
20
+ pip install -U indexdoc-converter #下载最新版本库
21
+ ```
22
+ - 使用该库要求 Python 版本不低于 3.10
23
+ - 包目录结构
24
+
25
+ ```bash
26
+ indexdoc-converter/ # 项目根目录
27
+ ├── indexdoc_converter/ # 核心包目录
28
+ │ ├── __init__.py # 核心代码
29
+ │ ├── docx_to_md.py # Word转Markdown工具类
30
+ │ ├── excel_to_md.py # Excel转Markdown工具类
31
+ │ ├── html_to_md.py # Html转Markdown工具类
32
+ │ ├── pptx_to_md.py # ppt转Markdown工具类
33
+ │ └── utils/
34
+ │ ├── __init__.py
35
+ │ ├── FileUtil.py
36
+ │ ├── IDUtil.py
37
+ │ └── img_to_base64.py
38
+ ```
39
+
40
+ ### 使用示例
41
+
42
+ ```python
43
+ #引用 注意引用为 indexdoc_converter 而不是 indexdoc-converter
44
+ from indexdoc_converter.docx_to_md import convert_docx_to_md
45
+ from indexdoc_converter.excel_to_md import TableToMarkdown
46
+ from indexdoc_converter.html_to_md import convert_to_md
47
+ from indexdoc_converter.pptx_to_md import pptx_to_md
48
+
49
+ # -------------------------------------------Word转Markdown---------------------------------------------------
50
+ md_text = convert_docx_to_md(r"C:\Users\xxx\测试文档.docx", False)
51
+ with open('./test.md', 'w', encoding='utf-8') as f:
52
+     f.write(md_text)
53
+
54
+ # -------------------------------------------Excel转Markdown-------------------------------------------------
55
+ # 自定义参数示例
56
+ converter = TableToMarkdown(
57
+ file_title_level=2, # 文件标题的Markdown层级,默认1(#),这里设为2(##)
58
+ single_row_value_as_title=True, # 是否将单行唯一值识别为标题,默认True
59
+ max_rows=8000, # 最大处理行数,默认6000(实际处理行数是max_rows+1)
60
+ max_cols=200 # 最大处理列数,默认128(实际处理列数是max_cols+1)
61
+ )
62
+
63
+ # 转换单个文件
64
+ file_path = r"C:\Users\xxx\测试文件.xlsx"
65
+ result = converter.convert(file_path)
66
+
67
+ # blank 模式:保留合并单元格的原始样式(只在合并单元格左上角显示内容,其余位置为空)
68
+ with open("../tmp/测试_blank.md", "w", encoding="utf-8") as f:
69
+     f.write(result['blank'])
70
+
71
+ # fill 模式:将合并单元格的内容填充到所有被合并的单元格中;同时还能自动识别表格中的标题行、分割多个表格块、处理空行/空列,并兼容各种表格格式的合并单元格解析。
72
+ with open("../tmp/测试_fill.md", "w", encoding="utf-8") as f:
73
+     f.write(result['fill'])
74
+
75
+ # -------------------------------------------ppt转Markdown---------------------------------------------------
76
+ ppt_file = r"C:\Users\xxx\测试文件.pptx"
77
+ md_path = pptx_to_md(ppt_file)
78
+ print(f"单文件转换完成,MD文件路径:{md_path}")
79
+
80
+ # -------------------------------------------网页文件转Markdown-----------------------------------------------
81
+ # html = "https://news.qq.com/rain/a/20260114A01NI000"
82
+ html = "https://www.aituple.com"
83
+ # html = "https://www.indexdoc.com"
84
+ # html = r"C:\Users\xxx\测试文件.html"
85
+ # html = "https://www.indexdoc.com/contact.html"
86
+ md = convert_to_md(html, '../tmp/测试html.md')
87
+ # md = mhtml_to_markdown(mhtml)
88
+ ```
89
+ ### Word文档
90
+ #### 原文档
91
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word1.png)
92
+ #### 转换后文档
93
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word2.png)
94
+
95
+ ### Excel文档
96
+ #### 原文档
97
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel1.png)
98
+ #### 转换后文档
99
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel2.png)
100
+
101
+ ### ppt文档
102
+ #### 原文档
103
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt1.png)
104
+ #### 转换后文档
105
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt2.png)
106
+
107
+ ### 网页文件
108
+ #### 原文档
109
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html1.png)
110
+ #### 转换后文档
111
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html2.png)
112
+
113
+ ## 二次开发
114
+
115
+ - Python 3.10+
116
+
117
+ ```bash
118
+ #源码地址
119
+ https://github.com/indexdoc/indexdoc-converter.git
120
+ ```
121
+ ```bash
122
+ #快速安装依赖库
123
+ pip install -r requirements.txt
124
+
125
+ # 阿里镜像源
126
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
127
+ ```
128
+
129
+
130
+ ## 📞 作者
131
+
132
+ - 作者:杭州智予数信息技术有限公司
133
+
134
+ - 邮箱:indexdoc@qq.com
@@ -1,8 +1,6 @@
1
1
  import pptx2md
2
2
  from pathlib import Path
3
- import os
4
3
  import tempfile
5
- import shutil # 新增:用于清理临时目录
6
4
 
7
5
 
8
6
 
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.4
2
+ Name: indexdoc_converter
3
+ Version: 0.2.3
4
+ Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
+ Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
+ Author: 杭州智予数信息技术有限公司
7
+ Author-email: indexdoc@qq.com
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: annotated-types==0.7.0
11
+ Requires-Dist: asgiref==3.11.0
12
+ Requires-Dist: beautifulsoup4==4.14.3
13
+ Requires-Dist: bottle==0.13.4
14
+ Requires-Dist: captcha==0.7.1
15
+ Requires-Dist: certifi==2026.1.4
16
+ Requires-Dist: cffi==2.0.0
17
+ Requires-Dist: chardet==5.2.0
18
+ Requires-Dist: charset-normalizer==3.4.4
19
+ Requires-Dist: clickhouse-driver==0.2.10
20
+ Requires-Dist: clr_loader==0.2.10
21
+ Requires-Dist: cobble==0.1.4
22
+ Requires-Dist: colorama==0.4.6
23
+ Requires-Dist: cryptography==46.0.3
24
+ Requires-Dist: cssselect==1.4.0
25
+ Requires-Dist: defusedxml==0.7.1
26
+ Requires-Dist: Django==6.0.1
27
+ Requires-Dist: django-ranged-response==0.2.0
28
+ Requires-Dist: django-simple-captcha==0.6.3
29
+ Requires-Dist: duckdb==1.4.3
30
+ Requires-Dist: et_xmlfile==2.0.0
31
+ Requires-Dist: filelock==3.20.3
32
+ Requires-Dist: fonttools==4.61.1
33
+ Requires-Dist: fpdf2==2.8.5
34
+ Requires-Dist: fsspec==2026.1.0
35
+ Requires-Dist: html2text==2025.4.15
36
+ Requires-Dist: idna==3.11
37
+ Requires-Dist: image==1.5.33
38
+ Requires-Dist: Jinja2==3.1.6
39
+ Requires-Dist: lxml==6.0.2
40
+ Requires-Dist: lxml_html_clean==0.4.3
41
+ Requires-Dist: mammoth==1.11.0
42
+ Requires-Dist: markdownify==1.2.2
43
+ Requires-Dist: MarkupSafe==3.0.3
44
+ Requires-Dist: mpmath==1.3.0
45
+ Requires-Dist: networkx==3.6.1
46
+ Requires-Dist: numpy==2.4.1
47
+ Requires-Dist: odfpy==1.4.1
48
+ Requires-Dist: openpyxl==3.1.5
49
+ Requires-Dist: pandas==3.0.0
50
+ Requires-Dist: pdfkit==1.0.0
51
+ Requires-Dist: pillow==12.1.0
52
+ Requires-Dist: pptx2md==2.0.6
53
+ Requires-Dist: proxy_tools==0.1.0
54
+ Requires-Dist: psutil==7.2.1
55
+ Requires-Dist: pycparser==2.23
56
+ Requires-Dist: pydantic==2.12.5
57
+ Requires-Dist: pydantic_core==2.41.5
58
+ Requires-Dist: PyJWT==2.10.1
59
+ Requires-Dist: pyperclip==1.11.0
60
+ Requires-Dist: python-dateutil==2.9.0.post0
61
+ Requires-Dist: python-docx==1.2.0
62
+ Requires-Dist: python-pptx==1.0.2
63
+ Requires-Dist: pythonnet==3.0.5
64
+ Requires-Dist: pytz==2025.2
65
+ Requires-Dist: pywebview==6.1
66
+ Requires-Dist: pywin32==311
67
+ Requires-Dist: RapidFuzz==3.14.3
68
+ Requires-Dist: readability-lxml==0.8.4.1
69
+ Requires-Dist: requests==2.32.5
70
+ Requires-Dist: scipy==1.17.0
71
+ Requires-Dist: setuptools==80.9.0
72
+ Requires-Dist: six==1.17.0
73
+ Requires-Dist: soupsieve==2.8.3
74
+ Requires-Dist: sqlparse==0.5.5
75
+ Requires-Dist: sympy==1.14.0
76
+ Requires-Dist: torch==2.9.1
77
+ Requires-Dist: tornado==6.5.4
78
+ Requires-Dist: tqdm==4.67.1
79
+ Requires-Dist: typing-inspection==0.4.2
80
+ Requires-Dist: typing_extensions==4.15.0
81
+ Requires-Dist: tzdata==2025.3
82
+ Requires-Dist: tzlocal==5.3.1
83
+ Requires-Dist: urllib3==2.6.3
84
+ Requires-Dist: WMI==1.5.1
85
+ Requires-Dist: xlsxwriter==3.2.9
86
+ Dynamic: author
87
+ Dynamic: author-email
88
+ Dynamic: description
89
+ Dynamic: description-content-type
90
+ Dynamic: home-page
91
+ Dynamic: requires-dist
92
+ Dynamic: requires-python
93
+ Dynamic: summary
94
+
95
+ <div align="center">
96
+ <strong>简体中文</strong> | <a href="README_EN.md">English</a>
97
+ </div>
98
+
99
+ ---
100
+ # indexdoc-converter 文档转换工具库
101
+ **indexdoc-converter** 是一款基于 Python 开发的文档转换工具库,核心功能为将主流办公文档、网页文件高效转换为 Markdown 格式。各类型文件支持格式如下:
102
+ - Word 文档支持 **.docx** ;
103
+ - Excel 类表格文档支持 **.xlsx、.xls、.ods、.csv、.tsv** ;
104
+ - 网页文件支持 **.html、.mhtml、.htm 及网页url** ;
105
+ - PPT 演示文档支持 **.pptx** 。
106
+ 该工具库现已发布至 PyPI(Python Package Index),可通过 pip 包管理工具快速安装并投入使用。
107
+
108
+ [![Python Version](https://img.shields.io/badge/python-3.10+-green.svg)](https://www.python.org/) [![GitHub Stars](https://img.shields.io/github/stars/indexdoc/indexdoc-converter?style=social)](https://github.com/indexdoc/indexdoc-converter.git)
109
+
110
+
111
+ ## 库的使用
112
+ ```bash
113
+ #库安装
114
+ pip install -U indexdoc-converter #下载最新版本库
115
+ ```
116
+ - 使用该库要求 Python 版本不低于 3.10
117
+ - 包目录结构
118
+
119
+ ```bash
120
+ indexdoc-converter/ # 项目根目录
121
+ ├── indexdoc_converter/ # 核心包目录
122
+ │ ├── __init__.py # 核心代码
123
+ │ ├── docx_to_md.py # Word转Markdown工具类
124
+ │ ├── excel_to_md.py # Excel转Markdown工具类
125
+ │ ├── html_to_md.py # Html转Markdown工具类
126
+ │ ├── pptx_to_md.py # ppt转Markdown工具类
127
+ │ └── utils/
128
+ │ ├── __init__.py
129
+ │ ├── FileUtil.py
130
+ │ ├── IDUtil.py
131
+ │ └── img_to_base64.py
132
+ ```
133
+
134
+ ### 使用示例
135
+
136
+ ```python
137
+ #引用 注意引用为 indexdoc_converter 而不是 indexdoc-converter
138
+ from indexdoc_converter.docx_to_md import convert_docx_to_md
139
+ from indexdoc_converter.excel_to_md import TableToMarkdown
140
+ from indexdoc_converter.html_to_md import convert_to_md
141
+ from indexdoc_converter.pptx_to_md import pptx_to_md
142
+
143
+ # -------------------------------------------Word转Markdown---------------------------------------------------
144
+ md_text = convert_docx_to_md(r"C:\Users\xxx\测试文档.docx", False)
145
+ with open('./test.md', 'w', encoding='utf-8') as f:
146
+     f.write(md_text)
147
+
148
+ # -------------------------------------------Excel转Markdown-------------------------------------------------
149
+ # 自定义参数示例
150
+ converter = TableToMarkdown(
151
+ file_title_level=2, # 文件标题的Markdown层级,默认1(#),这里设为2(##)
152
+ single_row_value_as_title=True, # 是否将单行唯一值识别为标题,默认True
153
+ max_rows=8000, # 最大处理行数,默认6000(实际处理行数是max_rows+1)
154
+ max_cols=200 # 最大处理列数,默认128(实际处理列数是max_cols+1)
155
+ )
156
+
157
+ # 转换单个文件
158
+ file_path = r"C:\Users\xxx\测试文件.xlsx"
159
+ result = converter.convert(file_path)
160
+
161
+ # blank 模式:保留合并单元格的原始样式(只在合并单元格左上角显示内容,其余位置为空)
162
+ with open("../tmp/测试_blank.md", "w", encoding="utf-8") as f:
163
+     f.write(result['blank'])
164
+
165
+ # fill 模式:将合并单元格的内容填充到所有被合并的单元格中;同时还能自动识别表格中的标题行、分割多个表格块、处理空行/空列,并兼容各种表格格式的合并单元格解析。
166
+ with open("../tmp/测试_fill.md", "w", encoding="utf-8") as f:
167
+     f.write(result['fill'])
168
+
169
+ # -------------------------------------------ppt转Markdown---------------------------------------------------
170
+ ppt_file = r"C:\Users\xxx\测试文件.pptx"
171
+ md_path = pptx_to_md(ppt_file)
172
+ print(f"单文件转换完成,MD文件路径:{md_path}")
173
+
174
+ # -------------------------------------------网页文件转Markdown-----------------------------------------------
175
+ # html = "https://news.qq.com/rain/a/20260114A01NI000"
176
+ html = "https://www.aituple.com"
177
+ # html = "https://www.indexdoc.com"
178
+ # html = r"C:\Users\xxx\测试文件.html"
179
+ # html = "https://www.indexdoc.com/contact.html"
180
+ md = convert_to_md(html, '../tmp/测试html.md')
181
+ # md = mhtml_to_markdown(mhtml)
182
+ ```
183
+ ### Word文档
184
+ #### 原文档
185
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word1.png)
186
+ #### 转换后文档
187
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Word2.png)
188
+
189
+ ### Excel文档
190
+ #### 原文档
191
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel1.png)
192
+ #### 转换后文档
193
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/Excel2.png)
194
+
195
+ ### ppt文档
196
+ #### 原文档
197
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt1.png)
198
+ #### 转换后文档
199
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/ppt2.png)
200
+
201
+ ### 网页文件
202
+ #### 原文档
203
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html1.png)
204
+ #### 转换后文档
205
+ ![主页1](https://github.com/indexdoc/indexdoc-converter/raw/main/html2.png)
206
+
207
+ ## 二次开发
208
+
209
+ - Python 3.10+
210
+
211
+ ```bash
212
+ #源码地址
213
+ https://github.com/indexdoc/indexdoc-converter.git
214
+ ```
215
+ ```bash
216
+ #快速安装依赖库
217
+ pip install -r requirements.txt
218
+
219
+ # 阿里镜像源
220
+ pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
221
+ ```
222
+
223
+
224
+ ## 📞 作者
225
+
226
+ - 作者:杭州智予数信息技术有限公司
227
+
228
+ - 邮箱:indexdoc@qq.com
@@ -11,7 +11,7 @@ with open("requirements.txt", "r", encoding="utf-8") as f:
11
11
 
12
12
  setup(
13
13
  name="indexdoc_converter", # 你的工具名称(PyPI上唯一)
14
- version="0.2.2", # 版本号(遵循语义化版本)
14
+ version="0.2.3", # 版本号(遵循语义化版本)
15
15
  description="可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。",
16
16
  long_description=README,
17
17
  long_description_content_type="text/markdown",
@@ -1,92 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: indexdoc_converter
3
- Version: 0.2.2
4
- Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
- Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
- Author: 杭州智予数信息技术有限公司
7
- Author-email: indexdoc@qq.com
8
- Requires-Python: >=3.10
9
- Description-Content-Type: text/markdown
10
- Requires-Dist: annotated-types==0.7.0
11
- Requires-Dist: asgiref==3.11.0
12
- Requires-Dist: beautifulsoup4==4.14.3
13
- Requires-Dist: bottle==0.13.4
14
- Requires-Dist: captcha==0.7.1
15
- Requires-Dist: certifi==2026.1.4
16
- Requires-Dist: cffi==2.0.0
17
- Requires-Dist: chardet==5.2.0
18
- Requires-Dist: charset-normalizer==3.4.4
19
- Requires-Dist: clickhouse-driver==0.2.10
20
- Requires-Dist: clr_loader==0.2.10
21
- Requires-Dist: cobble==0.1.4
22
- Requires-Dist: colorama==0.4.6
23
- Requires-Dist: cryptography==46.0.3
24
- Requires-Dist: cssselect==1.4.0
25
- Requires-Dist: defusedxml==0.7.1
26
- Requires-Dist: Django==6.0.1
27
- Requires-Dist: django-ranged-response==0.2.0
28
- Requires-Dist: django-simple-captcha==0.6.3
29
- Requires-Dist: duckdb==1.4.3
30
- Requires-Dist: et_xmlfile==2.0.0
31
- Requires-Dist: filelock==3.20.3
32
- Requires-Dist: fonttools==4.61.1
33
- Requires-Dist: fpdf2==2.8.5
34
- Requires-Dist: fsspec==2026.1.0
35
- Requires-Dist: html2text==2025.4.15
36
- Requires-Dist: idna==3.11
37
- Requires-Dist: image==1.5.33
38
- Requires-Dist: Jinja2==3.1.6
39
- Requires-Dist: lxml==6.0.2
40
- Requires-Dist: lxml_html_clean==0.4.3
41
- Requires-Dist: mammoth==1.11.0
42
- Requires-Dist: markdownify==1.2.2
43
- Requires-Dist: MarkupSafe==3.0.3
44
- Requires-Dist: mpmath==1.3.0
45
- Requires-Dist: networkx==3.6.1
46
- Requires-Dist: numpy==2.4.1
47
- Requires-Dist: odfpy==1.4.1
48
- Requires-Dist: openpyxl==3.1.5
49
- Requires-Dist: pandas==3.0.0
50
- Requires-Dist: pdfkit==1.0.0
51
- Requires-Dist: pillow==12.1.0
52
- Requires-Dist: pptx2md==2.0.6
53
- Requires-Dist: proxy_tools==0.1.0
54
- Requires-Dist: psutil==7.2.1
55
- Requires-Dist: pycparser==2.23
56
- Requires-Dist: pydantic==2.12.5
57
- Requires-Dist: pydantic_core==2.41.5
58
- Requires-Dist: PyJWT==2.10.1
59
- Requires-Dist: pyperclip==1.11.0
60
- Requires-Dist: python-dateutil==2.9.0.post0
61
- Requires-Dist: python-docx==1.2.0
62
- Requires-Dist: python-pptx==1.0.2
63
- Requires-Dist: pythonnet==3.0.5
64
- Requires-Dist: pytz==2025.2
65
- Requires-Dist: pywebview==6.1
66
- Requires-Dist: pywin32==311
67
- Requires-Dist: RapidFuzz==3.14.3
68
- Requires-Dist: readability-lxml==0.8.4.1
69
- Requires-Dist: requests==2.32.5
70
- Requires-Dist: scipy==1.17.0
71
- Requires-Dist: setuptools==80.9.0
72
- Requires-Dist: six==1.17.0
73
- Requires-Dist: soupsieve==2.8.3
74
- Requires-Dist: sqlparse==0.5.5
75
- Requires-Dist: sympy==1.14.0
76
- Requires-Dist: torch==2.9.1
77
- Requires-Dist: tornado==6.5.4
78
- Requires-Dist: tqdm==4.67.1
79
- Requires-Dist: typing-inspection==0.4.2
80
- Requires-Dist: typing_extensions==4.15.0
81
- Requires-Dist: tzdata==2025.3
82
- Requires-Dist: tzlocal==5.3.1
83
- Requires-Dist: urllib3==2.6.3
84
- Requires-Dist: WMI==1.5.1
85
- Requires-Dist: xlsxwriter==3.2.9
86
- Dynamic: author
87
- Dynamic: author-email
88
- Dynamic: description-content-type
89
- Dynamic: home-page
90
- Dynamic: requires-dist
91
- Dynamic: requires-python
92
- Dynamic: summary
File without changes
@@ -1,92 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: indexdoc_converter
3
- Version: 0.2.2
4
- Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
- Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
- Author: 杭州智予数信息技术有限公司
7
- Author-email: indexdoc@qq.com
8
- Requires-Python: >=3.10
9
- Description-Content-Type: text/markdown
10
- Requires-Dist: annotated-types==0.7.0
11
- Requires-Dist: asgiref==3.11.0
12
- Requires-Dist: beautifulsoup4==4.14.3
13
- Requires-Dist: bottle==0.13.4
14
- Requires-Dist: captcha==0.7.1
15
- Requires-Dist: certifi==2026.1.4
16
- Requires-Dist: cffi==2.0.0
17
- Requires-Dist: chardet==5.2.0
18
- Requires-Dist: charset-normalizer==3.4.4
19
- Requires-Dist: clickhouse-driver==0.2.10
20
- Requires-Dist: clr_loader==0.2.10
21
- Requires-Dist: cobble==0.1.4
22
- Requires-Dist: colorama==0.4.6
23
- Requires-Dist: cryptography==46.0.3
24
- Requires-Dist: cssselect==1.4.0
25
- Requires-Dist: defusedxml==0.7.1
26
- Requires-Dist: Django==6.0.1
27
- Requires-Dist: django-ranged-response==0.2.0
28
- Requires-Dist: django-simple-captcha==0.6.3
29
- Requires-Dist: duckdb==1.4.3
30
- Requires-Dist: et_xmlfile==2.0.0
31
- Requires-Dist: filelock==3.20.3
32
- Requires-Dist: fonttools==4.61.1
33
- Requires-Dist: fpdf2==2.8.5
34
- Requires-Dist: fsspec==2026.1.0
35
- Requires-Dist: html2text==2025.4.15
36
- Requires-Dist: idna==3.11
37
- Requires-Dist: image==1.5.33
38
- Requires-Dist: Jinja2==3.1.6
39
- Requires-Dist: lxml==6.0.2
40
- Requires-Dist: lxml_html_clean==0.4.3
41
- Requires-Dist: mammoth==1.11.0
42
- Requires-Dist: markdownify==1.2.2
43
- Requires-Dist: MarkupSafe==3.0.3
44
- Requires-Dist: mpmath==1.3.0
45
- Requires-Dist: networkx==3.6.1
46
- Requires-Dist: numpy==2.4.1
47
- Requires-Dist: odfpy==1.4.1
48
- Requires-Dist: openpyxl==3.1.5
49
- Requires-Dist: pandas==3.0.0
50
- Requires-Dist: pdfkit==1.0.0
51
- Requires-Dist: pillow==12.1.0
52
- Requires-Dist: pptx2md==2.0.6
53
- Requires-Dist: proxy_tools==0.1.0
54
- Requires-Dist: psutil==7.2.1
55
- Requires-Dist: pycparser==2.23
56
- Requires-Dist: pydantic==2.12.5
57
- Requires-Dist: pydantic_core==2.41.5
58
- Requires-Dist: PyJWT==2.10.1
59
- Requires-Dist: pyperclip==1.11.0
60
- Requires-Dist: python-dateutil==2.9.0.post0
61
- Requires-Dist: python-docx==1.2.0
62
- Requires-Dist: python-pptx==1.0.2
63
- Requires-Dist: pythonnet==3.0.5
64
- Requires-Dist: pytz==2025.2
65
- Requires-Dist: pywebview==6.1
66
- Requires-Dist: pywin32==311
67
- Requires-Dist: RapidFuzz==3.14.3
68
- Requires-Dist: readability-lxml==0.8.4.1
69
- Requires-Dist: requests==2.32.5
70
- Requires-Dist: scipy==1.17.0
71
- Requires-Dist: setuptools==80.9.0
72
- Requires-Dist: six==1.17.0
73
- Requires-Dist: soupsieve==2.8.3
74
- Requires-Dist: sqlparse==0.5.5
75
- Requires-Dist: sympy==1.14.0
76
- Requires-Dist: torch==2.9.1
77
- Requires-Dist: tornado==6.5.4
78
- Requires-Dist: tqdm==4.67.1
79
- Requires-Dist: typing-inspection==0.4.2
80
- Requires-Dist: typing_extensions==4.15.0
81
- Requires-Dist: tzdata==2025.3
82
- Requires-Dist: tzlocal==5.3.1
83
- Requires-Dist: urllib3==2.6.3
84
- Requires-Dist: WMI==1.5.1
85
- Requires-Dist: xlsxwriter==3.2.9
86
- Dynamic: author
87
- Dynamic: author-email
88
- Dynamic: description-content-type
89
- Dynamic: home-page
90
- Dynamic: requires-dist
91
- Dynamic: requires-python
92
- Dynamic: summary