indexdoc-converter 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18) hide show
  1. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/PKG-INFO +1 -1
  2. indexdoc_converter-0.2.1/indexdoc_converter/utils/FileUtil.py +45 -0
  3. indexdoc_converter-0.2.1/indexdoc_converter/utils/IDUtil.py +40 -0
  4. indexdoc_converter-0.2.1/indexdoc_converter/utils/__init__.py +0 -0
  5. indexdoc_converter-0.2.1/indexdoc_converter/utils/img_to_base64.py +218 -0
  6. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter.egg-info/PKG-INFO +1 -1
  7. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter.egg-info/SOURCES.txt +5 -1
  8. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/setup.py +1 -1
  9. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/README.md +0 -0
  10. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter/__init__.py +0 -0
  11. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter/docx_to_md.py +0 -0
  12. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter/excel_to_md.py +0 -0
  13. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter/html_to_md.py +0 -0
  14. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter/pptx_to_md.py +0 -0
  15. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter.egg-info/dependency_links.txt +0 -0
  16. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter.egg-info/requires.txt +0 -0
  17. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/indexdoc_converter.egg-info/top_level.txt +0 -0
  18. {indexdoc_converter-0.2.0 → indexdoc_converter-0.2.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: indexdoc_converter
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
5
  Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
6
  Author: 杭州智予数信息技术有限公司
@@ -0,0 +1,45 @@
1
+ import os
2
def get_filepath_shortname_suffix(file_url):
    """
    Split a path into its directory, base name without extension, and extension.

    :param file_url: path string to split
    :return: tuple ``(directory, stem, suffix)``; ``suffix`` keeps its leading
        dot and is an empty string when the name has no extension
    """
    directory = os.path.dirname(file_url)
    stem, suffix = os.path.splitext(os.path.basename(file_url))
    return directory, stem, suffix
11
+
12
def get_file_suffix(file_url):
    """Return the lower-cased extension of *file_url* (leading dot included, '' if none)."""
    _, suffix = os.path.splitext(file_url)
    return suffix.lower()
14
+
15
def get_file_name_without_suffix(file_url):
    """
    Return *file_url* with its extension removed.

    The split is applied to the whole path, so any directory part is kept
    in the returned string.
    """
    root, _ = os.path.splitext(file_url)
    return root
17
+
18
+
19
def detect_encoding(file_path: str) -> str:
    """
    Guess a text file's encoding by trial decoding.

    Each candidate encoding is used to read the first 1024 characters of the
    file; the first candidate that decodes without error is returned. Only
    that leading sample is checked, so bytes further into the file are not
    validated.

    :param file_path: path of the file to probe
    :return: name of the first candidate encoding that decodes the sample;
        'utf-8' if none does (in practice 'latin1' accepts any byte sequence,
        so that fallback is a safety net only)
    :raises OSError: if the file cannot be opened
    """
    encodings_to_try = [
        'utf-8',      # must be included
        'gb18030',    # broadest Chinese coverage
        'utf-8-sig',  # must be included
        'big5',       # Traditional Chinese, rare
        'utf-16',     # rare
        'cp1252',     # Windows Western European
        'latin1',     # most permissive fallback
    ]

    # A UTF-8 BOM decodes cleanly under plain 'utf-8' (as U+FEFF), so with the
    # list order above 'utf-8-sig' could never win. Sniff the BOM and, when it
    # is present, probe 'utf-8-sig' first so callers get BOM-stripping reads.
    with open(file_path, 'rb') as raw:
        has_utf8_bom = raw.read(3) == b'\xef\xbb\xbf'
    if has_utf8_bom:
        encodings_to_try.remove('utf-8-sig')
        encodings_to_try.insert(0, 'utf-8-sig')

    for encoding in encodings_to_try:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                # Read only a small sample to keep the probe cheap.
                f.read(1024)
        except UnicodeError:
            # UnicodeError (not just UnicodeDecodeError): the 'utf-16' codec
            # raises a plain UnicodeError when the stream has no BOM, which
            # would otherwise abort the probe before cp1252/latin1 are tried.
            continue
        return encoding
    # Default to utf-8 when no candidate decoded the sample.
    return 'utf-8'
@@ -0,0 +1,40 @@
1
+ import uuid
2
+ from datetime import datetime
3
+
4
+
5
def get_long():
    """Return a new integer ID; thin alias for get_long_id_by_time()."""
    new_id = get_long_id_by_time()
    return new_id
7
+
8
+
9
def get_max_long():
    """Return the constant 9999999999999999 (sixteen nines)."""
    return 10 ** 16 - 1
11
+
12
+
13
def get_uuid():
    """Return a freshly generated version-1 UUID as a 32-character hex string."""
    generated = uuid.uuid1()
    return generated.hex
15
+
16
+
17
+ # 适合数据量不大的非集群应用,每秒插入不超过10000个
18
+ # 如果需要作集群,需要将此服务独立出来单独作为一个ID生成器进程。也可以定义一个标志位作为集群ID。
19
+ from datetime import datetime
20
+
21
# Most recently issued ID; get_long_id_by_time() never returns a value <= this.
last_time_id = 21010100000000


def get_long_id_by_time():
    """
    Return a strictly increasing integer ID derived from the current local time.

    The ID is the timestamp formatted as yymmddHHMMSS followed by a four-digit
    slot holding the current microsecond divided by 100. If that value is not
    greater than the previously issued ID, the previous ID plus one is issued
    instead. The last issued ID is kept in the module-level ``last_time_id``.
    """
    global last_time_id
    now = datetime.now()
    time_part = int(now.strftime('%y%m%d%H%M%S')) * 10000
    candidate = time_part + now.microsecond // 100
    # Never go backwards or repeat: fall back to "previous + 1" on collision.
    last_time_id = max(candidate, last_time_id + 1)
    return last_time_id
33
+
34
+ # 雪花算法,适合大数据量分布式应用
35
+ # import snowflake.client
36
+ # def get_long_id_by_snowflake():
37
+ # snowflake.client.Generator
38
+ # pass
39
+
40
+ # print(type(get_long_id_by_time()),get_long_id_by_time())
@@ -0,0 +1,218 @@
1
+ import base64
2
+ import io
3
+ from pathlib import Path
4
+ from typing import Union, Optional
5
+ from PIL import Image
6
+ import olefile #部分图像格式用到,必须保留
7
+
8
+
9
class OutputFormat:
    """Names of the output formats accepted as ``out_format`` (plain string constants)."""

    # Markdown image with the file stem as alt text: ![name](data:...)
    MARKDOWN_ALT = "MARKDOWN_ALT"
    # Markdown image with empty alt text: ![](data:...)
    MARKDOWN_NO_ALT = "MARKDOWN_NO_ALT"
    # Bare Base64 string
    RAW_BASE64 = "RAW_BASE64"
    # Data URI: data:image/xxx;base64,...
    MIME_BASE64 = "MIME_BASE64"
15
+
16
+
17
class Image2Base64:
    """
    Convert image files (or Pillow images) to Base64 text.

    Bitmap inputs are flattened onto a white background where required,
    scaled down so the longer side does not exceed ``max_dim``, and encoded
    as WebP (with a coarse quality search against ``max_kb``) or as PNG.
    SVG files are embedded unchanged.
    """
    # File extensions accepted by convert_file() and batch_convert().
    _SUPPORTED_EXTS = {
        '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp',
        '.bmp', '.tiff', '.tif', '.ico', '.psd', '.heic', '.heif'
    }
    # The two tables below are not referenced by the methods of this class.
    _NEED_CONVERSION = {'.bmp', '.tiff', '.tif', '.ico', '.psd', '.heic', '.heif'}
    _MIME_MAP = {
        '.png': 'image/png', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
        '.gif': 'image/gif', '.svg': 'image/svg+xml', '.webp': 'image/webp',
    }

    @classmethod
    def convert_file(
            cls,
            image_path: Union[str, Path],
            max_dim: int = 1200,  # upper bound for the longer side, in pixels
            max_kb: int = 200,  # target maximum size (KB)
            force_webp: bool = True,  # encode as WebP (recommended)
            out_format: str = OutputFormat.MARKDOWN_ALT,
            quality: int = 80  # initial compression quality
    ) -> str:
        """
        Convert one image file to Base64 text in the requested output format.

        :param image_path: path of the image file
        :param max_dim: upper bound for the longer side; ignored for SVG
        :param max_kb: size target forwarded to convert_image(); ignored for SVG
        :param force_webp: encode bitmaps as WebP when true, PNG otherwise
        :param out_format: one of the OutputFormat constants; any unrecognised
            value is treated as MARKDOWN_ALT
        :param quality: quality ceiling forwarded to convert_image()
        :return: the Base64 string, data URI, or Markdown image tag
        :raises FileNotFoundError: if ``image_path`` does not exist
        :raises ValueError: if the file extension is not supported
        :raises ImportError: if a HEIC/HEIF file is given and pillow-heif
            is not installed
        """
        # 0. Pre-flight checks
        path = Path(image_path)
        if not path.exists():
            raise FileNotFoundError(f"Missing file: {path}")

        ext = path.suffix.lower()
        if ext not in cls._SUPPORTED_EXTS:
            supported_list = ", ".join(sorted(cls._SUPPORTED_EXTS))
            raise ValueError(f"Unsupported extension '{ext}'. Supported: {supported_list}")

        if ext == '.svg':
            # 1. Vector input: embed the bytes as-is, bypassing Pillow.
            with open(path, 'rb') as f:
                final_data = f.read()
            mime_type = "image/svg+xml"
        else:
            # 2. Bitmap input
            if ext in ('.heic', '.heif'):
                # HEIC/HEIF needs the optional pillow-heif opener registered.
                try:
                    from pillow_heif import register_heif_opener
                except ImportError as exc:
                    # Chain the cause so the original import failure stays visible.
                    raise ImportError("Please install 'pillow-heif' to support HEIC files.") from exc
                register_heif_opener()

            with Image.open(path) as img:
                final_data, mime_type = cls.convert_image(img, max_dim, max_kb, force_webp, quality)

        # 3. Format the output
        b64_str = base64.b64encode(final_data).decode('utf-8')
        if out_format == OutputFormat.RAW_BASE64:
            return b64_str

        data_uri = f"data:{mime_type};base64,{b64_str}"
        if out_format == OutputFormat.MIME_BASE64:
            return data_uri
        if out_format == OutputFormat.MARKDOWN_NO_ALT:
            return f"![]({data_uri})"
        # Default: MARKDOWN_ALT, using the file stem as alt text.
        return f"![{path.stem}]({data_uri})"

    @classmethod
    def convert_image(
            cls,
            img: Image.Image,
            max_dim: int = 1200,
            max_kb: int = 200,
            force_webp: bool = True,
            quality: int = 80
    ) -> tuple[bytes, str]:
        """
        Core pipeline: encode a Pillow image and return ``(bytes, mime_type)``.

        The caller's image object is never modified.

        :param img: source Pillow image
        :param max_dim: upper bound for the longer side, in pixels
        :param max_kb: size target in KB for the Base64 text; used only on
            the WebP path (the PNG path saves once and ignores it)
        :param force_webp: encode as WebP when true, PNG otherwise
        :param quality: highest WebP quality the search may use
        :return: encoded image bytes and the matching MIME type
        """
        source = img

        # 1. Pick the target format.
        if force_webp:
            target_format, mime_type = "WEBP", "image/webp"
        else:
            # PNG is the compatibility default.
            target_format, mime_type = "PNG", "image/png"

        # 2. Transparency / mode handling: alpha-capable modes are composited
        #    onto a white background, everything else is converted to RGB.
        if target_format in ("JPEG", "WEBP") or img.mode not in ("RGB", "RGBA", "L"):
            if img.mode in ("RGBA", "LA", "P"):
                background = Image.new('RGB', img.size, (255, 255, 255))
                temp_rgba = img.convert('RGBA')
                background.paste(temp_rgba, mask=temp_rgba.split()[-1])
                img = background
            else:
                img = img.convert('RGB')

        # 3. Proportional downscale.
        if max(img.width, img.height) > max_dim:
            # thumbnail() resizes in place. When no conversion happened above,
            # `img` is still the caller's object, so work on a copy.
            if img is source:
                img = img.copy()
            img.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)

        # 4. Encode.
        if target_format == "PNG":
            buf = io.BytesIO()
            img.save(buf, format="PNG", optimize=True)
            return buf.getvalue(), mime_type

        # Raw-byte budget is 75% of max_kb; presumably this leaves room for
        # Base64's 4/3 size expansion so the encoded text stays within max_kb.
        raw_target_size = (max_kb * 1024) * 0.75
        low, high = 30, quality
        best_data = None
        data = b""
        # Coarse three-step search for the highest quality that fits the budget.
        for _ in range(3):
            # Clamp so a probe never exceeds the caller's quality ceiling
            # (the midpoint can overshoot once `low` has moved past `high`).
            mid = min((low + high) // 2, quality)
            buf = io.BytesIO()
            img.save(buf, format=target_format, quality=mid, optimize=True)
            data = buf.getvalue()
            if len(data) <= raw_target_size:
                best_data, low = data, mid + 5
            else:
                high = mid - 5
        # If no probe fit the budget, fall back to the last attempt.
        return (best_data if best_data else data), mime_type

    @classmethod
    def batch_convert(
            cls,
            input_dir: Union[str, Path],
            output_md: Optional[Union[str, Path]] = None,
            max_dim: int = 1200,
            max_kb: int = 200,
            force_webp: bool = True,
            out_format: str = OutputFormat.MARKDOWN_ALT,
            quality: int = 80
    ) -> dict:
        """
        Convert every supported image file directly inside a directory.

        Files that fail to convert are reported on stdout and skipped.

        :param input_dir: directory to scan (not recursive)
        :param output_md: optional path of a Markdown report to write
        :param max_dim: forwarded to convert_file()
        :param max_kb: forwarded to convert_file()
        :param force_webp: forwarded to convert_file()
        :param out_format: forwarded to convert_file()
        :param quality: forwarded to convert_file()
        :return: dict mapping file name to its converted text
        :raises ValueError: if ``input_dir`` is not a directory
        """
        input_path = Path(input_dir)
        if not input_path.is_dir():
            raise ValueError(f"'{input_dir}' is not a valid directory.")

        results = {}
        # Supported files in sorted order; sub-directories whose names happen
        # to end in an image suffix are ignored.
        files = sorted(
            f for f in input_path.iterdir()
            if f.is_file() and f.suffix.lower() in cls._SUPPORTED_EXTS
        )

        if not files:
            print(f"No supported images found in {input_dir}")
            return results

        print(f"Starting batch conversion of {len(files)} images...")

        for file_path in files:
            try:
                res = cls.convert_file(
                    image_path=file_path,
                    max_dim=max_dim,
                    max_kb=max_kb,
                    force_webp=force_webp,
                    out_format=out_format,
                    quality=quality
                )
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                print(f"✗ Failed: {file_path.name} | Error: {e}")
            else:
                results[file_path.name] = res
                print(f"✓ Converted: {file_path.name}")

        # Write the Markdown report when an output path was given.
        if output_md:
            cls._save_to_markdown(results, output_md)

        return results

    @staticmethod
    def _save_to_markdown(results: dict, output_path: Union[str, Path]):
        """Internal helper: write the conversion results to a Markdown file."""
        output_path = Path(output_path)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# Image Conversion Report\n\n")
            f.write(f"Total processed: {len(results)}\n\n---\n\n")
            for name, content in results.items():
                f.write(f"### {name}\n\n")
                # Wrap anything that is not already a Markdown image tag
                # in a fenced text block.
                if not content.startswith('!['):
                    f.write(f"```text\n{content}\n```\n\n")
                else:
                    f.write(f"{content}\n\n")
        print(f"\n★ All results saved to: {output_path.absolute()}")
210
+
211
if __name__ == '__main__':
    import sys

    # Manual smoke run. The directory comes from the first command-line
    # argument; without one, the original developer sample path is used.
    Image2Base64.batch_convert(
        input_dir=sys.argv[1] if len(sys.argv) > 1 else r"D:\测试目录_全面\img",
        output_md="gallery.md",
        max_kb=150,  # keep each image within 150 KB
        force_webp=True,
        out_format=OutputFormat.MARKDOWN_ALT
    )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: indexdoc_converter
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: 可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。
5
5
  Home-page: https://github.com/indexdoc/indexdoc-converter.git
6
6
  Author: 杭州智予数信息技术有限公司
@@ -9,4 +9,8 @@ indexdoc_converter.egg-info/PKG-INFO
9
9
  indexdoc_converter.egg-info/SOURCES.txt
10
10
  indexdoc_converter.egg-info/dependency_links.txt
11
11
  indexdoc_converter.egg-info/requires.txt
12
- indexdoc_converter.egg-info/top_level.txt
12
+ indexdoc_converter.egg-info/top_level.txt
13
+ indexdoc_converter/utils/FileUtil.py
14
+ indexdoc_converter/utils/IDUtil.py
15
+ indexdoc_converter/utils/__init__.py
16
+ indexdoc_converter/utils/img_to_base64.py
@@ -11,7 +11,7 @@ with open("requirements.txt", "r", encoding="utf-8") as f:
11
11
 
12
12
  setup(
13
13
  name="indexdoc_converter", # 你的工具名称(PyPI上唯一)
14
- version="0.2.0", # 版本号(遵循语义化版本)
14
+ version="0.2.1", # 版本号(遵循语义化版本)
15
15
  description="可以将Word文档(仅.docx)、Excel表格、Html网页、PPt文件 转化为Markdown文件。",
16
16
  long_description=README,
17
17
  long_description_content_type="text/markdown",