maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/utils/docker.py ADDED
@@ -0,0 +1,96 @@
+ import os
+ from glob import glob
+ try:
+     from maque import probar
+ except ImportError:
+     probar = lambda x: x  # fallback: no-op progress bar
+ from typing import List
+
+
+ def save_docker_images(filedir='.', skip_exists=True, use_stream=False):
+     import docker
+     from docker.models.images import Image
+     client = docker.from_env()
+     images_list: List[Image] = client.images.list()
+     exist_image_ids = []
+     for i in glob(os.path.join(filedir, "*")):
+         prefix, filename = os.path.split(i)
+         token_list = filename.split(',')
+         if len(token_list) > 1:
+             exist_image_ids.append(token_list[-1])
+     for image in probar(images_list):
+         image: Image
+         if image.tags:
+             if image.id.split(':')[-1] in exist_image_ids and skip_exists:
+                 print(f"\n image: {image.id} exists, skipping")
+             else:
+                 save_single_image(image, filedir, use_stream)
+
+
+ def save_single_image(image, filedir='.', use_stream=False):
+     image_id = image.id.split(':')[-1]
+     filename = f"{image.tags[0].replace('/', '#').replace(':', '@')},{image_id}"
+     filepath = os.path.join(filedir, filename)
+
+     if use_stream:
+         print(f"\n saving image [{image.tags[0]}] to:", filepath)
+         with open(filepath, 'wb') as f:
+             for chunk in image.save(named=True):
+                 f.write(chunk)
+     else:
+         filepath += ".gz"
+         print(f"\n saving image [{image.tags[0]}] to:", filepath)
+         image_name = image.tags[0]
+         os.system(f"docker save {image_name} | gzip > {filepath}")
+
+
+ def add_tag_to_files(filedir='.'):
+     from docker.models.images import Image
+     import docker
+     client = docker.from_env()
+     images_list: List[Image] = client.images.list()
+     image_ids_map = dict([[image.id.split(':')[1], str(idx)] for idx, image in enumerate(images_list)])
+     file_image_ids = [[i.split('/')[-1], i] for i in glob(os.path.join(filedir, "*"))]
+     for file_image_id, filename in file_image_ids:
+         str_image_idx = image_ids_map.get(file_image_id)
+         if str_image_idx:
+             image = images_list[int(str_image_idx)]
+             prefix, file_id_name = os.path.split(filename)
+             os.rename(filename,
+                       str(os.path.join(prefix, f"{image.tags[0].replace('/', '#').replace(':', '@')},{file_id_name}")))
+
+
+ def load_docker_images(filename_pattern="./*", skip_exists=True):
+     from docker.models.images import Image
+     import docker
+     client = docker.from_env()
+     images_list: List[Image] = client.images.list()
+     exist_image_ids = [image.id.split(":")[-1] for image in images_list]
+
+     for filename in probar(glob(filename_pattern)):
+         filename: str
+         file_name = os.path.split(filename)[-1]
+         file_image_id = file_name.split(',')[1]
+         if file_image_id in exist_image_ids and skip_exists:
+             print(f"\n image id: {file_image_id} exists, skipping")
+         else:
+             if filename.endswith('.gz'):
+                 os.system(f"gunzip -c {filename} | docker load")
+             else:
+                 os.system(f"docker load -i {filename}")
+
+
+ def find_by_pid(pid: int):
+     command = f"""\
+     dockerid=`cat /proc/{pid}/cgroup | grep -oPm1 .*/docker/.* | sed -e 's/.*docker\///'`
+     shortid=`echo $dockerid | cut -c1-12`
+     echo $shortid
+     docker ps | grep $shortid
+     """
+     os.system(command)
+
+
+ if __name__ == "__main__":
+     filedir = "/path/to/docker/backup"
+     save_docker_images(filedir, use_stream=False)
+     # load_docker_images(f"{filedir}/*")
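The save/load helpers above encode each image's tag and id into the backup file name: '/' becomes '#', ':' becomes '@', and the image id follows the last comma (with an optional '.gz' suffix). A minimal sketch of inverting that naming scheme, not part of the package (the sample file name is made up):

def parse_saved_image_name(filename: str):
    # hypothetical helper: recover (tag, image id) from a name produced by save_single_image
    name = filename[:-3] if filename.endswith(".gz") else filename
    tag_part, image_id = name.rsplit(",", 1)
    return tag_part.replace("#", "/").replace("@", ":"), image_id

print(parse_saved_image_name("nginx@1.25,abc123def456.gz"))  # -> ('nginx:1.25', 'abc123def456')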
maque/utils/downloads.py ADDED
@@ -0,0 +1,51 @@
+ import os
+
+
+ def download_model(repo_id, download_dir=None, backend="huggingface", token=None, repo_type="model", use_mirror=True):
+     """Download a model or dataset from the given backend; supports Hugging Face and ModelScope, including private repositories.
+
+     Args:
+         repo_id (str): Name of the model or dataset repository.
+         download_dir (str, optional): Local download directory. Defaults to None.
+         backend (str): Download source, either "huggingface" or "modelscope".
+         token (str, optional): Authentication token for private repositories. Defaults to None.
+         repo_type (str): Repository type, "model" or "dataset". Defaults to "model".
+         use_mirror (bool): Whether to download through a mirror. Defaults to True.
+     """
+     # If download_dir is not given, default to a folder named after repo_id under the current directory
+     if download_dir is None:
+         download_dir = os.path.join(os.getcwd(), repo_id)
+
+     if backend == "huggingface":
+         if use_mirror:
+             os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+             print(f"Downloading via mirror: {repo_id}")
+         from huggingface_hub import snapshot_download as hf_snapshot_download
+
+         print(f"Downloading {'model' if repo_type == 'model' else 'dataset'} from Hugging Face: {repo_id}")
+         if token:
+             from huggingface_hub import HfApi
+
+             api = HfApi(token=token)
+             local_dir = api.snapshot_download(repo_id=repo_id, local_dir=download_dir, token=token, repo_type=repo_type)
+         else:
+             local_dir = hf_snapshot_download(repo_id=repo_id, local_dir=download_dir, token=token, repo_type=repo_type)
+     elif backend == "modelscope":
+         from modelscope.hub.snapshot_download import snapshot_download as ms_snapshot_download
+
+         if token:
+             from modelscope import HubApi
+
+             api = HubApi()
+             api.login(access_token=token)
+         print(f"Downloading model from ModelScope: {repo_id}")
+         local_dir = ms_snapshot_download(model_id=repo_id, local_dir=download_dir)
+     else:
+         raise ValueError(f"Unsupported backend: {backend}")
+
+     print(f"Model files downloaded to: {local_dir}")
+     return local_dir
+
+
+ if __name__ == "__main__":
+     download_model("SWHL/ChineseOCRBench", repo_type="dataset")
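A usage sketch for download_model, not part of the package; the local directories and the ModelScope repo id below are placeholders:

from maque.utils.downloads import download_model

# dataset from Hugging Face through the mirror (mirroring is the default)
download_model("SWHL/ChineseOCRBench", download_dir="./ChineseOCRBench", repo_type="dataset")
# model from ModelScope (placeholder repo id)
download_model("someorg/some-model", backend="modelscope", download_dir="./some-model")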
maque/utils/excel_helper.py ADDED
@@ -0,0 +1,542 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """Read an Excel file, extract embedded images, and align them with table rows."""
+
+ import pandas as pd
+ import os
+ import hashlib
+ from openpyxl import load_workbook, Workbook
+ from openpyxl.drawing.image import Image
+ import glob
+
+
+ def extract_excel_with_images(
+     excel_path,
+     image_column_names,  # a string (single column) or a list (multiple columns)
+     output_column_name=None,  # merged output column name; if None, the first image column name is used
+     image_output_dir="extracted_images",
+     sheet_name=0,
+     use_hash_filename=False,
+     save_updated_excel=False,
+     image_output_dir_prefix=False,
+     use_global_image_folder=False,  # whether to use a single global image folder
+     dtype_columns=None,  # column dtypes as {'column name': dtype}, e.g. {'nid': str}
+     use_absolute_path=False,  # whether to store absolute image paths
+ ):
+     """
+     Read an Excel file containing images, extract the images, and align them with the table rows.
+
+     Parameters:
+     - excel_path: path to the Excel file
+     - image_column_names: column(s) holding the images; a string (single column) or a list (multiple columns)
+     - output_column_name: merged output column name; if None, the first image column name is used
+     - image_output_dir: directory where extracted images are saved
+     - sheet_name: worksheet to read (defaults to the first one)
+     - use_hash_filename: whether to use a content hash as the file name (default False)
+     - save_updated_excel: whether to save an updated Excel file (default False)
+     - image_output_dir_prefix: prefix stored paths with the image output directory; if False, only the file name is stored
+     - use_global_image_folder: whether to use a single global image folder (default False, one folder per Excel file)
+     - dtype_columns: column dtypes as {'column name': dtype}, e.g. {'nid': str} (default None)
+     - use_absolute_path: whether to store absolute image paths (default False, relative paths)
+
+     Returns:
+     - a new DataFrame with image paths
+     """
+     # Excel file name without path or extension
+     excel_basename = os.path.splitext(os.path.basename(excel_path))[0]
+
+     # Load the Excel file
+     wb = load_workbook(excel_path)
+     ws = wb[sheet_name] if isinstance(sheet_name, str) else wb.active
+
+     # If sheet_name is None or a number, use the active worksheet's title
+     actual_sheet_name = sheet_name if isinstance(sheet_name, str) else ws.title
+
+     # Choose the image output directory layout based on use_global_image_folder
+     if use_global_image_folder:
+         # single global image folder
+         specific_image_dir = image_output_dir
+     else:
+         # create an image output directory specific to this Excel file;
+         # with a single worksheet, skip the extra per-sheet subdirectory
+         if len(wb.worksheets) == 1:
+             specific_image_dir = os.path.join(image_output_dir, excel_basename)
+         else:
+             specific_image_dir = os.path.join(
+                 image_output_dir, excel_basename, actual_sheet_name
+             )
+     os.makedirs(specific_image_dir, exist_ok=True)
+
+     # Read the data directly with openpyxl
+     data = []
+     headers = []
+
+     # Header row
+     for cell in ws[1]:
+         headers.append(cell.value)
+
+     # Map column names to indices so special-cased columns are easy to look up
+     col_to_index = {col_name: idx for idx, col_name in enumerate(headers)}
+     force_string_columns = set()
+
+     # Identify columns that must be read as strings
+     if dtype_columns:
+         for col_name, col_dtype in dtype_columns.items():
+             if col_name in col_to_index and col_dtype == str:
+                 force_string_columns.add(col_to_index[col_name])
+                 print(
+                     f"Column '{col_name}' (index {col_to_index[col_name]}) will be read as a string"
+                 )
+
+     # Read all data rows
+     for row in ws.iter_rows(min_row=2):  # start at row 2 (skip the header)
+         row_data = []
+         for col_idx, cell in enumerate(row):
+             if col_idx in force_string_columns:
+                 # columns forced to string: convert directly (preserve the original formatting)
+                 row_data.append(str(cell.value) if cell.value is not None else "")
+             else:
+                 # other columns: keep the original value
+                 row_data.append(cell.value if cell.value is not None else None)
+         data.append(row_data)
+
+     # Build the DataFrame
+     df = pd.DataFrame(data, columns=headers)
+
+     # Apply the remaining (non-string) dtypes
+     if dtype_columns:
+         for col_name, col_dtype in dtype_columns.items():
+             if (
+                 col_name in df.columns and col_dtype != str
+             ):  # string columns were already handled while reading
+                 try:
+                     df[col_name] = df[col_name].astype(col_dtype)
+
+                     print(f"Column '{col_name}' set to {col_dtype}")
+                 except Exception as e:
+                     print(f"Failed to set column '{col_name}' to {col_dtype}: {e}")
+             elif col_name not in df.columns:
+                 print(f"Warning: column '{col_name}' does not exist, skipping dtype setting")
+     # Print DataFrame info
+     print("DataFrame info:")
+     print(f"- total rows: {len(df)}")
+     if dtype_columns:
+         print(f"- applied dtypes: {dtype_columns}")
+     # print(f"- columns: {list(df.columns)}")
+     # print(f"- dtypes:\n{df.dtypes}")
+
+     # Normalize the image column argument: accept a single string or a list
+     if isinstance(image_column_names, str):
+         image_columns_list = [image_column_names]
+     else:
+         image_columns_list = image_column_names
+
+     # Determine the output column name
+     if output_column_name is None:
+         output_column_name = image_columns_list[0]
+
+     # Make sure all requested columns exist
+     cols = list(df.columns)
+     image_columns = {}  # column_name: column_index
+
+     for col_name in image_columns_list:
+         if col_name not in df.columns:
+             raise ValueError(f"Column '{col_name}' does not exist in the Excel file")
+         image_columns[col_name] = cols.index(col_name) + 1
+
+     # With multiple image columns, prepare the merged output column
+     if len(image_columns_list) > 1 and output_column_name not in df.columns:
+         # add the new output column
+         df[output_column_name] = ""
+
+     # Extract images and their positions
+     image_map = {}  # row_num: [image_filenames_list]
+     print(f"Extracting images, total rows: {len(df)}")
+     print(f"Target image columns: {list(image_columns.keys())}")
+
+     for img in ws._images:
+         anchor = img.anchor._from  # top-left anchor
+         row = anchor.row + 1
+         col = anchor.col + 1
+
+         # check whether the image sits in one of the target columns
+         target_col_name = None
+         for col_name, col_index in image_columns.items():
+             if col == col_index:
+                 target_col_name = col_name
+                 break
+
+         if target_col_name is None:
+             continue  # ignore images outside the target columns
+
+         if use_hash_filename:
+             # use a hash of the image content as the file name
+             img_hash = hashlib.md5(img.ref.getvalue()).hexdigest()
+             filename = f"{img_hash}.png"
+         else:
+             # include the Excel file name to avoid collisions in the global folder
+             if use_global_image_folder:
+                 filename = f"{excel_basename}_r{row}_c{col}_{target_col_name}.png"
+             else:
+                 filename = f"image_r{row}_c{col}_{target_col_name}.png"
+
+         save_path = os.path.join(specific_image_dir, filename)
+         # save the image
+         with open(save_path, "wb") as img_file:
+             img_file.write(img.ref.getvalue())
+
+         # decide the stored path format based on use_absolute_path and image_output_dir_prefix
+         if use_absolute_path:
+             # absolute path
+             img_path = os.path.abspath(save_path)
+         elif image_output_dir_prefix:
+             # relative path including the output directory
+             img_path = os.path.join(specific_image_dir, filename)
+         else:
+             # file name only
+             img_path = filename
+
+         # attach the image path to its row
+         if row not in image_map:
+             image_map[row] = []
+         image_map[row].append(img_path)
+
+     total_images = sum(len(paths) for paths in image_map.values())
+     print(f"Images found: {total_images}, rows involved: {len(image_map)}")
+
+     # Write image paths into the output column; multiple paths are joined with blank lines
+     for row_num, img_paths in image_map.items():
+         df_index = row_num - 2  # minus 2: one header row, plus 1-based to 0-based
+         if 0 <= df_index < len(df):
+             # join multiple image paths
+             combined_paths = "\n\n".join(img_paths)
+             df.loc[df_index, output_column_name] = combined_paths
+
+     # With multiple image columns and a new output column, clear the original image columns (except the output column)
+     if len(image_columns_list) > 1:
+         for col_name in image_columns_list:
+             if col_name != output_column_name:
+                 df[col_name] = ""
+
+     # Write the updated Excel file
+     if save_updated_excel:
+         output_excel_path = os.path.join(
+             os.path.dirname(excel_path),
+             f"{excel_basename}_updated{os.path.splitext(excel_path)[1]}",
+         )
+         df.to_excel(
+             output_excel_path,
+             sheet_name=actual_sheet_name,
+             index=False,
+             engine="openpyxl",
+         )
+         print(f"Updated Excel file saved to: {output_excel_path}")
+
+     return df
+
+
+ def deduplicate_columns(df):
+     cols = pd.Series(df.columns)
+     for dup in cols[cols.duplicated()].unique():
+         cols[cols[cols == dup].index.values.tolist()] = [
+             dup + "." + str(i) if i != 0 else dup for i in range(sum(cols == dup))
+         ]
+     df.columns = cols
+     return df
+
+
+ def process_directory(
+     dir_path,
+     output_dir_suffix,
+     image_columns=["图片"],
+     output_column="图片",
+     use_global_folder=False,
+     merge_excel_output=False,
+     dtype_columns=None,
+ ):
+     """Process every Excel file in the given directory.
+
+     Parameters:
+     - dir_path: directory path
+     - output_dir_suffix: suffix for the output directory
+     - image_columns: list of image column names, defaults to ["图片"]
+     - output_column: output column name, defaults to "图片"
+     - use_global_folder: whether to use a single global image folder
+     - merge_excel_output: whether to merge all Excel outputs into one file
+     - dtype_columns: column dtypes as {'column name': dtype}, e.g. {'nid': str} (default None)
+     """
+     excel_files = glob.glob(os.path.join(dir_path, "*.xlsx"))
+     processed_dataframes = []
+
+     for excel_file in excel_files:
+         print(f"Processing: {excel_file}")
+
+         # skip files that have already been updated
+         if "_updated" in excel_file:
+             print(f"Skipping already updated file: {excel_file}")
+             continue
+
+         try:
+             # when merging Excel output, do not save individual Excel files
+             save_individual = not merge_excel_output
+
+             df = extract_excel_with_images(
+                 excel_path=excel_file,
+                 image_column_names=image_columns,
+                 output_column_name=output_column,
+                 image_output_dir=f"{output_dir_suffix}-images",
+                 use_hash_filename=True,
+                 save_updated_excel=save_individual,
+                 use_global_image_folder=use_global_folder,
+                 dtype_columns=dtype_columns,
+             )
+             df = deduplicate_columns(df)
+
+             if merge_excel_output:
+                 # add a source-file column to track where each row came from
+                 excel_basename = os.path.splitext(os.path.basename(excel_file))[0]
+                 df["source_file"] = excel_basename
+                 processed_dataframes.append(df)
+
+             print(f"Finished: {excel_file}")
+         except Exception as e:
+             print(f"Error while processing {excel_file}: {str(e)}")
+
+     # merge the Excel output if requested
+     if merge_excel_output and processed_dataframes:
+         print(f"\nMerging {len(processed_dataframes)} Excel files...")
+
+         # concatenate all DataFrames
+         merged_df = pd.concat(processed_dataframes, ignore_index=True)
+
+         # save the merged CSV file
+         merged_csv_path = os.path.join(dir_path, f"{output_dir_suffix}_merged.csv")
+         merged_df.to_csv(merged_csv_path, index=False, encoding="utf-8")
+
+         print(f"Merged CSV file saved to: {merged_csv_path}")
+         print(f"Total merged rows: {len(merged_df)}")
+         print(f"Source files: {list(merged_df['source_file'].unique())}")
+
+
+ def insert_images_to_excel(
+     excel_path,
+     image_column_name,
+     images_dir="images",
+     sheet_name=0,
+     image_row_start=2,
+     image_width=100,
+     image_height=100,
+     create_if_not_exists=True,
+     image_file_patterns=None,  # image file patterns, e.g. ["*.jpg", "*.png", "*.jpeg"]
+     sort_images=True,  # whether to sort the image files
+ ):
+     """
+     Insert local image files into the specified column of an Excel file.
+
+     Parameters:
+     - excel_path: path to the Excel file
+     - image_column_name: name of the column to insert images into
+     - images_dir: image folder path, defaults to "images"
+     - sheet_name: worksheet to use (defaults to the first one)
+     - image_row_start: first row to insert images into, defaults to 2 (skipping the header)
+     - image_width: image width in pixels, default 100
+     - image_height: image height in pixels, default 100
+     - create_if_not_exists: whether to create the Excel file if it does not exist, default True
+     - image_file_patterns: image file patterns, defaults to common image formats
+     - sort_images: whether to sort the image files, default True
+
+     Returns:
+     - the number of images inserted
+     """
+     # default image file patterns
+     if image_file_patterns is None:
+         image_file_patterns = ["*.jpg", "*.jpeg", "*.png", "*.gif", "*.bmp"]
+
+     # check that the image folder exists
+     if not os.path.exists(images_dir):
+         raise ValueError(f"Image folder does not exist: {images_dir}")
+
+     # collect all matching image files
+     image_files = []
+     for pattern in image_file_patterns:
+         image_files.extend(glob.glob(os.path.join(images_dir, pattern)))
+
+     if sort_images:
+         image_files.sort()
+
+     if not image_files:
+         print(f"No matching image files found in {images_dir}")
+         return 0
+
+     print(f"Found {len(image_files)} image files: {[os.path.basename(f) for f in image_files]}")
+
+     # open or create the Excel file
+     if os.path.exists(excel_path):
+         # load the existing Excel file
+         wb = load_workbook(excel_path)
+         print(f"Loaded existing Excel file: {excel_path}")
+     else:
+         if create_if_not_exists:
+             # create a new Excel file
+             wb = Workbook()
+             print(f"Created new Excel file: {excel_path}")
+         else:
+             raise ValueError(f"Excel file does not exist and creation is disabled: {excel_path}")
+
+     # get the worksheet
+     if isinstance(sheet_name, str):
+         if sheet_name in wb.sheetnames:
+             ws = wb[sheet_name]
+         else:
+             ws = wb.create_sheet(sheet_name)
+             print(f"Created new worksheet: {sheet_name}")
+     else:
+         ws = wb.active
+
+     # make sure there is a header row
+     if ws.max_row == 0 or ws.cell(1, 1).value is None:
+         # create a simple header
+         ws.cell(1, 1, "ID")
+         if image_column_name != "ID":
+             ws.cell(1, 2, image_column_name)
+             target_col = 2
+         else:
+             target_col = 1
+     else:
+         # find the target column or create a new one
+         target_col = None
+         for col in range(1, ws.max_column + 1):
+             if ws.cell(1, col).value == image_column_name:
+                 target_col = col
+                 break
+
+         if target_col is None:
+             # append a new column at the end
+             target_col = ws.max_column + 1
+             ws.cell(1, target_col, image_column_name)
+
+     print(f"Inserting images into column {target_col} '{image_column_name}'")
+
+     # insert the images
+     inserted_count = 0
+     current_row = image_row_start
+
+     for image_file in image_files:
+         try:
+             # create the image object
+             img = Image(image_file)
+
+             # resize the image
+             img.width = image_width
+             img.height = image_height
+
+             # get the cell coordinate
+             cell_address = ws.cell(current_row, target_col).coordinate
+
+             # anchor the image to the cell
+             img.anchor = cell_address
+             ws.add_image(img)
+
+             # adjust the row height to fit the image
+             row_height_points = image_height * 0.75  # approximate pixels-to-points conversion
+             if ws.row_dimensions[current_row].height is None or ws.row_dimensions[current_row].height < row_height_points:
+                 ws.row_dimensions[current_row].height = row_height_points
+
+             # adjust the column width to fit the image
+             column_letter = ws.cell(current_row, target_col).column_letter
+             column_width_chars = image_width / 7  # approximate pixels-to-character-width conversion
+             if ws.column_dimensions[column_letter].width is None or ws.column_dimensions[column_letter].width < column_width_chars:
+                 ws.column_dimensions[column_letter].width = column_width_chars
+
+             # optionally write the image file name in the same row
+             if target_col > 1:  # if not the first column, put the file name in column 1
+                 ws.cell(current_row, 1, os.path.splitext(os.path.basename(image_file))[0])
+
+             print(f"Inserted image {os.path.basename(image_file)} into row {current_row}")
+             inserted_count += 1
+             current_row += 1
+
+         except Exception as e:
+             print(f"Error inserting image {image_file}: {str(e)}")
+
+     # save the Excel file
+     try:
+         wb.save(excel_path)
+         print(f"Excel file saved: {excel_path}")
+         print(f"Successfully inserted {inserted_count} images")
+     except Exception as e:
+         print(f"Error saving Excel file: {str(e)}")
+         return 0
+
+     return inserted_count
+
+
+ def create_image_excel_from_folder(
+     images_dir="images",
+     output_excel_path="images_output.xlsx",
+     image_column_name="图片",
+     image_width=150,
+     image_height=150,
+ ):
+     """
+     Convenience function that builds an Excel file containing every image in a folder.
+
+     Parameters:
+     - images_dir: image folder path, defaults to "images"
+     - output_excel_path: output Excel file path, defaults to "images_output.xlsx"
+     - image_column_name: image column name, defaults to "图片"
+     - image_width: image width in pixels, default 150
+     - image_height: image height in pixels, default 150
+
+     Returns:
+     - the number of images inserted
+
+     Example:
+     >>> count = create_image_excel_from_folder("images", "test_output.xlsx", "图片", 120, 120)
+     >>> print(f"Inserted {count} images")
+     """
+     return insert_images_to_excel(
+         excel_path=output_excel_path,
+         image_column_name=image_column_name,
+         images_dir=images_dir,
+         image_width=image_width,
+         image_height=image_height,
+         create_if_not_exists=True,
+     )
+
+
+ if __name__ == "__main__":
+     # example configuration
+     directories = [
+         # (directory, output prefix, image column list, output column name, use global folder, merge Excel output)
+         (
+             "origin",
+             "target",
+             ["黑图索引1", "黑图索引2", "黑图索引3"],
+             "image_urls",
+             True,
+             True,
+         ),
+         # more examples:
+         # ("origin", "target", ["图片"], "图片", False, False),  # single column, per-Excel folder, no merging
+         # ("origin", "target", ["图片1", "图片2"], "合并图片", True, True),  # multiple columns, global folder, merged Excel
+     ]
+
+     for (
+         dir_name,
+         output_prefix,
+         img_cols,
+         out_col,
+         global_folder,
+         merge_excel,
+     ) in directories:
+         print(f"\nProcessing directory: {dir_name}")
+         print(
+             f"Image columns: {img_cols}, output column: {out_col}, global folder: {global_folder}, merge Excel: {merge_excel}"
+         )
+         process_directory(
+             dir_name, output_prefix, img_cols, out_col, global_folder, merge_excel
+         )
+         print(f"Done: {dir_name}")
+
+     print("\nAll files processed!")