maque 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. maque/__init__.py +30 -0
  2. maque/__main__.py +926 -0
  3. maque/ai_platform/__init__.py +0 -0
  4. maque/ai_platform/crawl.py +45 -0
  5. maque/ai_platform/metrics.py +258 -0
  6. maque/ai_platform/nlp_preprocess.py +67 -0
  7. maque/ai_platform/webpage_screen_shot.py +195 -0
  8. maque/algorithms/__init__.py +78 -0
  9. maque/algorithms/bezier.py +15 -0
  10. maque/algorithms/bktree.py +117 -0
  11. maque/algorithms/core.py +104 -0
  12. maque/algorithms/hilbert.py +16 -0
  13. maque/algorithms/rate_function.py +92 -0
  14. maque/algorithms/transform.py +27 -0
  15. maque/algorithms/trie.py +272 -0
  16. maque/algorithms/utils.py +63 -0
  17. maque/algorithms/video.py +587 -0
  18. maque/api/__init__.py +1 -0
  19. maque/api/common.py +110 -0
  20. maque/api/fetch.py +26 -0
  21. maque/api/static/icon.png +0 -0
  22. maque/api/static/redoc.standalone.js +1782 -0
  23. maque/api/static/swagger-ui-bundle.js +3 -0
  24. maque/api/static/swagger-ui.css +3 -0
  25. maque/cli/__init__.py +1 -0
  26. maque/cli/clean_invisible_chars.py +324 -0
  27. maque/cli/core.py +34 -0
  28. maque/cli/groups/__init__.py +26 -0
  29. maque/cli/groups/config.py +205 -0
  30. maque/cli/groups/data.py +615 -0
  31. maque/cli/groups/doctor.py +259 -0
  32. maque/cli/groups/embedding.py +222 -0
  33. maque/cli/groups/git.py +29 -0
  34. maque/cli/groups/help.py +410 -0
  35. maque/cli/groups/llm.py +223 -0
  36. maque/cli/groups/mcp.py +241 -0
  37. maque/cli/groups/mllm.py +1795 -0
  38. maque/cli/groups/mllm_simple.py +60 -0
  39. maque/cli/groups/quant.py +210 -0
  40. maque/cli/groups/service.py +490 -0
  41. maque/cli/groups/system.py +570 -0
  42. maque/cli/mllm_run.py +1451 -0
  43. maque/cli/script.py +52 -0
  44. maque/cli/tree.py +49 -0
  45. maque/clustering/__init__.py +52 -0
  46. maque/clustering/analyzer.py +347 -0
  47. maque/clustering/clusterers.py +464 -0
  48. maque/clustering/sampler.py +134 -0
  49. maque/clustering/visualizer.py +205 -0
  50. maque/constant.py +13 -0
  51. maque/core.py +133 -0
  52. maque/cv/__init__.py +1 -0
  53. maque/cv/image.py +219 -0
  54. maque/cv/utils.py +68 -0
  55. maque/cv/video/__init__.py +3 -0
  56. maque/cv/video/keyframe_extractor.py +368 -0
  57. maque/embedding/__init__.py +43 -0
  58. maque/embedding/base.py +56 -0
  59. maque/embedding/multimodal.py +308 -0
  60. maque/embedding/server.py +523 -0
  61. maque/embedding/text.py +311 -0
  62. maque/git/__init__.py +24 -0
  63. maque/git/pure_git.py +912 -0
  64. maque/io/__init__.py +29 -0
  65. maque/io/core.py +38 -0
  66. maque/io/ops.py +194 -0
  67. maque/llm/__init__.py +111 -0
  68. maque/llm/backend.py +416 -0
  69. maque/llm/base.py +411 -0
  70. maque/llm/server.py +366 -0
  71. maque/mcp_server.py +1096 -0
  72. maque/mllm_data_processor_pipeline/__init__.py +17 -0
  73. maque/mllm_data_processor_pipeline/core.py +341 -0
  74. maque/mllm_data_processor_pipeline/example.py +291 -0
  75. maque/mllm_data_processor_pipeline/steps/__init__.py +56 -0
  76. maque/mllm_data_processor_pipeline/steps/data_alignment.py +267 -0
  77. maque/mllm_data_processor_pipeline/steps/data_loader.py +172 -0
  78. maque/mllm_data_processor_pipeline/steps/data_validation.py +304 -0
  79. maque/mllm_data_processor_pipeline/steps/format_conversion.py +411 -0
  80. maque/mllm_data_processor_pipeline/steps/mllm_annotation.py +331 -0
  81. maque/mllm_data_processor_pipeline/steps/mllm_refinement.py +446 -0
  82. maque/mllm_data_processor_pipeline/steps/result_validation.py +501 -0
  83. maque/mllm_data_processor_pipeline/web_app.py +317 -0
  84. maque/nlp/__init__.py +14 -0
  85. maque/nlp/ngram.py +9 -0
  86. maque/nlp/parser.py +63 -0
  87. maque/nlp/risk_matcher.py +543 -0
  88. maque/nlp/sentence_splitter.py +202 -0
  89. maque/nlp/simple_tradition_cvt.py +31 -0
  90. maque/performance/__init__.py +21 -0
  91. maque/performance/_measure_time.py +70 -0
  92. maque/performance/_profiler.py +367 -0
  93. maque/performance/_stat_memory.py +51 -0
  94. maque/pipelines/__init__.py +15 -0
  95. maque/pipelines/clustering.py +252 -0
  96. maque/quantization/__init__.py +42 -0
  97. maque/quantization/auto_round.py +120 -0
  98. maque/quantization/base.py +145 -0
  99. maque/quantization/bitsandbytes.py +127 -0
  100. maque/quantization/llm_compressor.py +102 -0
  101. maque/retriever/__init__.py +35 -0
  102. maque/retriever/chroma.py +654 -0
  103. maque/retriever/document.py +140 -0
  104. maque/retriever/milvus.py +1140 -0
  105. maque/table_ops/__init__.py +1 -0
  106. maque/table_ops/core.py +133 -0
  107. maque/table_viewer/__init__.py +4 -0
  108. maque/table_viewer/download_assets.py +57 -0
  109. maque/table_viewer/server.py +698 -0
  110. maque/table_viewer/static/element-plus-icons.js +5791 -0
  111. maque/table_viewer/static/element-plus.css +1 -0
  112. maque/table_viewer/static/element-plus.js +65236 -0
  113. maque/table_viewer/static/main.css +268 -0
  114. maque/table_viewer/static/main.js +669 -0
  115. maque/table_viewer/static/vue.global.js +18227 -0
  116. maque/table_viewer/templates/index.html +401 -0
  117. maque/utils/__init__.py +56 -0
  118. maque/utils/color.py +68 -0
  119. maque/utils/color_string.py +45 -0
  120. maque/utils/compress.py +66 -0
  121. maque/utils/constant.py +183 -0
  122. maque/utils/core.py +261 -0
  123. maque/utils/cursor.py +143 -0
  124. maque/utils/distance.py +58 -0
  125. maque/utils/docker.py +96 -0
  126. maque/utils/downloads.py +51 -0
  127. maque/utils/excel_helper.py +542 -0
  128. maque/utils/helper_metrics.py +121 -0
  129. maque/utils/helper_parser.py +168 -0
  130. maque/utils/net.py +64 -0
  131. maque/utils/nvidia_stat.py +140 -0
  132. maque/utils/ops.py +53 -0
  133. maque/utils/packages.py +31 -0
  134. maque/utils/path.py +57 -0
  135. maque/utils/tar.py +260 -0
  136. maque/utils/untar.py +129 -0
  137. maque/web/__init__.py +0 -0
  138. maque/web/image_downloader.py +1410 -0
  139. maque-0.2.1.dist-info/METADATA +450 -0
  140. maque-0.2.1.dist-info/RECORD +143 -0
  141. maque-0.2.1.dist-info/WHEEL +4 -0
  142. maque-0.2.1.dist-info/entry_points.txt +3 -0
  143. maque-0.2.1.dist-info/licenses/LICENSE +21 -0
maque/web/image_downloader.py
@@ -0,0 +1,1410 @@
+ # Image downloader implemented with the icrawler library
+ # pip install icrawler Pillow
+
+ import os
+ import sys
+ import hashlib
+ import glob
+ import shutil
+ import io
+ import logging
+ import re
+ from collections import defaultdict
+
+
+ # Automatically install missing dependencies
+ def _install_missing_packages():
+     """Automatically install missing dependency packages."""
+     missing_packages = []
+
+     try:
+         import icrawler
+     except ImportError:
+         missing_packages.append("icrawler")
+
+     try:
+         from PIL import Image
+     except ImportError:
+         missing_packages.append("Pillow")
+
+     try:
+         import requests
+     except ImportError:
+         missing_packages.append("requests")
+
+     try:
+         from bs4 import BeautifulSoup
+     except ImportError:
+         missing_packages.append("beautifulsoup4")
+
+     if missing_packages:
+         print(f"Missing dependencies detected: {missing_packages}")
+         print("Attempting automatic installation...")
+
+         import subprocess
+
+         for package in missing_packages:
+             try:
+                 subprocess.check_call([sys.executable, "-m", "pip", "install", package])
+                 print(f"✓ Successfully installed {package}")
+             except subprocess.CalledProcessError as e:
+                 print(f"✗ Failed to install {package}: {e}")
+                 print(f"Please run manually: pip install {package}")
+                 return False
+
+         print("Dependencies installed, re-importing...")
+         return True
+     return True
+
+
+ # Try to install any missing packages
+ if not _install_missing_packages():
+     print("Please install the missing dependencies manually and try again")
+     sys.exit(1)
+
+ # Import dependencies
+ try:
+     from icrawler.builtin import (
+         BingImageCrawler,
+         BaiduImageCrawler,
+         GoogleImageCrawler,  # the stock version is buggy; FixedGoogleImageCrawler below is used instead
+         FlickrImageCrawler,
+         GreedyImageCrawler,
+         UrlListCrawler,
+     )
+     from icrawler.builtin.google import GoogleFeeder
+     from icrawler import Crawler, Parser, ImageDownloader as IcrawlerDownloader
+     from PIL import Image
+     from icrawler.storage import BaseStorage
+     import requests
+     from bs4 import BeautifulSoup
+     import urllib.parse
+     import time
+     import json
+ except ImportError as e:
+     print(f"Failed to import dependencies: {e}")
+     print("Please run the following command to install them:")
+     print("pip install icrawler Pillow requests beautifulsoup4")
+     sys.exit(1)
+
+
+ class FixedGoogleParser(Parser):
+     """Fixed Google image parser that always returns a list instead of None."""
+
+     def parse(self, response):
+         # Use response.text so requests handles encoding and decompression automatically
+         try:
+             content = response.text
+         except Exception:
+             content = response.content.decode("utf-8", "ignore")
+
+         soup = BeautifulSoup(content, "html.parser")
+         image_divs = soup.find_all(name="script")
+
+         all_uris = []
+         for div in image_divs:
+             txt = str(div)
+             # Use a more precise regular expression to match image URLs
+             uris = re.findall(r'https?://[^\s\'"<>\[\]]+\.(?:jpg|jpeg|png|webp)', txt, re.I)
+             if uris:
+                 # Decode unicode escape sequences
+                 decoded_uris = []
+                 for uri in uris:
+                     try:
+                         decoded = bytes(uri, "utf-8").decode("unicode-escape")
+                         decoded_uris.append(decoded)
+                     except Exception:
+                         decoded_uris.append(uri)
+                 all_uris.extend(decoded_uris)
+
+         # De-duplicate
+         unique_uris = list(set(all_uris))
+         if unique_uris:
+             return [{"file_url": uri} for uri in unique_uris]
+         # Return an empty list instead of None to avoid a TypeError
+         return []
+
+
+ class FixedGoogleImageCrawler(Crawler):
+     """Google image crawler that uses the fixed parser."""
+
+     def __init__(
+         self, feeder_cls=GoogleFeeder, parser_cls=FixedGoogleParser,
+         downloader_cls=IcrawlerDownloader, *args, **kwargs
+     ):
+         super().__init__(feeder_cls, parser_cls, downloader_cls, *args, **kwargs)
+         # Use more realistic browser headers so Google does not flag us as a bot.
+         # Note: set_session() cannot be used here because it creates a new session
+         # object while the parser/downloader still hold references to the old one.
+         custom_headers = {
+             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+             "Accept-Language": "en-US,en;q=0.9",
+         }
+         self.session.headers.update(custom_headers)
+
+     def crawl(
+         self,
+         keyword,
+         filters=None,
+         offset=0,
+         max_num=1000,
+         min_size=None,
+         max_size=None,
+         language=None,
+         file_idx_offset=0,
+         overwrite=False,
+         max_idle_time=None,
+     ):
+         if offset + max_num > 1000:
+             if offset > 1000:
+                 self.logger.error("Offset cannot exceed 1000")
+                 return
+             elif max_num > 1000:
+                 max_num = 1000 - offset
+                 self.logger.warning(
+                     "Due to Google's limitation, max_num has been set to %d.",
+                     1000 - offset,
+                 )
+         feeder_kwargs = dict(
+             keyword=keyword, offset=offset, max_num=max_num,
+             language=language, filters=filters
+         )
+         downloader_kwargs = dict(
+             max_num=max_num,
+             min_size=min_size,
+             max_size=max_size,
+             file_idx_offset=file_idx_offset,
+             overwrite=overwrite,
+             max_idle_time=max_idle_time,
+         )
+         super().crawl(feeder_kwargs=feeder_kwargs, downloader_kwargs=downloader_kwargs)
+
+
+ class URLCapturingHandler(logging.Handler):
+     """Custom logging handler that captures URL information from icrawler."""
+
+     def __init__(self):
+         super().__init__()
+         self.url_mappings = {}
+         self.image_counter = 0
+
+     def emit(self, record):
+         """Process a log record and extract URL information."""
+         if hasattr(record, "getMessage"):
+             message = record.getMessage()
+             # Match log lines such as "image #1 https://example.com/image.jpg"
+             url_match = re.search(r"image #(\d+)\s+(https?://[^\s]+)", message)
+             if url_match:
+                 image_num = int(url_match.group(1))
+                 url = url_match.group(2)
+                 # Per icrawler's naming convention, files are named 000001.jpg, 000002.jpg, ...
+                 filename = f"{image_num:06d}.jpg"
+                 self.url_mappings[filename] = url
+
+
+ class URLMappingStorage(BaseStorage):
+     """Custom storage class that captures URL mapping information."""
+
+     def __init__(self, root_dir, url_mappings):
+         super().__init__(root_dir)
+         self.url_mappings = url_mappings
+
+     def write(self, task, **kwargs):
+         """Override write() to capture URL information."""
+         file_idx = super().write(task, **kwargs)
+         if file_idx is not None:
+             # Capture the URL and file path information
+             filename = self.get_filename(task, file_idx, **kwargs)
+             self.url_mappings.append(
+                 {
+                     "file_path": filename,
+                     "original_url": task.get("img_url", ""),
+                     "keyword": task.get("keyword", ""),
+                     "engine": task.get("engine", ""),
+                 }
+             )
+         return file_idx
+
+
+ class ImageDownloader:
+     """Image downloader built on top of the icrawler library."""
+
+     def __init__(self, save_dir="downloaded_images"):
+         """
+         Initialize the image downloader.
+
+         Args:
+             save_dir: directory where images are saved; defaults to "downloaded_images"
+         """
+         self.save_dir = save_dir
+         self.save_mapping = False
+         self.url_mappings = []
+         os.makedirs(save_dir, exist_ok=True)
+
+     def download_from_baidu(self, keyword, num_images=20):
+         """
+         Search Baidu Images and download the results.
+
+         Args:
+             keyword: search keyword
+             num_images: number of images to download
+
+         Returns:
+             The number of images downloaded.
+         """
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Searching Baidu and downloading images for '{keyword}'...")
+
+         # Create the Baidu crawler
+         crawler = BaiduImageCrawler(
+             downloader_threads=4, storage={"root_dir": temp_dir}
+         )
+
+         # Run the crawl
+         crawler.crawl(keyword=keyword, max_num=num_images)
+
+         # If URL mappings should be saved, extract URL info from the temp directory first
+         url_mappings = []
+         if self.save_mapping:
+             url_mappings = self._extract_urls_from_temp_dir(temp_dir, keyword, "baidu")
+
+         # Convert all images to JPEG with hash-based filenames and move them to the keyword directory
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "baidu", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_from_bing(self, keyword, num_images=20):
+         """
+         Search Bing Images and download the results.
+
+         Args:
+             keyword: search keyword
+             num_images: number of images to download
+
+         Returns:
+             The number of images downloaded.
+         """
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Searching Bing and downloading images for '{keyword}'...")
+
+         # If URLs need to be captured, install the logging handler
+         url_handler = None
+         if self.save_mapping:
+             url_handler = URLCapturingHandler()
+             # Attach our handler to every icrawler-related logger
+             loggers = ["icrawler", "downloader", "parser", "feeder"]
+             for logger_name in loggers:
+                 logger = logging.getLogger(logger_name)
+                 logger.addHandler(url_handler)
+                 logger.setLevel(logging.INFO)
+
+         # Create the Bing crawler
+         crawler = BingImageCrawler(downloader_threads=4, storage={"root_dir": temp_dir})
+
+         # Run the crawl
+         crawler.crawl(keyword=keyword, max_num=num_images)
+
+         # Remove the URL handler
+         if url_handler:
+             loggers = ["icrawler", "downloader", "parser", "feeder"]
+             for logger_name in loggers:
+                 logger = logging.getLogger(logger_name)
+                 logger.removeHandler(url_handler)
+
+         # If URL mappings should be saved, take them from the URL handler
+         url_mappings = {}
+         if self.save_mapping and url_handler:
+             url_mappings = url_handler.url_mappings
+
+         # Convert all images to JPEG with hash-based filenames and move them to the keyword directory
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "bing", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_from_google(self, keyword, num_images=20):
+         """
+         Search Google Images and download the results.
+
+         Args:
+             keyword: search keyword
+             num_images: number of images to download
+
+         Returns:
+             The number of images downloaded.
+
+         Note:
+             Google frequently changes its HTML structure, which can break icrawler's parser.
+             If you run into problems, use the bing or baidu engine instead.
+         """
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Searching Google and downloading images for '{keyword}'...")
+
+         try:
+             # Create a Google crawler that uses the fixed parser
+             crawler = FixedGoogleImageCrawler(
+                 downloader_threads=4, storage={"root_dir": temp_dir}
+             )
+
+             # Run the crawl
+             crawler.crawl(keyword=keyword, max_num=num_images)
+         except Exception as e:
+             print(f"Google search failed: {e}")
+             print("Hint: Google changes its HTML structure often; use the bing or baidu engine instead")
+
+         # If URL mappings should be saved, extract URL info from the temp directory first
+         url_mappings = []
+         if self.save_mapping:
+             url_mappings = self._extract_urls_from_temp_dir(temp_dir, keyword, "google")
+
+         # Convert all images to JPEG with hash-based filenames and move them to the keyword directory
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "google", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_from_flickr(
+         self, keyword, num_images=20, api_key=None, api_secret=None, tag_mode="any"
+     ):
+         """
+         Search Flickr and download the results.
+
+         Args:
+             keyword: search keyword
+             num_images: number of images to download
+             api_key: Flickr API key (optional; anonymous mode is used without it, with limited functionality)
+             api_secret: Flickr API secret (optional)
+             tag_mode: tag matching mode, 'any' or 'all'
+
+         Returns:
+             The number of images downloaded.
+         """
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Searching Flickr and downloading images for '{keyword}'...")
+
+         try:
+             # Create the Flickr crawler
+             if api_key and api_secret:
+                 # Use the provided API credentials
+                 crawler = FlickrImageCrawler(
+                     api_key=api_key,
+                     api_secret=api_secret,
+                     downloader_threads=4,
+                     storage={"root_dir": temp_dir},
+                 )
+             else:
+                 # Anonymous mode (limited functionality, but no API key required)
+                 print("Warning: no Flickr API key provided, using anonymous mode (limited functionality)")
+                 crawler = FlickrImageCrawler(
+                     downloader_threads=4, storage={"root_dir": temp_dir}
+                 )
+
+             # Run the crawl
+             crawler.crawl(text=keyword, max_num=num_images, tag_mode=tag_mode)
+
+         except Exception as e:
+             print(f"Flickr crawl failed: {e}")
+             print("Hint: request a Flickr API key for better functionality")
+             # Clean up the temporary directory
+             shutil.rmtree(temp_dir, ignore_errors=True)
+             return 0
+
+         # If URL mappings should be saved, extract URL info from the temp directory first
+         url_mappings = []
+         if self.save_mapping:
+             url_mappings = self._extract_urls_from_temp_dir(temp_dir, keyword, "flickr")
+
+         # Convert all images to JPEG with hash-based filenames and move them to the keyword directory
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "flickr", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_from_website(
+         self, urls, keyword="website_images", num_images=20, allowed_domains=None
+     ):
+         """
+         Greedily crawl all images from the given website(s).
+
+         Args:
+             urls: target website URL list or a single URL
+             keyword: keyword used to name the output directory
+             num_images: maximum number of images to download
+             allowed_domains: list of allowed domains; None means no restriction
+
+         Returns:
+             The number of images downloaded.
+         """
+         if isinstance(urls, str):
+             urls = [urls]
+
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Greedily crawling images from {urls}...")
+
+         try:
+             # Create the greedy crawler
+             crawler = GreedyImageCrawler(
+                 downloader_threads=4, storage={"root_dir": temp_dir}
+             )
+
+             # Run the crawl
+             for url in urls:
+                 print(f"Crawling website: {url}")
+                 crawler.crawl(
+                     domains=[url] if allowed_domains is None else allowed_domains,
+                     max_num=num_images // len(urls),  # split the quota evenly across URLs
+                 )
+
+         except Exception as e:
+             print(f"Website crawl failed: {e}")
+             # Clean up the temporary directory
+             shutil.rmtree(temp_dir, ignore_errors=True)
+             return 0
+
+         # If URL mappings should be saved, extract URL info from the temp directory first
+         url_mappings = []
+         if self.save_mapping:
+             url_mappings = self._extract_urls_from_temp_dir(
+                 temp_dir, keyword, "website"
+             )
+
+         # Convert all images to JPEG with hash-based filenames and move them to the keyword directory
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "website", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_from_urls(self, url_list, keyword="url_images", num_images=None):
+         """
+         Download images from a list of URLs.
+
+         Args:
+             url_list: a list of image URLs, or the path to a file containing URLs
+             keyword: keyword used to name the output directory
+             num_images: maximum number to download; None means download all
+
+         Returns:
+             The number of images downloaded.
+         """
+         # Normalize the URL list input
+         if isinstance(url_list, str):
+             # A file path
+             if os.path.isfile(url_list):
+                 with open(url_list, "r", encoding="utf-8") as f:
+                     urls = [line.strip() for line in f if line.strip()]
+             else:
+                 # A single URL
+                 urls = [url_list]
+         else:
+             urls = url_list
+
+         if not urls:
+             print("Error: no valid URLs found")
+             return 0
+
+         if num_images:
+             urls = urls[:num_images]
+
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Downloading images from {len(urls)} URLs...")
+
+         try:
+             # Create the URL-list crawler
+             crawler = UrlListCrawler(
+                 downloader_threads=4, storage={"root_dir": temp_dir}
+             )
+
+             # Run the crawl
+             crawler.crawl(urls)
+
+         except Exception as e:
+             print(f"URL-list download failed: {e}")
+             # Clean up the temporary directory
+             shutil.rmtree(temp_dir, ignore_errors=True)
+             return 0
+
+         # If URL mappings should be saved, extract URL info from the temp directory first
+         url_mappings = []
+         if self.save_mapping:
+             url_mappings = self._extract_urls_from_temp_dir(temp_dir, keyword, "urls")
+
+         # Convert all images to JPEG with hash-based filenames and move them to the keyword directory
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "urls", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_from_unsplash(self, keyword, num_images=20, per_page=30):
+         """
+         Search Unsplash and download high-quality free images (via HTML parsing).
+
+         Args:
+             keyword: search keyword
+             num_images: number of images to download
+             per_page: number of images per page (max 30)
+
+         Returns:
+             The number of images downloaded.
+         """
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Searching Unsplash and downloading images for '{keyword}'...")
+
+         try:
+             downloaded_count = 0
+             page = 1
+
+             # More realistic browser request headers
+             session = requests.Session()
+             headers = {
+                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+                 "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7",
+                 "Accept-Encoding": "gzip, deflate, br",
+                 "DNT": "1",
+                 "Connection": "keep-alive",
+                 "Upgrade-Insecure-Requests": "1",
+                 "Sec-Fetch-Dest": "document",
+                 "Sec-Fetch-Mode": "navigate",
+                 "Sec-Fetch-Site": "none",
+                 "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                 "Sec-Ch-Ua-Mobile": "?0",
+                 "Sec-Ch-Ua-Platform": '"Windows"',
+             }
+             session.headers.update(headers)
+
+             while downloaded_count < num_images:
+                 try:
+                     # Build the search URL (web search page)
+                     search_url = (
+                         f"https://unsplash.com/s/photos/{urllib.parse.quote(keyword)}"
+                     )
+                     if page > 1:
+                         search_url += f"?page={page}"
+
+                     print(f" Fetching page {page}...")
+                     time.sleep(1)  # add a delay to avoid rate limiting
+
+                     response = session.get(search_url, timeout=15)
+
+                     if response.status_code != 200:
+                         print(f" Search failed, status code: {response.status_code}")
+                         break
+
+                     # Parse the HTML to find images
+                     soup = BeautifulSoup(response.content, "html.parser")
+
+                     # Find image elements
+                     img_elements = soup.find_all("img", {"src": True})
+                     found_images = []
+
+                     for img in img_elements:
+                         src = img.get("src", "")
+                         # Keep only Unsplash image URLs
+                         if "images.unsplash.com" in src and (
+                             "photo-" in src or "unsplash-" in src
+                         ):
+                             # Try to get a higher-quality version
+                             if "?ixlib=" in src:
+                                 # Tweak the URL parameters to request a larger size
+                                 src = src.split("?")[0] + "?ixlib=rb-4.0.3&w=1080&q=80"
+                             found_images.append(src)
+
+                     if not found_images:
+                         print(f" No valid images found on page {page}")
+                         break
+
+                     # Download the images
+                     for img_url in found_images[
+                         : min(20, num_images - downloaded_count)
+                     ]:
+                         if downloaded_count >= num_images:
+                             break
+
+                         try:
+                             # Download the image
+                             img_response = session.get(img_url, timeout=15)
+                             if img_response.status_code == 200:
+                                 # Generate a filename
+                                 filename = f"unsplash_{downloaded_count + 1}.jpg"
+                                 temp_path = os.path.join(temp_dir, filename)
+
+                                 with open(temp_path, "wb") as f:
+                                     f.write(img_response.content)
+
+                                 downloaded_count += 1
+                                 print(
+                                     f" Downloaded image {downloaded_count}/{num_images}: {filename}"
+                                 )
+
+                                 # Avoid requesting too fast
+                                 time.sleep(0.3)
+
+                         except Exception as e:
+                             print(f" Failed to download an image: {e}")
+                             continue
+
+                     page += 1
+                     if page > 10:  # cap the number of pages to avoid an endless loop
+                         break
+
+                 except Exception as e:
+                     print(f" Failed to fetch the page: {e}")
+                     break
+
+             print(f" Unsplash: successfully downloaded {downloaded_count} images")
+
+         except Exception as e:
+             print(f"Unsplash search failed: {e}")
+             downloaded_count = 0
+
+         # Convert all images to JPEG with hash-based filenames
+         url_mappings = []
+         if self.save_mapping:
+             url_mappings = self._extract_urls_from_temp_dir(
+                 temp_dir, keyword, "unsplash"
+             )
+
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "unsplash", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_from_pixabay(self, keyword, num_images=20, category="all"):
+         """
+         Search Pixabay and download free images.
+
+         Args:
+             keyword: search keyword
+             num_images: number of images to download
+             category: image category (all, backgrounds, fashion, nature, science, education, etc.)
+
+         Returns:
+             The number of images downloaded.
+         """
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Searching Pixabay and downloading images for '{keyword}'...")
+
+         try:
+             downloaded_count = 0
+             page = 1
+             per_page = 20  # Pixabay returns at most 20 per page
+
+             headers = {
+                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+             }
+
+             while downloaded_count < num_images:
+                 # Build the search URL
+                 search_url = "https://pixabay.com/api/"
+                 params = {
+                     "key": "9656065-a4094594c34f9ac14c7fc4c39",  # free default key
+                     "q": keyword,
+                     "image_type": "photo",
+                     "category": category,
+                     "per_page": min(per_page, num_images - downloaded_count),
+                     "page": page,
+                 }
+
+                 print(f" Fetching page {page}...")
+                 response = requests.get(
+                     search_url, params=params, headers=headers, timeout=10
+                 )
+
+                 if response.status_code != 200:
+                     print(f" Search failed, status code: {response.status_code}")
+                     break
+
+                 data = response.json()
+                 hits = data.get("hits", [])
+
+                 if not hits:
+                     print(f" No more results on page {page}")
+                     break
+
+                 # Download the images
+                 for photo in hits:
+                     if downloaded_count >= num_images:
+                         break
+
+                     try:
+                         # Get the image URL (prefer webformatURL)
+                         img_url = photo.get("webformatURL") or photo.get(
+                             "largeImageURL"
+                         )
+                         if not img_url:
+                             continue
+
+                         # Download the image
+                         img_response = requests.get(
+                             img_url, headers=headers, timeout=15
+                         )
+                         if img_response.status_code == 200:
+                             # Generate a filename
+                             photo_id = photo.get("id", f"pixabay_{downloaded_count}")
+                             filename = f"{photo_id}.jpg"
+                             temp_path = os.path.join(temp_dir, filename)
+
+                             with open(temp_path, "wb") as f:
+                                 f.write(img_response.content)
+
+                             downloaded_count += 1
+                             print(
+                                 f" Downloaded image {downloaded_count}/{num_images}: {filename}"
+                             )
+
+                             # Avoid requesting too fast
+                             time.sleep(0.1)
+
+                     except Exception as e:
+                         print(f" Failed to download an image: {e}")
+                         continue
+
+                 page += 1
+                 time.sleep(0.5)  # delay between pages
+
+             print(f" Pixabay: successfully downloaded {downloaded_count} images")
+
+         except Exception as e:
+             print(f"Pixabay search failed: {e}")
+             downloaded_count = 0
+
+         # Convert all images to JPEG with hash-based filenames
+         url_mappings = []
+         if self.save_mapping:
+             url_mappings = self._extract_urls_from_temp_dir(
+                 temp_dir, keyword, "pixabay"
+             )
+
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "pixabay", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_from_pexels(self, keyword, num_images=20, per_page=20):
+         """
+         Search Pexels and download free images (with improved anti-bot-detection measures).
+
+         Args:
+             keyword: search keyword
+             num_images: number of images to download
+             per_page: number of images per page
+
+         Returns:
+             The number of images downloaded.
+         """
+         # Create a temporary directory for this keyword
+         temp_dir = os.path.join(self.save_dir, "_temp_" + keyword.replace(" ", "_"))
+         os.makedirs(temp_dir, exist_ok=True)
+
+         # Create the final per-keyword directory
+         keyword_dir = os.path.join(self.save_dir, keyword.replace(" ", "_"))
+         os.makedirs(keyword_dir, exist_ok=True)
+
+         print(f"Searching Pexels and downloading images for '{keyword}'...")
+
+         try:
+             downloaded_count = 0
+             page = 1
+
+             # Create a session with a full set of browser request headers
+             session = requests.Session()
+             headers = {
+                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                 "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+                 "Accept-Language": "en-US,en;q=0.9",
+                 "Accept-Encoding": "gzip, deflate, br",
+                 "DNT": "1",
+                 "Connection": "keep-alive",
+                 "Upgrade-Insecure-Requests": "1",
+                 "Sec-Fetch-Dest": "document",
+                 "Sec-Fetch-Mode": "navigate",
+                 "Sec-Fetch-Site": "none",
+                 "Sec-Fetch-User": "?1",
+                 "Sec-Ch-Ua": '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                 "Sec-Ch-Ua-Mobile": "?0",
+                 "Sec-Ch-Ua-Platform": '"Windows"',
+                 "Cache-Control": "max-age=0",
+             }
+             session.headers.update(headers)
+
+             # Visit the homepage first to establish a session
+             try:
+                 print(" Establishing session...")
+                 session.get("https://www.pexels.com/", timeout=10)
+                 time.sleep(1)
+             except:
+                 pass  # ignore homepage access errors
+
+             while downloaded_count < num_images:
+                 try:
+                     # Build the search URL
+                     if page == 1:
+                         search_url = f"https://www.pexels.com/search/{urllib.parse.quote(keyword)}/"
+                     else:
+                         search_url = f"https://www.pexels.com/search/{urllib.parse.quote(keyword)}/?page={page}"
+
+                     print(f" Fetching page {page}...")
+                     time.sleep(2)  # add a delay
+
+                     response = session.get(search_url, timeout=15)
+
+                     if response.status_code != 200:
+                         print(f" Search failed, status code: {response.status_code}")
+                         if response.status_code == 403:
+                             print(" Possibly blocked by anti-scraping protection, trying to continue...")
+                         break
+
+                     # Parse the HTML
+                     soup = BeautifulSoup(response.content, "html.parser")
+
+                     # Find image elements - use more precise selectors
+                     found_images = []
+
+                     # Method 1: scan all img tags
+                     img_elements = soup.find_all("img")
+                     for img in img_elements:
+                         src = img.get("src", "")
+                         # Keep only Pexels image URLs
+                         if "images.pexels.com" in src:
+                             # Try to get a larger image
+                             if "?auto=compress" in src and "w=" in src:
+                                 # Tweak the URL to request a larger size
+                                 base_url = src.split("?")[0]
+                                 src = (
+                                     base_url
+                                     + "?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=1"
+                                 )
+                             found_images.append(src)
+
+                     # Method 2: scan srcset attributes
+                     for img in img_elements:
+                         srcset = img.get("srcset", "")
+                         if srcset and "images.pexels.com" in srcset:
+                             # Extract the largest URL from the srcset
+                             urls = srcset.split(",")
+                             for url_info in urls:
+                                 url_part = url_info.strip().split(" ")[0]
+                                 if "images.pexels.com" in url_part:
+                                     found_images.append(url_part)
+
+                     # De-duplicate
+                     found_images = list(set(found_images))
+
+                     if not found_images:
+                         print(f" No valid images found on page {page}")
+                         break
+
+                     print(f" Found {len(found_images)} images")
+
+                     # Download the images
+                     for img_url in found_images[
+                         : min(15, num_images - downloaded_count)
+                     ]:
+                         if downloaded_count >= num_images:
+                             break
+
+                         try:
+                             # Clean up the URL
+                             if img_url.startswith("//"):
+                                 img_url = "https:" + img_url
+
+                             # Download the image
+                             img_response = session.get(img_url, timeout=15)
+                             if img_response.status_code == 200:
+                                 # Generate a filename
+                                 filename = f"pexels_{downloaded_count + 1}.jpg"
+                                 temp_path = os.path.join(temp_dir, filename)
+
+                                 with open(temp_path, "wb") as f:
+                                     f.write(img_response.content)
+
+                                 downloaded_count += 1
+                                 print(
+                                     f" Downloaded image {downloaded_count}/{num_images}: {filename}"
+                                 )
+
+                                 # Avoid requesting too fast
+                                 time.sleep(0.5)
+
+                         except Exception as e:
+                             print(f" Failed to download an image: {e}")
+                             continue
+
+                     page += 1
+                     if page > 8:  # cap the number of pages
+                         break
+
+                 except Exception as e:
+                     print(f" Failed to fetch the page: {e}")
+                     break
+
+             print(f" Pexels: successfully downloaded {downloaded_count} images")
+
+         except Exception as e:
+             print(f"Pexels search failed: {e}")
+             downloaded_count = 0
+
+         # Convert all images to JPEG with hash-based filenames
+         url_mappings = []
+         if self.save_mapping:
+             url_mappings = self._extract_urls_from_temp_dir(temp_dir, keyword, "pexels")
+
+         converted = self._convert_images_to_jpg_with_hash(
+             temp_dir, keyword, keyword_dir, "pexels", url_mappings
+         )
+
+         # Clean up the temporary directory
+         shutil.rmtree(temp_dir, ignore_errors=True)
+
+         return converted
+
+     def download_images(self, keyword, num_images=20, engine="bing", **kwargs):
+         """
+         Download images for a keyword from the specified search engine.
+
+         Args:
+             keyword: search keyword
+             num_images: number of images to download
+             engine: search engine; both traditional engines and free stock-photo sources are supported
+                 Traditional engines: "baidu", "bing", "google", "flickr"
+                 Free stock sources: "unsplash", "pixabay", "pexels" (no API key required)
+                 Special modes: "website", "urls"
+             **kwargs: engine-specific extra arguments
+                 - flickr: api_key, api_secret, tag_mode
+                 - website: urls (required), allowed_domains
+                 - urls: url_list (required)
+                 - pixabay: category (optional)
+                 - unsplash/pexels: per_page (optional)
+
+         Returns:
+             The number of images downloaded.
+         """
+         if engine == "baidu":
+             return self.download_from_baidu(keyword, num_images)
+         elif engine == "bing":
+             return self.download_from_bing(keyword, num_images)
+         elif engine == "google":
+             return self.download_from_google(keyword, num_images)
+         elif engine == "flickr":
+             return self.download_from_flickr(
+                 keyword,
+                 num_images,
+                 api_key=kwargs.get("api_key"),
+                 api_secret=kwargs.get("api_secret"),
+                 tag_mode=kwargs.get("tag_mode", "any"),
+             )
+         elif engine == "unsplash":
+             return self.download_from_unsplash(
+                 keyword, num_images, per_page=kwargs.get("per_page", 30)
+             )
+         elif engine == "pixabay":
+             return self.download_from_pixabay(
+                 keyword, num_images, category=kwargs.get("category", "all")
+             )
+         elif engine == "pexels":
+             return self.download_from_pexels(
+                 keyword, num_images, per_page=kwargs.get("per_page", 20)
+             )
+         elif engine == "website":
+             urls = kwargs.get("urls")
+             if not urls:
+                 raise ValueError("The 'urls' argument is required when using the website engine")
+             return self.download_from_website(
+                 urls,
+                 keyword=keyword,
+                 num_images=num_images,
+                 allowed_domains=kwargs.get("allowed_domains"),
+             )
+         elif engine == "urls":
+             url_list = kwargs.get("url_list")
+             if not url_list:
+                 raise ValueError("The 'url_list' argument is required when using the urls engine")
+             return self.download_from_urls(
+                 url_list, keyword=keyword, num_images=num_images
+             )
+         else:
+             raise ValueError(
+                 f"Unsupported search engine: {engine}; use 'baidu', 'bing', 'google', 'flickr', 'unsplash', 'pixabay', 'pexels', 'website' or 'urls'"
+             )
+
+     def _get_image_hash(self, image_data):
+         """
+         Compute the MD5 hash of the image content.
+
+         Args:
+             image_data: binary image data
+
+         Returns:
+             The image hash.
+         """
+         return hashlib.md5(image_data).hexdigest()
+
+     def _extract_urls_from_temp_dir(self, temp_dir, keyword, engine):
+         """
+         Build a filename-to-URL mapping from the temp directory (using icrawler's built-in naming convention).
+
+         Args:
+             temp_dir: path of the temporary directory
+             keyword: search keyword
+             engine: search engine
+
+         Returns:
+             A list of URL mappings.
+         """
+         # icrawler names downloaded images with sequential numbers (000001.jpg, 000002.jpg, ...).
+         # A mapping structure is created here to hold known URL information.
+         # Since URL mappings cannot be obtained directly from icrawler, only the basic
+         # structure is provided; URL info is filled in later from other sources.
+         mappings = []
+
+         # Collect all image files in the temporary directory
+         image_files = sorted(glob.glob(os.path.join(temp_dir, "*.*")))
+
+         for i, img_path in enumerate(image_files):
+             filename = os.path.basename(img_path)
+             mappings.append(
+                 {
+                     "temp_filename": filename,
+                     "temp_path": img_path,
+                     "index": i + 1,
+                     "original_url": "",  # filled in later in the pipeline
+                 }
+             )
+
+         return mappings
+
+     def _convert_images_to_jpg_with_hash(
+         self, directory, keyword, target_dir, engine, url_mappings=None
+     ):
+         """
+         Convert every image in a directory to JPEG and name it by its content hash.
+
+         Args:
+             directory: directory containing the images
+             keyword: search keyword (used in the metadata)
+             target_dir: target directory for the converted images
+             engine: the search engine that was used
+             url_mappings: list or dict of URL mappings (optional)
+
+         Returns:
+             The number of successfully converted images.
+         """
+         converted_count = 0
+         # Collect all image files
+         image_files = glob.glob(os.path.join(directory, "*.*"))
+
+         for i, img_path in enumerate(image_files):
+             try:
+                 # Read the raw image bytes
+                 with open(img_path, "rb") as f:
+                     image_data = f.read()
+
+                 # Compute the hash of the image content
+                 hash_value = self._get_image_hash(image_data)
+
+                 try:
+                     # Load the image to make sure it is valid
+                     img = Image.open(io.BytesIO(image_data))
+
+                     # Convert to RGB (in case it is RGBA or another mode)
+                     if img.mode != "RGB":
+                         img = img.convert("RGB")
+
+                     # Use the hash value as the filename
+                     jpg_filename = f"{hash_value}.jpg"
+                     jpg_path = os.path.join(target_dir, jpg_filename)
+
+                     # Skip if the file already exists (avoid duplicates)
+                     if os.path.exists(jpg_path):
+                         print(f"Image already exists (hash: {hash_value})")
+                         converted_count += 1
+                         continue
+
+                     # Save as JPEG
+                     img.save(jpg_path, "JPEG")
+
+                     # Update the URL mapping information (if mappings are being saved)
+                     if self.save_mapping:
+                         original_filename = os.path.basename(img_path)
+                         # Look up the original URL in the URL mapping
+                         original_url = ""
+                         if url_mappings and isinstance(url_mappings, dict):
+                             original_url = url_mappings.get(original_filename, "")
+
+                         mapping_entry = {
+                             "original_filename": original_filename,
+                             "final_filename": jpg_filename,
+                             "final_path": jpg_path,
+                             "keyword": keyword,
+                             "engine": engine,
+                             "original_url": original_url,
+                             "hash": hash_value,
+                         }
+                         self.url_mappings.append(mapping_entry)
+
+                     converted_count += 1
+                     print(f"Saved image to {target_dir}: {jpg_filename}")
+
+                 except Exception as e:
+                     print(f"Failed to process image: {e}")
+
+             except Exception as e:
+                 print(f"Could not process image {img_path}: {e}")
+
+         print(f"Successfully processed and hashed {converted_count} images, saved to '{target_dir}'")
+         return converted_count
+
+
+ def download_images_cli(
+     keywords,
+     num_images=50,
+     engines=None,
+     save_dir="downloaded_images",
+     save_mapping=True,
+     flickr_api_key=None,
+     flickr_api_secret=None,
+     website_urls=None,
+     url_list_file=None,
+ ):
+     """
+     CLI-friendly image download function.
+
+     Args:
+         keywords: list of search keywords or a single keyword string
+         num_images: number of images to download per keyword; defaults to 50
+         engines: list of search engines to use; defaults to ["bing", "google"]
+             Supported: "bing", "google", "baidu", "flickr", "website", "urls"
+         save_dir: directory where images are saved; defaults to "downloaded_images"
+         save_mapping: whether to save image metadata to a metadata.jsonl file; defaults to True
+         flickr_api_key: Flickr API key (required for the flickr engine)
+         flickr_api_secret: Flickr API secret (required for the flickr engine)
+         website_urls: comma-separated list of website URLs (required for the website engine)
+         url_list_file: path to a file containing image URLs (required for the urls engine)
+
+     Returns:
+         A dictionary of download statistics.
+     """
+     # Normalize the input arguments
+     if isinstance(keywords, str):
+         keywords = [keywords]
+
+     if engines is None:
+         engines = ["bing", "google"]
+     elif isinstance(engines, str):
+         engines = [engines]
+
+     # Create the downloader instance
+     downloader = ImageDownloader(save_dir=save_dir)
+     downloader.save_mapping = save_mapping
+
+     # Statistics
+     stats = {
+         "total_keywords": len(keywords),
+         "total_engines": len(engines),
+         "downloads": {},
+         "total_downloaded": 0,
+     }
+
+     print("Starting image download...")
+     print(f"Number of keywords: {len(keywords)}")
+     print(f"Images per keyword: {num_images}")
+     print(f"Search engines: {', '.join(engines)}")
+     print(f"Save directory: {save_dir}")
+     print("-" * 60)
+
+     # Download images for every keyword
+     for i, keyword in enumerate(keywords, 1):
+         print(f"\n[{i}/{len(keywords)}] Processing keyword: '{keyword}'")
+         stats["downloads"][keyword] = {}
+
+         for engine in engines:
+             try:
+                 print(f" Searching with {engine}...")
+
+                 # Prepare engine-specific arguments
+                 engine_kwargs = {}
+
+                 if engine == "flickr":
+                     if flickr_api_key:
+                         engine_kwargs["api_key"] = flickr_api_key
+                     if flickr_api_secret:
+                         engine_kwargs["api_secret"] = flickr_api_secret
+
+                 elif engine == "website":
+                     if website_urls:
+                         urls = [url.strip() for url in website_urls.split(",")]
+                         engine_kwargs["urls"] = urls
+                     else:
+                         print(f" {engine}: skipped - the website_urls argument is required")
+                         stats["downloads"][keyword][engine] = 0
+                         continue
+
+                 elif engine == "urls":
+                     if url_list_file:
+                         engine_kwargs["url_list"] = url_list_file
+                     else:
+                         print(f" {engine}: skipped - the url_list_file argument is required")
+                         stats["downloads"][keyword][engine] = 0
+                         continue
+
+                 downloaded_count = downloader.download_images(
+                     keyword, num_images=num_images, engine=engine, **engine_kwargs
+                 )
+                 stats["downloads"][keyword][engine] = downloaded_count
+                 stats["total_downloaded"] += downloaded_count
+                 print(f" {engine}: downloaded {downloaded_count} images")
+
+             except Exception as e:
+                 print(f" {engine}: download failed - {e}")
+                 stats["downloads"][keyword][engine] = 0
+
+         print("-" * 50)
+
+     # Save the metadata table (if requested)
+     if save_mapping and downloader.url_mappings:
+         import json
+         from pathlib import Path
+
+         metadata_file = Path(save_dir) / "metadata.jsonl"
+
+         # Decide between creating a new file and appending to an existing one
+         if metadata_file.exists():
+             print(f"\nAppending metadata to: {metadata_file}")
+             mode = "a"
+         else:
+             print(f"\nSaving metadata table to: {metadata_file}")
+             mode = "w"
+
+         with open(metadata_file, mode, encoding="utf-8") as f:
+             for mapping in downloader.url_mappings:
+                 json.dump(mapping, f, ensure_ascii=False)
+                 f.write("\n")
+
+         print(f"Saved {len(downloader.url_mappings)} metadata records")
+
+     # Print the summary
+     print("\nDownload finished!")
+     print(f"Downloaded {stats['total_downloaded']} images in total")
+     print(f"Images saved under: {save_dir}")
+
+     return stats
+
+
+ def main():
+     """Entry point demonstrating how to use the ImageDownloader class."""
+     # Create the downloader instance
+     downloader = ImageDownloader(save_dir="女性图片集")  # "collection of female images"
+
+     # Example keywords (Chinese search terms: outdoor female selfies, portraits, anime/film characters, campus girls, casual snapshots)
+     keywords = [
+         "户外自拍女性",
+         "女性写真",
+         "动漫女角色",
+         "影视剧女角色",
+         "短片",
+         "校园 女生",
+         "随手拍",
+         "女性 自拍",
+     ]
+
+     # Download images for every keyword
+     for keyword in keywords:
+         downloader.download_images(keyword, num_images=100, engine="bing")
+         downloader.download_images(keyword, num_images=100, engine="google")
+         downloader.download_images(keyword, num_images=100, engine="baidu")
+         print("-" * 50)
+
+
+ if __name__ == "__main__":
+     # download_images_simple()
+     main()
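
For reference, a minimal usage sketch of the downloader above, assuming the wheel is installed so the file is importable as maque.web.image_downloader (the keyword and save_dir values are made-up examples, not part of the package):

    from maque.web.image_downloader import ImageDownloader, download_images_cli

    # Download a few Bing results for one keyword and record hash/URL metadata.
    downloader = ImageDownloader(save_dir="example_images")  # hypothetical directory name
    downloader.save_mapping = True
    downloader.download_images("golden retriever", num_images=10, engine="bing")

    # Or use the CLI-friendly wrapper, which loops over keywords and engines and
    # appends one JSON record per saved image to example_images/metadata.jsonl.
    stats = download_images_cli(
        ["golden retriever"], num_images=10, engines=["bing"], save_dir="example_images"
    )
    print(stats["total_downloaded"])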