aio-scrapy 2.1.3__py3-none-any.whl → 2.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/LICENSE +1 -1
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/METADATA +53 -40
- aio_scrapy-2.1.6.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +523 -18
- aioscrapy/core/downloader/handlers/__init__.py +188 -6
- aioscrapy/core/downloader/handlers/aiohttp.py +188 -4
- aioscrapy/core/downloader/handlers/curl_cffi.py +125 -4
- aioscrapy/core/downloader/handlers/httpx.py +134 -4
- aioscrapy/core/downloader/handlers/pyhttpx.py +133 -4
- aioscrapy/core/downloader/handlers/requests.py +121 -3
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +170 -14
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +193 -7
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +313 -13
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.3.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -110
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -53
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,545 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Excel Pipeline for AioScrapy
|
|
3
|
+
AioScrapy的Excel管道
|
|
4
|
+
|
|
5
|
+
This module provides a pipeline for storing scraped items in Excel files.
|
|
6
|
+
It includes a mixin class for Excel file handling and a pipeline class that
|
|
7
|
+
uses the mixin to process items and save them to Excel files. It supports
|
|
8
|
+
formatting dates and embedding images.
|
|
9
|
+
此模块提供了一个用于将抓取的项目存储在Excel文件中的管道。
|
|
10
|
+
它包括一个用于Excel文件处理的混入类和一个使用该混入类处理项目并将其保存到Excel文件的管道类。
|
|
11
|
+
它支持格式化日期和嵌入图像。
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
import math
|
|
16
|
+
from io import BytesIO
|
|
17
|
+
from typing import Tuple, Optional
|
|
18
|
+
|
|
19
|
+
import requests
|
|
20
|
+
import xlsxwriter
|
|
21
|
+
from PIL import Image, ImageFile
|
|
22
|
+
|
|
23
|
+
from aioscrapy.utils.log import logger
|
|
24
|
+
|
|
25
|
+
# Pick the high-quality downsampling filter used when resizing images.
# ``Image.LANCZOS`` is the current name; very old Pillow releases only expose
# the same filter under the legacy ``ANTIALIAS`` alias.
try:
    resample = Image.LANCZOS
except AttributeError:
    # Only a missing attribute should trigger the fallback; anything else
    # (including KeyboardInterrupt/SystemExit) must propagate.
    resample = Image.ANTIALIAS

# Let Pillow decode images whose data stream is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ExcelSinkMixin:
    """
    Mixin that writes dictionary items as rows of ``.xlsx`` files.

    Workbooks are created with xlsxwriter the first time a filename is seen
    and stay open until ``close_execl``/``close`` is called. The header row of
    each worksheet is taken from the keys of the first item written to it.
    Fields can optionally be written as Excel dates or replaced by an image
    downloaded from the URL stored in the field.

    The caches below are class attributes, so they are shared by every
    instance (and subclass that does not override them).
    """

    # Worksheet objects keyed by (filename, sheet).
    ws_cache = {}

    # Workbook objects keyed by filename.
    wb_cache = {}

    # Header field lists keyed by (filename, sheet).
    fields_cache = {}

    # Index of the last row handed out, keyed by (filename, sheet).
    y_cache = {}

    # Timeout in seconds passed to ``requests.get`` when downloading images,
    # so that an unresponsive server cannot stall the pipeline forever.
    img_download_timeout = 30

    @staticmethod
    def _normalize_filename(filename: str) -> str:
        """Return ``filename`` with a ``.xlsx`` suffix appended if it lacks one."""
        if filename.lower().endswith('.xlsx'):
            return filename
        return filename + '.xlsx'

    @staticmethod
    def _sheet_key(filename: str, sheet: str) -> Tuple[str, str]:
        """Return the cache key identifying one worksheet of one workbook."""
        # A tuple (rather than string concatenation) keeps keys of different
        # files from colliding or sharing a common prefix.
        return filename, sheet

    @staticmethod
    async def deal_img(url: str, img_size: Optional[Tuple[int, int]]) -> Optional[BytesIO]:
        """
        Download an image and return it re-encoded in an in-memory buffer.

        Args:
            url: URL of the image. A protocol-relative URL (``//host/...``)
                is fetched over https.
            img_size: Optional ``(width, height)`` bounding box. When given,
                the image is scaled (keeping its aspect ratio) so that it
                fits the box.

        Returns:
            A ``BytesIO`` positioned at offset 0 holding the image in its
            original format, or ``None`` when the URL is empty/not a string
            or the image could not be downloaded or processed.
        """
        if not isinstance(url, str) or not url:
            return None

        if url.startswith('//'):
            url = 'https:' + url

        def _download() -> bytes:
            return requests.get(url, timeout=ExcelSinkMixin.img_download_timeout).content

        # ``requests`` is blocking, so run it in the default executor to keep
        # the event loop responsive while the download is in flight.
        try:
            loop = asyncio.get_running_loop()
            img_bytes = await loop.run_in_executor(None, _download)
        except Exception as e:
            logger.error(f"download img error: {e}")
            return None

        try:
            im = Image.open(BytesIO(img_bytes))
            # ``resize`` returns a new image without format info, so remember it.
            im_format = im.format

            result = BytesIO()
            if img_size:
                # The larger of the two ratios is the factor that makes the
                # image fit the box in both dimensions.
                scale = max(im.size[0] / img_size[0], im.size[1] / img_size[1])
                target = (math.ceil(im.size[0] / scale), math.ceil(im.size[1] / scale))
                resized = im.resize(target, resample)
                try:
                    # Palette mode keeps the embedded image small.
                    resized.convert('P').save(result, format=im_format)
                except (OSError, ValueError, KeyError):
                    # Some formats (e.g. JPEG) cannot store palette images;
                    # save the resized image in its own mode instead.
                    result = BytesIO()
                    resized.save(result, format=im_format)
            else:
                im.save(result, format=im_format)
        except Exception as e:
            logger.error(f"process img error: {e}")
            return None

        result.seek(0)
        return result

    async def save_item(
            self,
            item: dict,
            *,
            filename: Optional[str] = None,
            date_fields: Optional[list] = None,
            date_format: str = 'yyyy-mm-dd HH:MM:SS',
            img_fields: Optional[list] = None,
            img_size: Optional[Tuple[int, int]] = None,
            **options
    ):
        """
        Append ``item`` as one row of an Excel worksheet.

        Only the fields present in the worksheet's header (the keys of the
        first item written to it) are written; fields missing from ``item``
        are written as ``None``. Cell shading alternates by column index.

        Args:
            item: The dictionary to write.
            filename: Target file; ``.xlsx`` is appended when missing.
            date_fields: Names of fields to write with ``write_datetime``.
                Values xlsxwriter rejects as dates are written as plain cells.
            date_format: Excel number format applied to date fields.
            img_fields: Names of fields holding image URLs to embed. If the
                image cannot be fetched or inserted, the URL is written.
            img_size: Optional ``(width, height)`` in pixels used to scale
                images and to size the columns and the written row.
            **options: ``sheet`` selects the worksheet name; the remaining
                options are passed to ``xlsxwriter.Workbook``.

        Raises:
            AssertionError: If ``filename`` is None.

        Errors raised while writing are logged and not propagated.
        """
        # Raised explicitly so the check also runs under ``python -O``.
        if filename is None:
            raise AssertionError("请传入filename参数")

        filename = self._normalize_filename(filename)

        try:
            wb, ws, fields, y = self._get_write_class(filename, item, **options)

            plain_format = wb.add_format({'align': 'left', 'border': 1, 'valign': 'vcenter'})
            shaded_format = wb.add_format({'align': 'left', 'border': 1, 'valign': 'vcenter', 'fg_color': '#D0D3D4'})
            # One date format per call is enough; it does not depend on the field.
            date_cell_format = wb.add_format({'num_format': date_format}) if date_fields else None

            for x, field in enumerate(fields):
                cell_format = plain_format if x % 2 == 0 else shaded_format
                value = item.get(field)

                if date_fields is not None and field in date_fields:
                    try:
                        ws.write_datetime(y, x, value, date_cell_format)
                    except TypeError:
                        # Missing or non-date value: keep the rest of the row
                        # by writing it as an ordinary cell.
                        ws.write(y, x, value, cell_format)

                elif img_fields is not None and field in img_fields:
                    if img_size:
                        ws.set_column_pixels(x, x, width=math.ceil(img_size[0]))

                    img_bytes = await self.deal_img(value, img_size)

                    # Fall back to the raw URL when there is no image to insert
                    # or xlsxwriter reports failure (-1).
                    if img_bytes is None or ws.insert_image(y, x, '', {'image_data': img_bytes}) == -1:
                        ws.write(y, x, value, cell_format)

                else:
                    ws.write(y, x, value, cell_format)

            if img_size is not None:
                if fields:
                    # Column indexes are inclusive, hence ``len(fields) - 1``.
                    ws.set_column_pixels(0, len(fields) - 1, width=math.ceil(img_size[0]))
                ws.set_row_pixels(y, height=math.ceil(img_size[1]))

        except Exception as e:
            logger.exception(f'Save Execl Error, filename:{filename}, item:{item}, errMsg: {e}')

    def _get_write_class(self, filename, item, sheet='sheet1', **options):
        """
        Return the workbook, worksheet, header fields and next row for a sheet.

        The workbook and worksheet are created on first use; creating a
        worksheet also writes its header row from the keys of ``item``. Every
        call advances the worksheet's row counter by one.

        Args:
            filename: Name of the Excel file.
            item: Dictionary whose keys become the header of a new worksheet.
            sheet: Worksheet name. Defaults to 'sheet1'.
            **options: Passed to ``xlsxwriter.Workbook`` when the workbook is
                created.

        Returns:
            tuple: ``(workbook, worksheet, fields, row_number)``.
        """
        key = self._sheet_key(filename, sheet)

        if self.ws_cache.get(key) is None:
            wb = self.wb_cache.get(filename)
            if wb is None:
                logger.info(f'Create Execl: {filename}')
                wb = xlsxwriter.Workbook(filename, options=options)
                self.wb_cache[filename] = wb

            ws = wb.add_worksheet(sheet)

            header_format = wb.add_format(
                {'bold': True, 'font_size': 12, 'border': 1, 'align': 'center', 'valign': 'vcenter'})

            fields = list(item.keys())

            ws.write_row('A1', fields, cell_format=header_format)
            ws.set_row(0, height=30)

            self.fields_cache[key] = fields
            self.ws_cache[key] = ws
            # Row 0 holds the header; the increment below yields row 1 first.
            self.y_cache[key] = 0

        self.y_cache[key] += 1

        return self.wb_cache[filename], \
            self.ws_cache[key], \
            self.fields_cache[key], \
            self.y_cache[key]

    def close_execl(self, filename=None):
        """
        Close one Excel file and drop its cached objects.

        Args:
            filename: Name of the file to close, with or without the
                ``.xlsx`` suffix. If None or not open, nothing happens.
        """
        if not isinstance(filename, str):
            return

        if filename not in self.wb_cache:
            # ``save_item`` stores workbooks under the suffixed name, so also
            # accept the un-suffixed name the caller passed to it.
            filename = self._normalize_filename(filename)
            if filename not in self.wb_cache:
                return

        logger.info(f'Closing Execl: {filename}')

        wb = self.wb_cache.pop(filename)
        try:
            if wb:
                wb.close()
        finally:
            # Drop per-sheet state even if closing failed, so no worksheet of
            # a workbook that is no longer cached can be handed out again.
            for key in [k for k in self.ws_cache if k[0] == filename]:
                self.ws_cache.pop(key, None)
                self.y_cache.pop(key, None)
                self.fields_cache.pop(key, None)

    def close(self):
        """
        Close every workbook currently held in ``wb_cache``.

        All files are attempted even if one fails to close; the first error
        encountered is re-raised after the others have been processed.
        """
        first_error = None
        # Iterate over a snapshot: ``close_execl`` mutates the cache.
        for filename in list(self.wb_cache):
            try:
                self.close_execl(filename)
            except Exception as e:
                logger.exception(f'Close Execl Error, filename:{filename}, errMsg: {e}')
                if first_error is None:
                    first_error = e

        if first_error is not None:
            raise first_error
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
class ExcelPipeline(ExcelSinkMixin):
    """
    Item pipeline that stores scraped items in Excel files.

    An item opts in by carrying an ``'__excel__'`` key whose value is a dict
    of keyword arguments for ``save_item`` (inherited from ``ExcelSinkMixin``),
    e.g. ``filename``, ``sheet``, ``img_fields`` or ``img_size``. The key is
    removed from the item before it is written.
    """

    def __init__(self, settings):
        """
        Set up the pipeline.

        Args:
            settings: The AioScrapy settings object. It is accepted for
                compatibility with the pipeline interface but not read here.
        """
        # Serializes the coroutines that write to the shared workbooks.
        self.lock = asyncio.Lock()

    @classmethod
    def from_settings(cls, settings):
        """
        Build a pipeline instance from the given settings.

        Args:
            settings: The AioScrapy settings object.

        Returns:
            A new instance of this class.
        """
        return cls(settings)

    async def process_item(self, item, spider):
        """
        Write ``item`` to an Excel file when it carries ``'__excel__'`` options.

        Args:
            item: The item to process. Its ``'__excel__'`` entry, if any, is
                popped and used as keyword arguments for ``save_item``.
            spider: The spider that produced the item; its ``name`` is the
                default filename.

        Returns:
            The item, without its ``'__excel__'`` entry.
        """
        excel_kwargs = item.pop('__excel__', None)

        if excel_kwargs:
            # Fall back to the spider name when no filename was supplied.
            excel_kwargs.setdefault('filename', spider.name)

            # Only one coroutine at a time may write.
            async with self.lock:
                await self.save_item(item, **excel_kwargs)
        else:
            # Missing or empty options: the item is passed on, not stored.
            logger.warning(f"item Missing key __excel__, not stored")

        return item

    async def close_spider(self, spider):
        """
        Close every open Excel file.

        Args:
            spider: The spider being closed (unused).
        """
        self.close()
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
# Manual smoke test for the Excel pipeline.
if __name__ == '__main__':
    # Pushes a single sample item through an ExcelPipeline using a stand-in
    # spider, then closes the pipeline.

    class TestSpider:
        """Stand-in spider that only provides a ``name`` attribute."""
        name = 'TestSpider'


    async def test():
        """Process one sample item and close the pipeline."""
        pipeline = ExcelPipeline({})

        sample_item = {
            'title': 'tttt',
            'img': '//www.baidu.com/img/flexible/logo/pc/result.png',
            '__excel__': {
                'sheet': 'sheet1',
                # Uncomment these lines to try the other options:
                # 'filename': 'test',
                # 'img_fields': ['img'],
                # 'img_size': (100, 500)
            }
        }
        await pipeline.process_item(sample_item, TestSpider())

        await pipeline.close_spider(None)


    asyncio.run(test())
|