aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/libs/pipelines/excel.py (new file)
@@ -0,0 +1,545 @@
+ """
+ Excel Pipeline for AioScrapy
+
+ This module provides a pipeline for storing scraped items in Excel files.
+ It includes a mixin class for Excel file handling and a pipeline class that
+ uses the mixin to process items and save them to Excel files. It supports
+ formatting dates and embedding images.
+ """
+
+ import asyncio
+ import math
+ from io import BytesIO
+ from typing import Tuple, Optional
+
+ import requests
+ import xlsxwriter
+ from PIL import Image, ImageFile
+
+ from aioscrapy.utils.log import logger
+
+ try:
+     # Use LANCZOS resampling filter for PIL 9.1.0 and above
+     resample = Image.LANCZOS
+ except:
+     # Fall back to ANTIALIAS for older PIL versions
+     resample = Image.ANTIALIAS
+ # Allow loading truncated images
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+
+ class ExcelSinkMixin:
+     """
+     Mixin class for Excel file handling.
+
+     This mixin provides methods for saving items to Excel files, managing Excel workbooks
+     and worksheets, and handling special data types like dates and images. It can be used
+     by any class that needs to write data to Excel files.
+     """
+     # Dictionary to store worksheet objects by filename+sheet
+     ws_cache = {}
+
+     # Dictionary to store workbook objects by filename
+     wb_cache = {}
+
+     # Dictionary to store field lists by filename+sheet
+     fields_cache = {}
+
+     # Dictionary to store current row positions by filename+sheet
+     y_cache = {}
+
+     @staticmethod
+     async def deal_img(url: str, img_size: Optional[Tuple[int, int]]) -> Optional[BytesIO]:
+         """
+         Download and process an image from a URL.
+
+         This method downloads an image from the given URL, optionally resizes it
+         to the specified dimensions while maintaining aspect ratio, and returns
+         it as a BytesIO object that can be embedded in an Excel file.
+
+         Args:
+             url: The URL of the image to download.
+             img_size: Optional tuple of (width, height) to resize the image to.
+                 If provided, the image will be resized to fit within these dimensions
+                 while maintaining aspect ratio.
+
+         Returns:
+             BytesIO: A BytesIO object containing the processed image,
+                 or None if the image could not be downloaded or processed.
+         """
+         # Add https: prefix if URL starts with //
+         if url.startswith('//'):
+             url = 'https:' + url
+
+         # Download the image
+         try:
+             img_bytes = requests.get(url).content
+         except Exception as e:
+             logger.error(f"download img error: {e}")
+             return None
+
+         # Open the image using PIL
+         im = Image.open(BytesIO(img_bytes))
+         im_format = im.format
+
+         # Resize the image if a size is specified
+         if img_size:
+             # Calculate scaling factor to maintain aspect ratio
+             temp = max(im.size[0] / img_size[0], im.size[1] / img_size[1])
+             img_size = (math.ceil(im.size[0] / temp), math.ceil(im.size[1] / temp))
+
+             # Resize and convert to palette mode to reduce file size
+             im = im.resize(img_size, resample).convert('P')
+
+         # Save the processed image to a BytesIO object
+         result = BytesIO()
+         im.save(result, format=im_format)
+
+         # Reset the position to the beginning of the BytesIO object
+         result.seek(0)
+
+         return result
+
+     async def save_item(
+             self,
+             item: dict,
+             *,
+             filename: Optional[str] = None,
+             date_fields: Optional[list] = None,
+             date_format: str = 'yyyy-mm-dd HH:MM:SS',
+             img_fields: Optional[list] = None,
+             img_size: Optional[Tuple[int, int]] = None,
+             **options
+     ):
+         """
+         Save an item to an Excel file.
+
+         This method writes a dictionary item as a row in an Excel file. It handles
+         special formatting for date fields and can embed images from URLs. The first row
+         of the Excel file will contain the keys of the first item saved to the file.
+
+         Args:
+             item: The dictionary item to save.
+             filename: The name of the Excel file to save to.
+                 If not provided, an assertion error will be raised.
+             date_fields: Optional list of field names that should be formatted as dates.
+             date_format: The Excel date format to use for date fields.
+                 Defaults to 'yyyy-mm-dd HH:MM:SS'.
+             img_fields: Optional list of field names that contain image URLs.
+             img_size: Optional tuple of (width, height) to resize images to.
+             **options: Additional options to pass to the Excel workbook.
+
+         Raises:
+             AssertionError: If filename is None.
+             Exception: If there is an error writing to the Excel file.
+         """
+         # Ensure filename is provided
+         assert filename is not None, "please pass the filename parameter"
+
+         # Add .xlsx extension if not present
+         if '.xlsx' not in filename:
+             filename = filename + '.xlsx'
+
+         try:
+             # Get or create workbook, worksheet, fields list, and current row
+             wb, ws, fields, y = self._get_write_class(filename, item, **options)
+
+             # Create cell formats for alternating column colors
+             bold_format_1 = wb.add_format({'align': 'left', 'border': 1, 'valign': 'vcenter'})
+             bold_format_2 = wb.add_format({'align': 'left', 'border': 1, 'valign': 'vcenter', 'fg_color': '#D0D3D4'})
+
+             # Process each field in the item
+             for x, field in enumerate(fields):
+                 # Alternate cell colors by column index
+                 if x % 2 == 0:
+                     bold_format = bold_format_1
+                 else:
+                     bold_format = bold_format_2
+
+                 # Handle date fields
+                 if date_fields is not None and field in date_fields:
+                     ws.write_datetime(y, x, item.get(field), wb.add_format({'num_format': date_format}))
+
+                 # Handle image fields
+                 elif img_fields is not None and field in img_fields:
+                     # Set column width if image size is specified
+                     img_size and ws.set_column_pixels(x, x, width=math.ceil(img_size[0]))
+
+                     # Get image URL from item
+                     url = item.get(field)
+
+                     # Download and process the image
+                     img_bytes = await self.deal_img(url, img_size)
+
+                     # Insert the image or fall back to writing the URL if insertion fails
+                     if img_bytes is None or ws.insert_image(y, x, '', {'image_data': img_bytes}) == -1:
+                         ws.write(y, x, url, bold_format)
+
+                 # Handle regular fields
+                 else:
+                     ws.write(y, x, item.get(field), bold_format)
+
+             # Set row and column dimensions if image size is specified
+             if img_size is not None:
+                 ws.set_column_pixels(0, len(fields), width=math.ceil(img_size[0]))
+                 ws.set_row_pixels(y, height=math.ceil(img_size[1]))
+
+         except Exception as e:
+             # Log any errors that occur
+             logger.exception(f'Save Excel Error, filename:{filename}, item:{item}, errMsg: {e}')
+
+     def _get_write_class(self, filename, item, sheet='sheet1', **options):
+         """
+         Get or create workbook, worksheet, fields list, and current row for a file.
+
+         This method returns existing Excel objects for the given filename and sheet
+         if they exist, or creates new ones if not. When creating a new worksheet,
+         it also writes the header row using the keys of the provided item.
+
+         Args:
+             filename: The name of the Excel file.
+             item: The dictionary item whose keys will be used as headers.
+             sheet: The name of the worksheet to use.
+                 Defaults to 'sheet1'.
+             **options: Additional options to pass to the Excel workbook.
+
+         Returns:
+             tuple: A tuple containing (workbook, worksheet, fields, row_number).
+         """
+         # Create a unique key for the worksheet cache
+         filename_sheet = filename + sheet
+
+         # If this worksheet doesn't exist yet, create it
+         if self.ws_cache.get(filename_sheet) is None:
+             # If the workbook doesn't exist yet, create it
+             if self.wb_cache.get(filename) is None:
+                 logger.info(f'Create Excel: {filename}')
+                 wb = xlsxwriter.Workbook(filename, options=options)
+                 self.wb_cache[filename] = wb
+             else:
+                 wb = self.wb_cache[filename]
+
+             # Create a new worksheet
+             ws = wb.add_worksheet(sheet)
+
+             # Create a format for the header row
+             bold_format = wb.add_format(
+                 {'bold': True, 'font_size': 12, 'border': 1, 'align': 'center', 'valign': 'vcenter'})
+
+             # Get the field names from the item
+             fields = list(item.keys())
+
+             # Write the header row
+             ws.write_row('A1', fields, cell_format=bold_format)
+             ws.set_row(0, height=30)
+
+             # Store the worksheet, fields, and row counter in the caches
+             self.fields_cache[filename_sheet] = fields
+             self.ws_cache[filename_sheet] = ws
+             self.y_cache[filename_sheet] = 0
+
+         # Increment the row counter for this worksheet
+         self.y_cache[filename_sheet] += 1
+
+         # Return the workbook, worksheet, fields, and current row
+         return self.wb_cache[filename], \
+             self.ws_cache[filename_sheet], \
+             self.fields_cache[filename_sheet], \
+             self.y_cache[filename_sheet]
+
+     def close_execl(self, filename=None):
+         """
+         Close a specific Excel file.
+
+         This method closes the workbook for a specific Excel file and removes
+         all related objects from the caches.
+
+         Args:
+             filename: The name of the Excel file to close.
+                 If None or not found in the cache, nothing happens.
+         """
+         # If the filename is not in the cache, return
+         if filename not in self.wb_cache:
+             return
+
+         # Log that we're closing the file
+         logger.info(f'Closing Excel: {filename}')
+
+         # Close the workbook if it exists
+         if wb := self.wb_cache.pop(filename):
+             wb.close()
+
+         # Remove all worksheets, row counters, and fields lists for this file
+         for filename_sheet in list(self.ws_cache.keys()):
+             if not filename_sheet.startswith(filename):
+                 continue
+             self.ws_cache.pop(filename_sheet, None)
+             self.y_cache.pop(filename_sheet, None)
+             self.fields_cache.pop(filename_sheet, None)
+
+     def close(self):
+         """
+         Close all open Excel files.
+
+         This method closes all workbooks for all Excel files that have been
+         opened by this instance.
+         """
+         # Make a copy of the keys to avoid modifying the dictionary during iteration
+         for filename in list(self.wb_cache.keys()):
+             self.close_execl(filename)
+
+
+ class ExcelPipeline(ExcelSinkMixin):
+     """
+     Pipeline for storing scraped items in Excel files.
+
+     This pipeline uses the ExcelSinkMixin to save items to Excel files. It processes
+     items that have a '__excel__' key, which contains parameters for the Excel file
+     such as the filename, sheet name, and image settings.
+
+     Note: Some internal names (such as the close_execl method and the execl_kw
+     variable) keep the older "Execl" spelling for backward compatibility.
+     """
+
+     def __init__(self, settings):
+         """
+         Initialize the Excel pipeline.
+
+         Args:
+             settings: The AioScrapy settings object.
+                 Not used in the current implementation, but included for
+                 compatibility with the pipeline interface.
+         """
+         # Create a lock to serialize concurrent coroutine access to Excel files
+         self.lock = asyncio.Lock()
+
+     @classmethod
+     def from_settings(cls, settings):
+         """
+         Create an ExcelPipeline instance from settings.
+
+         This is the factory method used by AioScrapy to create pipeline instances.
+
+         Args:
+             settings: The AioScrapy settings object.
+
+         Returns:
+             ExcelPipeline: A new ExcelPipeline instance.
+         """
+         return cls(settings)
+
+     async def process_item(self, item, spider):
+         """
+         Process an item and save it to an Excel file if it has a '__excel__' key.
+
+         This method checks if the item has a '__excel__' key. If it does, it uses
+         the parameters in that key to save the item to an Excel file. If not, it
+         logs a warning and returns the item unchanged.
+
+         Args:
+             item: The item to process.
+             spider: The spider that generated the item.
+
+         Returns:
+             dict: The processed item.
+         """
+         # Extract Excel parameters from the item
+         execl_kw: Optional[dict] = item.pop('__excel__', None)
+
+         # If no Excel parameters, log a warning and return the item
+         if not execl_kw:
+             logger.warning("Item is missing the '__excel__' key; not stored")
+             return item
+
+         # Use the spider name as the default filename
+         execl_kw.setdefault('filename', spider.name)
+
+         # Use a lock to serialize concurrent coroutine access to Excel files
+         async with self.lock:
+             # Save the item to an Excel file
+             await self.save_item(item, **execl_kw)
+
+         return item
+
+     async def close_spider(self, spider):
+         """
+         Close all open Excel files when the spider is closed.
+
+         This method is called by AioScrapy when a spider is closed. It ensures
+         that all Excel files opened by this pipeline are properly closed.
+
+         Args:
+             spider: The spider that was closed.
+         """
+         # Close all open Excel files
+         self.close()
+
+
+ # Test code for the Excel pipeline
+ if __name__ == '__main__':
+     """
+     Test code to demonstrate the usage of the ExcelPipeline.
+
+     This code creates a simple test spider and pipeline, processes a test item,
+     and then closes the pipeline.
+     """
+
+     class TestSpider:
+         """
+         Simple test spider class with a name attribute.
+         """
+         name = 'TestSpider'
+
+
+     async def test():
+         """
+         Async test function to demonstrate the ExcelPipeline.
+         """
+         # Create a new Excel pipeline
+         p = ExcelPipeline({})
+
+         # Process a test item with Excel parameters
+         await p.process_item({
+             'title': 'tttt',
+             'img': '//www.baidu.com/img/flexible/logo/pc/result.png',
+             '__excel__': {
+                 'sheet': 'sheet1',
+                 # Uncomment these lines to test additional features
+                 # 'filename': 'test',
+                 # 'img_fields': ['img'],
+                 # 'img_size': (100, 500)
+             }
+         }, TestSpider())
+
+         # Close the pipeline
+         await p.close_spider(None)
+
+
+     # Run the test function
+     asyncio.run(test())
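
A note on usage: ExcelPipeline consumes plain dict items whose '__excel__' entry carries the keyword arguments of save_item (filename, sheet, date_fields, date_format, img_fields, img_size, plus extra xlsxwriter workbook options). A minimal sketch of that item contract follows; the field names, values and URL below are illustrative assumptions, not taken from the package:

# Sketch of the item contract ExcelPipeline expects; names and values are illustrative.
# process_item pops '__excel__' and forwards it to save_item as keyword arguments;
# the keys of the first item saved become the worksheet's header row.
from datetime import datetime

item = {
    'title': 'example title',                   # plain cell, written with ws.write
    'published': datetime(2024, 1, 1, 12, 0),   # written with ws.write_datetime
    'cover': '//www.example.com/cover.png',     # downloaded and embedded as an image
    '__excel__': {
        'filename': 'articles',                 # optional; defaults to spider.name
        'sheet': 'sheet1',
        'date_fields': ['published'],
        'date_format': 'yyyy-mm-dd HH:MM:SS',
        'img_fields': ['cover'],
        'img_size': (120, 120),
    },
}

In a spider such a dict would simply be yielded from a callback, with the pipeline enabled in the project settings (aio-scrapy keeping Scrapy's ITEM_PIPELINES convention is an assumption, not shown in this diff); the pipeline's asyncio.Lock then serializes writes so concurrent callbacks can share one workbook.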
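
The ExcelSinkMixin docstring also states the mixin can be reused by any class that needs to write Excel files. A minimal sketch of that pattern, using only the save_item/close API shown in this diff (the ReportWriter class and the 'report' filename are made-up names):

import asyncio

from aioscrapy.libs.pipelines.excel import ExcelSinkMixin


class ReportWriter(ExcelSinkMixin):
    """Hypothetical helper that reuses the mixin's workbook and worksheet caching."""


async def main():
    writer = ReportWriter()
    # Each call appends one row; the keys of the first item become the header row.
    await writer.save_item({'name': 'a', 'count': 1}, filename='report')
    await writer.save_item({'name': 'b', 'count': 2}, filename='report')
    writer.close()  # closes report.xlsx and clears the caches


asyncio.run(main())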