aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/core/scraper.py
CHANGED
@@ -1,12 +1,41 @@
-"""
-
+"""
+Scraper Module
+抓取器模块
+
+This module implements the Scraper component which parses responses and
+extracts information from them. The Scraper is the central component that
+coordinates the processing of downloaded content and manages the flow of
+extracted data through the system.
+此模块实现了Scraper组件,用于解析响应并从中提取信息。Scraper是协调下载内容处理
+并管理提取数据在系统中流动的中央组件。
+
+The Scraper is responsible for:
+Scraper负责:
+1. Processing downloaded responses through spider callbacks
+   通过爬虫回调处理下载的响应
+2. Handling spider output (requests and items)
+   处理爬虫输出(请求和项目)
+3. Processing items through the item pipeline
+   通过项目管道处理项目
+4. Handling errors during the scraping process
+   处理抓取过程中的错误
+5. Managing memory usage and concurrency
+   管理内存使用和并发性
+
+The module contains two main classes:
+模块包含两个主要类:
+1. Slot: Tracks active requests and memory usage for a spider
+   跟踪爬虫的活动请求和内存使用情况
+2. Scraper: Processes responses and extracts items
+   处理响应并提取项目
+"""
 import asyncio
 from typing import Any, AsyncGenerator, Set, Union, Optional
 
 import aioscrapy
 from aioscrapy import signals, Spider
 from aioscrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
-from aioscrapy.http import
+from aioscrapy.http import WebDriverResponse
 from aioscrapy.http import Request, Response
 from aioscrapy.logformatter import LogFormatter
 from aioscrapy.middleware import ItemPipelineManager, SpiderMiddlewareManager
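The only import-level change in this hunk is that scraper.py now takes WebDriverResponse from aioscrapy.http, which, per the file list above, stands in for the removed aioscrapy/http/response/playwright.py module. A minimal sketch of downstream code that branches on that type; only the import and the release() behaviour shown later in this diff come from the package, the helper itself is illustrative:

    from aioscrapy.http import WebDriverResponse

    def is_browser_response(result) -> bool:
        # True for responses produced by the webdriver (playwright/drissionpage)
        # handlers; these hold driver resources and are released in the finally
        # block added later in this diff.
        return isinstance(result, WebDriverResponse)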
@@ -17,39 +46,136 @@ from aioscrapy.utils.tools import call_helper, create_task
 
 
 class Slot:
-    """
+    """
+    Scraper slot (one per running spider).
+    抓取器槽(每个运行的爬虫一个)。
 
-
+    This class keeps track of active requests and memory usage
+    to control the scraper's memory footprint.
+    此类跟踪活动请求和内存使用情况,以控制抓取器的内存占用。
+    """
+
+    MIN_RESPONSE_SIZE = 1024  # Minimum size in bytes to account for a response
+    # 计算响应的最小字节大小
 
     def __init__(self, max_active_size: int = 5000000):
-
-
-
-
+        """
+        Initialize a scraper slot.
+        初始化抓取器槽。
+
+        Args:
+            max_active_size: Maximum allowed size in bytes for active responses.
+                活动响应允许的最大字节大小。
+                Default is 5MB.
+                默认为5MB。
+        """
+        self.max_active_size = max_active_size  # Maximum memory allowed for active responses
+        # 活动响应允许的最大内存
+        self.active: Set[Request] = set()  # Set of active requests being processed
+        # 正在处理的活动请求集合
+        self.active_size: int = 0  # Current memory usage of active responses
+        # 活动响应的当前内存使用量
+        self.itemproc_size: int = 0  # Number of items being processed by the item pipeline
+        # 项目管道正在处理的项目数量
 
     def add_response_request(self, result: Union[Response, BaseException], request: Request) -> None:
+        """
+        Add a request and its result to the active set.
+        将请求及其结果添加到活动集合中。
+
+        This method tracks the request and updates the memory usage counter
+        based on the size of the response.
+        此方法跟踪请求并根据响应的大小更新内存使用计数器。
+
+        Args:
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+            request: The request being processed.
+                正在处理的请求。
+        """
         self.active.add(request)
         if isinstance(result, Response):
+            # Account for the response body size, with a minimum threshold
+            # 计算响应体大小,设有最小阈值
             self.active_size += max(len(result.body), self.MIN_RESPONSE_SIZE)
         else:
+            # For exceptions, use the minimum size
+            # 对于异常,使用最小大小
             self.active_size += self.MIN_RESPONSE_SIZE
 
     def finish_response(self, request: Request, result: Union[Response, BaseException]) -> None:
+        """
+        Remove a request and its result from the active set.
+        从活动集合中移除请求及其结果。
+
+        This method is called when processing of a request is complete.
+        It updates the memory usage counter and cleans up resources.
+        当请求处理完成时调用此方法。它更新内存使用计数器并清理资源。
+
+        Args:
+            request: The request that has been processed.
+                已处理的请求。
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+        """
         self.active.remove(request)
         if isinstance(result, Response):
+            # Decrease the memory counter by the response size
+            # 按响应大小减少内存计数器
             self.active_size -= max(len(result.body), self.MIN_RESPONSE_SIZE)
+            # Clear cached selector to free memory
+            # 清除缓存的选择器以释放内存
             result._cached_selector = None
         else:
+            # For exceptions, decrease by the minimum size
+            # 对于异常,按最小大小减少
             self.active_size -= self.MIN_RESPONSE_SIZE
 
     def is_idle(self) -> bool:
+        """
+        Check if the slot is idle (no active requests).
+        检查槽是否空闲(没有活动请求)。
+
+        Returns:
+            bool: True if there are no active requests, False otherwise.
+                如果没有活动请求,则为True,否则为False。
+        """
         return not self.active
 
     def needs_backout(self) -> bool:
+        """
+        Check if the slot needs to back out (stop accepting new requests).
+        检查槽是否需要退出(停止接受新请求)。
+
+        This method determines if the memory usage has exceeded the maximum
+        allowed size, in which case the scraper should stop accepting new
+        requests until some current ones complete.
+        此方法确定内存使用是否已超过允许的最大大小,在这种情况下,
+        抓取器应停止接受新请求,直到一些当前请求完成。
+
+        Returns:
+            bool: True if memory usage exceeds the maximum, False otherwise.
+                如果内存使用超过最大值,则为True,否则为False。
+        """
         return self.active_size > self.max_active_size
 
 
 class Scraper:
+    """
+    The Scraper processes downloaded responses and extracts items.
+    Scraper处理下载的响应并提取项目。
+
+    This class is responsible for:
+    此类负责:
+    1. Processing responses through spider callbacks
+       通过爬虫回调处理响应
+    2. Handling spider output (requests and items)
+       处理爬虫输出(请求和项目)
+    3. Processing items through the item pipeline
+       通过项目管道处理项目
+    4. Managing memory usage and concurrency
+       管理内存使用和并发
+    """
 
     def __init__(
             self,
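The Slot accounting documented in this hunk is small enough to restate on its own. A self-contained sketch of the same arithmetic (not the real class, just the bookkeeping the docstrings describe): each tracked response costs max(len(body), MIN_RESPONSE_SIZE) bytes of active_size, and needs_backout() trips once the total exceeds the 5 MB default budget:

    MIN_RESPONSE_SIZE = 1024  # same per-response floor as the class above

    class TinySlot:
        def __init__(self, max_active_size: int = 5_000_000):  # 5 MB default, as documented
            self.max_active_size = max_active_size
            self.active_size = 0

        def track(self, body: bytes) -> None:
            # Mirrors add_response_request: tiny bodies still cost 1 KiB.
            self.active_size += max(len(body), MIN_RESPONSE_SIZE)

        def untrack(self, body: bytes) -> None:
            # Mirrors finish_response.
            self.active_size -= max(len(body), MIN_RESPONSE_SIZE)

        def needs_backout(self) -> bool:
            return self.active_size > self.max_active_size

    slot = TinySlot()
    slot.track(b"x" * 3_000_000)
    slot.track(b"x" * 2_500_000)
    assert slot.needs_backout()        # 5.5 MB of live responses exceeds the 5 MB budget
    slot.untrack(b"x" * 3_000_000)
    assert not slot.needs_backout()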
@@ -58,146 +184,428 @@ class Scraper:
             spidermw: SpiderMiddlewareManager,
             itemproc: ItemPipelineManager,
     ):
+        """
+        Initialize the Scraper.
+        初始化Scraper。
+
+        Args:
+            crawler: The crawler instance that this scraper belongs to.
+                此抓取器所属的爬虫实例。
+            slot: The slot for tracking active requests and memory usage.
+                用于跟踪活动请求和内存使用的槽。
+            spidermw: The spider middleware manager.
+                爬虫中间件管理器。
+            itemproc: The item pipeline manager.
+                项目管道管理器。
+        """
         self.crawler = crawler
         self.spider: Spider = crawler.spider
         self.signals: SignalManager = self.crawler.signals
         self.logformatter: LogFormatter = self.crawler.logformatter
 
-        self.slot = slot
-
-        self.
-
-        self.
+        self.slot = slot  # Slot for tracking active requests and memory
+        # 用于跟踪活动请求和内存的槽
+        self.spidermw = spidermw  # Spider middleware manager
+        # 爬虫中间件管理器
+        self.itemproc = itemproc  # Item pipeline manager
+        # 项目管道管理器
+
+        self.finish: bool = False  # Flag to indicate if scraper is shutting down
+        # 指示抓取器是否正在关闭的标志
+        # Semaphore to limit concurrent parsing
+        # 用于限制并发解析的信号量
         self.concurrent_parser = asyncio.Semaphore(crawler.settings.getint('CONCURRENT_PARSER', 1))
 
     @classmethod
     async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "Scraper":
+        """
+        Create a Scraper instance from a crawler.
+        从爬虫创建Scraper实例。
+
+        This factory method creates a new Scraper instance with all the
+        necessary components initialized from the crawler.
+        此工厂方法创建一个新的Scraper实例,所有必要的组件都从爬虫初始化。
+
+        Args:
+            crawler: The crawler instance that will use this scraper.
+                将使用此抓取器的爬虫实例。
+
+        Returns:
+            Scraper: A new scraper instance.
+                一个新的抓取器实例。
+        """
+        # Create the scraper instance with all required components
+        # 创建具有所有必需组件的抓取器实例
         instance: "Scraper" = cls(
             crawler,
+            # Create a slot with the maximum active size from settings
+            # 使用设置中的最大活动大小创建槽
             Slot(crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE')),
+            # Initialize the spider middleware manager
+            # 初始化爬虫中间件管理器
             await call_helper(SpiderMiddlewareManager.from_crawler, crawler),
+            # Initialize the item pipeline manager
+            # 初始化项目管道管理器
             await call_helper(load_object(crawler.settings['ITEM_PROCESSOR']).from_crawler, crawler)
         )
+        # Open the item processor for the spider
+        # 为爬虫打开项目处理器
         await instance.itemproc.open_spider(crawler.spider)
         return instance
 
     async def close(self) -> None:
-        """
+        """
+        Close a spider being scraped and release its resources.
+        关闭正在抓取的爬虫并释放其资源。
+
+        This method closes the item processor for the spider and
+        marks the scraper as finished.
+        此方法关闭爬虫的项目处理器并将抓取器标记为已完成。
+        """
         await self.itemproc.close_spider(self.spider)
         self.finish = True
 
     def is_idle(self) -> bool:
-        """
+        """
+        Check if the scraper is idle (no active requests).
+        检查抓取器是否空闲(没有活动请求)。
+
+        Returns:
+            bool: True if there aren't any more requests to process, False otherwise.
+                如果没有更多要处理的请求,则为True,否则为False。
+        """
         return self.slot.is_idle()
 
     def needs_backout(self) -> bool:
+        """
+        Check if the scraper needs to back out (stop accepting new requests).
+        检查抓取器是否需要退出(停止接受新请求)。
+
+        This method delegates to the slot to determine if memory usage
+        has exceeded the maximum allowed size.
+        此方法委托给槽来确定内存使用是否已超过允许的最大大小。
+
+        Returns:
+            bool: True if memory usage exceeds the maximum, False otherwise.
+                如果内存使用超过最大值,则为True,否则为False。
+        """
         return self.slot.needs_backout()
 
     async def enqueue_scrape(self, result: Union[Response, BaseException], request: Request) -> None:
+        """
+        Enqueue a response or exception for scraping.
+        将响应或异常排队等待抓取。
+
+        This method adds the request and result to the active set in the slot
+        and starts the scraping process.
+        此方法将请求和结果添加到槽中的活动集合,并开始抓取过程。
+
+        Args:
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+            request: The request that was processed.
+                已处理的请求。
+        """
         # Cache the results in the slot
+        # 在槽中缓存结果
         self.slot.add_response_request(result, request)
         await self._scrape(result, request)
 
     async def _scrape(self, result: Union[Response, BaseException], request: Request) -> None:
-        """
+        """
+        Handle the downloaded response or failure through the spider callback/errback.
+        通过爬虫回调/错误回调处理下载的响应或失败。
+
+        This method processes the response or exception through the appropriate
+        spider callback or errback, and handles any output or errors.
+        此方法通过适当的爬虫回调或错误回调处理响应或异常,并处理任何输出或错误。
+
+        Args:
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+            request: The request that was processed.
+                已处理的请求。
+        """
+        # Use semaphore to limit concurrent parsing
+        # 使用信号量限制并发解析
         async with self.concurrent_parser:
             try:
+                # Validate the result type
+                # 验证结果类型
                 if not isinstance(result, (Response, BaseException)):
                     raise TypeError(f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}")
                 try:
+                    # Process the result through spider middleware and callbacks
+                    # 通过爬虫中间件和回调处理结果
                     output = await self._scrape2(result, request)  # returns spider's processed output
                 except BaseException as e:
+                    # Handle any errors during processing
+                    # 处理处理过程中的任何错误
                     await self.handle_spider_error(e, request, result)
                 else:
+                    # Handle the output from the spider
+                    # 处理爬虫的输出
                     await self.handle_spider_output(output, request, result)
             except BaseException as e:
+                # Handle any errors that weren't caught earlier
+                # 处理之前未捕获的任何错误
                 await self.handle_spider_error(e, request, result)
             finally:
-                #
+                # Update dupefilter with parse status
+                # 使用解析状态更新重复过滤器
                 self.spider.dupefilter and \
                     not request.dont_filter and \
                     await self.spider.dupefilter.done(request, done_type="parse_ok" if getattr(request, "parse_ok", False) else "parse_err")
 
-                if
+                # Release playwright/drissionpage response resources if applicable
+                # 如果适用,释放playwright/drissionpage等响应资源
+                if isinstance(result, WebDriverResponse):
                     await result.release()
 
                 # Delete the cache result from the slot
+                # 从槽中删除缓存结果
                 self.slot.finish_response(request, result)
 
     async def _scrape2(self, result: Union[Response, BaseException], request: Request) -> Optional[AsyncGenerator]:
-        """
-
+        """
+        Handle the different cases of request's result being a Response or an Exception.
+        处理请求结果为Response或Exception的不同情况。
+
+        This method routes the result to the appropriate processing path based on
+        whether it's a successful response or an exception.
+        此方法根据结果是成功的响应还是异常,将结果路由到适当的处理路径。
+
+        Args:
+            result: The response or exception from processing the request.
+                处理请求的响应或异常。
+            request: The request that was processed.
+                已处理的请求。
+
+        Returns:
+            Optional[AsyncGenerator]: The output from processing the result, or None.
+                处理结果的输出,或None。
+        """
         if isinstance(result, Response):
+            # For responses, pass through spider middleware
+            # 对于响应,通过爬虫中间件传递
             # Throw the response to the middleware of the spider,
             # and the processing results will be processed to the self.call_spider
+            # 将响应抛给爬虫的中间件,处理结果将被处理到self.call_spider
             return await self.spidermw.scrape_response(self.call_spider, result, request, self.spider)
         else:
             try:
+                # For exceptions, call spider directly (bypass middleware)
+                # 对于异常,直接调用爬虫(绕过中间件)
                 # Processing Exception of download and download's middleware
+                # 处理下载和下载中间件的异常
                 return await self.call_spider(result, request)
             except BaseException as e:
+                # Log any errors that occur during exception handling
+                # 记录异常处理期间发生的任何错误
                 await self._log_download_errors(e, result, request)
 
     async def call_spider(self, result: Union[Response, BaseException], request: Request) -> Optional[AsyncGenerator]:
+        """
+        Call the appropriate spider method to handle a result.
+        调用适当的爬虫方法来处理结果。
+
+        This method calls either the callback or errback method of the spider
+        based on whether the result is a response or an exception.
+        此方法根据结果是响应还是异常,调用爬虫的回调或错误回调方法。
+
+        Args:
+            result: The response or exception to process.
+                要处理的响应或异常。
+            request: The request associated with the result.
+                与结果关联的请求。
+
+        Returns:
+            Optional[AsyncGenerator]: The output from the spider method, or None.
+                爬虫方法的输出,或None。
+
+        Raises:
+            BaseException: If result is an exception and no errback is defined.
+                如果结果是异常且未定义错误回调。
+        """
         if isinstance(result, Response):
+            # For responses, call the callback method
+            # 对于响应,调用回调方法
             # throws Response to Spider's parse
+            # 将Response抛给爬虫的parse
             callback = request.callback or self.spider._parse
             return await call_helper(callback, result, **result.request.cb_kwargs)
         else:
+            # For exceptions, call the errback method if defined
+            # 对于异常,如果定义了错误回调方法,则调用它
             if request.errback is None:
+                # If no errback is defined, re-raise the exception
+                # 如果未定义错误回调,则重新引发异常
                 raise result
             # throws Exception of download and download's middleware to Spider's errback
+            # 将下载和下载中间件的异常抛给爬虫的errback
             return await call_helper(request.errback, result)
 
-    async def handle_spider_error(self, exc: BaseException, request: Request, response: Response) -> None:
+    async def handle_spider_error(self, exc: BaseException, request: Request, response: Union[Response, BaseException]) -> None:
+        """
+        Handle errors raised during spider callback processing.
+        处理爬虫回调处理期间引发的错误。
+
+        This method handles exceptions that occur during the processing of
+        responses by spider callbacks. It logs the error, sends the spider_error signal,
+        and updates error statistics.
+        此方法处理爬虫回调处理响应期间发生的异常。它记录错误、发送spider_error信号
+        并更新错误统计信息。
+
+        Args:
+            exc: The exception that was raised.
+                引发的异常。
+            request: The request being processed when the error occurred.
+                发生错误时正在处理的请求。
+            response: The response or exception being processed when the error occurred.
+                发生错误时正在处理的响应或异常。
+                This can be either a Response object or an Exception object in case
+                the error occurred during processing of an errback.
+                这可以是Response对象或Exception对象,以防错误发生在处理errback期间。
+        """
+        # Handle CloseSpider exceptions specially
+        # 特别处理CloseSpider异常
         if isinstance(exc, CloseSpider):
             create_task(self.crawler.engine.close_spider(self.spider, exc.reason or 'cancelled'))
             return
+
+        # Log the error
+        # 记录错误
         logger.exception(self.logformatter.spider_error(exc, request, response, self.spider))
+
+        # Send the spider_error signal
+        # 发送spider_error信号
         await self.signals.send_catch_log(
             signal=signals.spider_error,
             failure=exc, response=response,
             spider=self.spider
         )
+
+        # Update error statistics by exception type and total count
+        # 按异常类型和总计数更新错误统计信息
         self.crawler.stats.inc_value("spider_exceptions/%s" % exc.__class__.__name__, spider=self.spider)
         self.crawler.stats.inc_value("spider_exceptions", spider=self.spider)
 
-    async def handle_spider_output(self, result: AsyncGenerator, request: Request, response: Response) -> None:
-        """
+    async def handle_spider_output(self, result: Optional[AsyncGenerator], request: Request, response: Union[Response, BaseException]) -> None:
+        """
+        Process each Request/Item returned from the spider.
+        处理从爬虫返回的每个Request/Item。
+
+        This method iterates through the async generator returned by the spider
+        callback and processes each yielded item. It handles any exceptions that
+        occur during iteration and marks the request as successfully parsed or not.
+        此方法遍历爬虫回调返回的异步生成器,并处理每个产生的项目。
+        它处理迭代期间发生的任何异常,并将请求标记为成功解析或未成功解析。
+
+        Args:
+            result: The async generator returned by the spider callback, or None.
+                爬虫回调返回的异步生成器,或None。
+                If None, the method returns immediately.
+                如果为None,方法立即返回。
+            request: The request that was processed.
+                已处理的请求。
+            response: The response or exception that was processed.
+                已处理的响应或异常。
+                This can be either a Response object or an Exception object in case
+                the output came from an errback.
+                这可以是Response对象或Exception对象,以防输出来自errback。
+        """
         if not result:
             return
 
         parse_ok = True
         while True:
             try:
+                # Get the next item from the generator
+                # 从生成器获取下一个项目
                 output = await result.__anext__()
             except StopAsyncIteration:
+                # End of generator
+                # 生成器结束
                 break
             except Exception as e:
+                # Error during iteration
+                # 迭代期间出错
                 parse_ok = False
                 await self.handle_spider_error(e, request, response)
             else:
+                # Process the output item
+                # 处理输出项目
                 await self._process_spidermw_output(output, request, response)
 
+        # Mark the request as successfully parsed (or not) for dupefilter
+        # 将请求标记为成功解析(或未成功)以供重复过滤器使用
         self.spider.dupefilter and \
             not request.dont_filter and \
             setattr(request, "parse_ok", parse_ok)
 
-    async def _process_spidermw_output(self, output: Any, request: Request, response: Response) -> None:
-        """
-
+    async def _process_spidermw_output(self, output: Any, request: Request, response: Union[Response, BaseException]) -> None:
+        """
+        Process each Request/Item returned from the spider.
+        处理从爬虫返回的每个Request/Item。
+
+        This method handles different types of output from the spider:
+        此方法处理爬虫的不同类型的输出:
+
+        - Request: Schedule it for crawling
+          Request:安排它进行爬取
+        - dict: Process it through the item pipeline
+          dict:通过项目管道处理它
+        - None: Ignore it
+          None:忽略它
+        - Other types: Log an error
+          其他类型:记录错误
+
+        Args:
+            output: The output from the spider to process.
+                要处理的爬虫输出。
+                This can be a Request, a dict (item), None, or any other type.
+                这可以是Request、dict(项目)、None或任何其他类型。
+            request: The original request that generated this output.
+                生成此输出的原始请求。
+                This is used for logging and tracking purposes.
+                这用于日志记录和跟踪目的。
+            response: The response or exception that was processed.
+                已处理的响应或异常。
+                This can be either a Response object or an Exception object in case
+                the output came from an errback.
+                这可以是Response对象或Exception对象,以防输出来自errback。
+        """
         if isinstance(output, Request):
+            # Schedule new requests for crawling
+            # 安排新请求进行爬取
             await self.crawler.engine.crawl(request=output)
         elif isinstance(output, dict):
+            # Process items through the item pipeline
+            # 通过项目管道处理项目
             self.slot.itemproc_size += 1
-
-
-
-            await
+            try:
+                # Process the item through the pipeline
+                # 通过管道处理项目
                item = await self.itemproc.process_item(output, self.spider)
+                # Call the spider's process_item method if it exists
+                # 如果存在,调用爬虫的process_item方法
+                if process_item_method := getattr(self.spider, 'process_item', None):
+                    await call_helper(process_item_method, item)
+            except Exception as e:
+                # Handle exceptions during item processing
+                # 处理项目处理期间的异常
+                item = output
+                output = e
+            # Handle the processed item or exception
+            # 处理已处理的项目或异常
             await self._itemproc_finished(output, item, response)
         elif output is None:
+            # Ignore None outputs
+            # 忽略None输出
             pass
         else:
+            # Log an error for unexpected output types
+            # 记录意外输出类型的错误
             typename = type(output).__name__
             logger.error(
                 'Spider must return request, item, or None, got %(typename)r in %(request)s' % {'request': request,
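Two behavioural additions stand out in this hunk: parsing is throttled by an asyncio.Semaphore sized from the CONCURRENT_PARSER setting (default 1), and once the item pipeline returns, the scraper now calls an optional process_item coroutine on the spider itself. A minimal spider sketch using that hook; only the hook's name and the fact that it receives the pipeline's result come from the diff, the rest of the spider is illustrative:

    from aioscrapy import Spider

    class DemoSpider(Spider):
        name = "demo"
        start_urls = ["https://example.com"]

        async def parse(self, response):
            # Plain dicts are treated as items and routed through the item
            # pipeline by Scraper._process_spidermw_output.
            yield {"url": response.url, "status": response.status}

        async def process_item(self, item):
            # Called by the scraper with whatever the item pipeline returned.
            print("pipeline produced:", item)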
@@ -210,28 +618,96 @@ class Scraper:
             download_exception: BaseException,
             request: Request
     ) -> None:
-        """
+        """
+        Process and record download errors.
+        处理和记录下载错误。
+
+        This method logs download errors and re-raises spider exceptions
+        if they are different from the download exception. It's typically called
+        when an error occurs during the processing of an errback.
+        此方法记录下载错误,如果爬虫异常与下载异常不同,则重新引发爬虫异常。
+        它通常在处理errback期间发生错误时调用。
+
+        Args:
+            spider_exception: The exception raised during spider processing.
+                爬虫处理期间引发的异常。
+                This is the exception that occurred while processing
+                the download exception in the spider's errback.
+                这是在爬虫的errback中处理下载异常时发生的异常。
+            download_exception: The exception raised during download.
+                下载期间引发的异常。
+                This is the original exception that occurred during
+                the download process.
+                这是下载过程中发生的原始异常。
+            request: The request that caused the error.
+                导致错误的请求。
+                This is used for logging purposes and to provide context
+                about which request failed.
+                这用于日志记录目的,并提供有关哪个请求失败的上下文。
+
+        Raises:
+            BaseException: Re-raises spider_exception if it's different from download_exception.
+                如果spider_exception与download_exception不同,则重新引发spider_exception。
+                This ensures that new exceptions raised during errback processing
+                are properly propagated.
+                这确保在errback处理期间引发的新异常被正确传播。
+        """
+        # Log download errors (except IgnoreRequest which is not an error)
+        # 记录下载错误(除了IgnoreRequest,它不是错误)
         if isinstance(download_exception, BaseException) and not isinstance(download_exception, IgnoreRequest):
             logger.exception(self.logformatter.download_error(download_exception, request, self.spider))
 
+        # Re-raise spider exceptions if they're different from the download exception
+        # 如果爬虫异常与下载异常不同,则重新引发爬虫异常
         if spider_exception is not download_exception:
             raise spider_exception
 
     async def _itemproc_finished(self, output: Any, item: Any, response: Response) -> None:
-        """
+        """
+        Handle the result of item processing.
+        处理项目处理的结果。
+
+        This method is called when the item pipeline has finished processing an item.
+        It handles different outcomes based on the result:
+        当项目管道完成处理项目时调用此方法。它根据结果处理不同的结果:
+
+        - If output is a DropItem exception: Log it and send item_dropped signal
+          如果输出是DropItem异常:记录它并发送item_dropped信号
+        - If output is another exception: Log it and send item_error signal
+          如果输出是另一个异常:记录它并发送item_error信号
+        - If output is a valid item: Log it and send item_scraped signal
+          如果输出是有效项目:记录它并发送item_scraped信号
+
+        Args:
+            output: The result of item processing (item or exception).
+                项目处理的结果(项目或异常)。
+            item: The original item before processing.
+                处理前的原始项目。
+            response: The response from which the item was extracted.
+                从中提取项目的响应。
+        """
+        # Decrease the item processing counter
+        # 减少项目处理计数器
         self.slot.itemproc_size -= 1
+
         if isinstance(output, BaseException):
             if isinstance(output, DropItem):
+                # Item was intentionally dropped by a pipeline
+                # 项目被管道有意丢弃
                 logger.log(**self.logformatter.dropped(item, output, response, self.spider))
                 return await self.signals.send_catch_log_deferred(
                     signal=signals.item_dropped, item=item, response=response,
                     spider=self.spider, exception=output)
             else:
+                # An error occurred during item processing
+                # 项目处理期间发生错误
                 logger.exception(self.logformatter.item_error(item, output, response, self.spider))
                 return await self.signals.send_catch_log_deferred(
                     signal=signals.item_error, item=item, response=response,
                     spider=self.spider, failure=output)
         else:
+            # Item was successfully processed
+            # 项目已成功处理
             logger.log(**self.logformatter.scraped(output, response, self.spider))
             return await self.signals.send_catch_log_deferred(
                 signal=signals.item_scraped, item=output, response=response,