aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/spiders/__init__.py
CHANGED
@@ -1,8 +1,14 @@
+
 """
-
+Spider module for aioscrapy.
+aioscrapy的爬虫模块。
 
-
+This module contains the base Spider class that all spiders must inherit from.
+It provides the core functionality for creating and managing spiders.
+此模块包含所有爬虫必须继承的基础Spider类。
+它提供了创建和管理爬虫的核心功能。
 """
+
 import time
 from typing import Optional, Union
 
@@ -16,8 +22,22 @@ from aioscrapy.utils.url import url_is_from_spider
 
 
 class Spider(object):
-    """
-    class.
+    """
+    Base class for aioscrapy spiders. All spiders must inherit from this class.
+    aioscrapy爬虫的基类。所有爬虫必须继承自此类。
+
+    This class provides the core functionality for creating and managing spiders,
+    including request generation, response parsing, and signal handling.
+    此类提供了创建和管理爬虫的核心功能,包括请求生成、响应解析和信号处理。
+
+    Attributes:
+        name: The name of the spider. Must be unique. 爬虫的名称,必须唯一。
+        proxy: Optional proxy handler. 可选的代理处理器。
+        dupefilter: Optional duplicate filter. 可选的重复过滤器。
+        custom_settings: Dictionary of settings to override project settings. 用于覆盖项目设置的设置字典。
+        stats: Statistics collector. 统计收集器。
+        pause: Whether the spider is paused. 爬虫是否暂停。
+        start_urls: List of URLs to start crawling from. 开始爬取的URL列表。
     """
 
     name: Optional[str] = None
@@ -30,6 +50,14 @@ class Spider(object):
     _pause_time: Optional[Union[int, float]] = None
 
     def __init__(self, name=None, **kwargs):
+        """
+        Initialize the spider.
+        初始化爬虫。
+
+        Args:
+            name: Spider name. 爬虫名称。
+            **kwargs: Additional arguments. 额外参数。
+        """
         if name is not None:
             self.name = name
         elif not getattr(self, 'name', None):
@@ -40,12 +68,37 @@
 
     @property
     def pause_time(self) -> int:
+        """
+        Get the time until which the spider is paused.
+        获取爬虫暂停的时间点。
+
+        If not set, defaults to current time + 600 seconds.
+        如果未设置,默认为当前时间 + 600秒。
+
+        Returns:
+            int: Unix timestamp when the pause ends.
+                暂停结束的Unix时间戳。
+        """
         if self._pause_time is None:
             self._pause_time = 600 + int(time.time())
         return self._pause_time
 
     @pause_time.setter
     def pause_time(self, value: Union[int, float]):
+        """
+        Set the time until which the spider is paused.
+        设置爬虫暂停的时间点。
+
+        Args:
+            value: Unix timestamp or duration in seconds.
+                Unix时间戳或持续时间(秒)。
+                - If None, pause indefinitely.
+                  如果为None,则无限期暂停。
+                - If less than current time, treated as duration.
+                  如果小于当前时间,则视为持续时间。
+                - Otherwise, treated as absolute timestamp.
+                  否则,视为绝对时间戳。
+        """
         self.pause = True
         if value is None:
             self._pause_time = float('inf')
@@ -56,18 +109,67 @@
 
     @classmethod
     async def from_crawler(cls, crawler, *args, **kwargs):
+        """
+        Create a spider instance from a crawler.
+        从爬虫引擎创建爬虫实例。
+
+        Args:
+            crawler: The crawler instance. 爬虫引擎实例。
+            *args: Additional arguments. 额外参数。
+            **kwargs: Additional keyword arguments. 额外关键字参数。
+
+        Returns:
+            Spider instance. 爬虫实例。
+        """
         spider = cls(*args, **kwargs)
         spider._set_crawler(crawler)
         return spider
 
     def _set_crawler(self, crawler):
+        """
+        Set the crawler for this spider.
+        为此爬虫设置爬虫引擎。
+
+        This method is called by the from_crawler class method to set up the crawler
+        for this spider. It connects signal handlers and sets up settings.
+        此方法由from_crawler类方法调用,为此爬虫设置爬虫引擎。
+        它连接信号处理程序并设置配置。
+
+        Args:
+            crawler: The crawler instance to use.
+                要使用的爬虫引擎实例。
+        """
+        # Store the crawler instance
+        # 存储爬虫引擎实例
         self.crawler = crawler
+
+        # Get settings from the crawler
+        # 从爬虫引擎获取设置
         self.settings = crawler.settings
+
+        # Determine if the spider should close when idle
+        # 确定爬虫在空闲时是否应该关闭
         self.close_on_idle = self.settings.get("CLOSE_SPIDER_ON_IDLE", True)
+
+        # Connect signal handlers
+        # 连接信号处理程序
         crawler.signals.connect(self.close, signals.spider_closed)
         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
 
     async def start_requests(self):
+        """
+        Generate initial requests for the spider.
+        生成爬虫的初始请求。
+
+        This method must return an iterable of Request objects.
+        此方法必须返回一个包含Request对象的可迭代对象。
+
+        By default, it generates Request objects from the spider's start_urls.
+        默认情况下,它从爬虫的start_urls生成Request对象。
+
+        Returns:
+            An iterable of Request objects. 包含Request对象的可迭代对象。
+        """
        if not self.start_urls and hasattr(self, 'start_url'):
             raise AttributeError(
                 "Crawling could not start: 'start_urls' not found "
@@ -78,36 +180,151 @@ class Spider(object):
             yield Request(url)
 
     async def request_from_dict(self, d: dict):
-        """
-
+        """
+        Create a Request object from a dictionary.
+        从字典创建Request对象。
+
+        This method can be overridden in subclasses to customize the request creation process.
+        It is typically used for deserializing requests from storage or message queues.
+        可以在子类中重写此方法以自定义请求创建过程。
+        它通常用于从存储或消息队列中反序列化请求。
+
+        Args:
+            d: Dictionary containing request data. 包含请求数据的字典。
+                Expected keys include:
+                预期的键包括:
+                - url: The URL to request (required)
+                  要请求的URL(必需)
+                - callback: Name of the callback method (optional)
+                  回调方法的名称(可选)
+                - method: HTTP method (optional, default: 'GET')
+                  HTTP方法(可选,默认:'GET')
+                - headers: HTTP headers (optional)
+                  HTTP头(可选)
+                - body: Request body (optional)
+                  请求体(可选)
+                - cookies: Cookies (optional)
+                  Cookie(可选)
+                - meta: Request metadata (optional)
+                  请求元数据(可选)
+
+        Returns:
+            Request: A Request object, or None if the request cannot be created.
+                Request对象,如果无法创建请求则为None。
+        """
+        # This is a placeholder implementation that should be overridden in subclasses
+        # 这是一个应该在子类中重写的占位符实现
+        return None
 
     async def _parse(self, response: Response, **kwargs):
+        """
+        Internal parse method that calls the user-defined parse method.
+        调用用户定义的parse方法的内部解析方法。
+
+        This method is used internally by the crawler to call the spider's parse method.
+        It uses call_helper to handle the parse method call, which supports both
+        async and non-async parse methods.
+        此方法由爬虫引擎内部使用,用于调用爬虫的parse方法。
+        它使用call_helper来处理parse方法调用,支持异步和非异步parse方法。
+
+        Args:
+            response: The response to parse.
+                要解析的响应。
+            **kwargs: Additional keyword arguments to pass to the parse method.
+                传递给parse方法的额外关键字参数。
+
+        Returns:
+            The result of the parse method.
+            parse方法的结果。
+        """
         return await call_helper(self.parse, response)
 
     async def parse(self, response: Response):
+        """
+        Default callback used to process downloaded responses.
+        用于处理下载响应的默认回调方法。
+
+        This method must be implemented in subclasses.
+        必须在子类中实现此方法。
+
+        Args:
+            response: The response to process. 要处理的响应。
+
+        Returns:
+            An iterable of Request and/or item objects. 包含Request和/或数据项对象的可迭代对象。
+        """
         raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')
 
     @classmethod
     def update_settings(cls, settings):
+        """
+        Update settings with spider custom settings.
+        使用爬虫自定义设置更新设置。
+
+        Args:
+            settings: The settings to update. 要更新的设置。
+        """
         settings.setdict(cls.custom_settings or {}, priority='spider')
 
     @classmethod
     def handles_request(cls, request):
+        """
+        Check if this spider can handle the given request.
+        检查此爬虫是否可以处理给定的请求。
+
+        Args:
+            request: The request to check. 要检查的请求。
+
+        Returns:
+            True if this spider can handle the request, False otherwise.
+            如果此爬虫可以处理该请求,则返回True,否则返回False。
+        """
         return url_is_from_spider(request.url, cls)
 
     @staticmethod
     def close(spider, reason):
+        """
+        Signal handler for the spider_closed signal.
+        爬虫关闭信号的处理函数。
+
+        Args:
+            spider: The spider being closed. 正在关闭的爬虫。
+            reason: The reason for closing the spider. 关闭爬虫的原因。
+        """
         closed = getattr(spider, 'closed', None)
         if callable(closed):
             return closed(reason)
 
     def __str__(self):
+        """
+        Return a string representation of the spider.
+        返回爬虫的字符串表示。
+
+        Returns:
+            str: A string representation of the spider, including its class name,
+                name, and memory address.
+                爬虫的字符串表示,包括其类名、名称和内存地址。
+        """
         return f"<{type(self).__name__} {self.name!r} at 0x{id(self):0x}>"
 
+    # Make __repr__ use the same implementation as __str__
+    # 使__repr__使用与__str__相同的实现
     __repr__ = __str__
 
     @classmethod
     def start(cls, setting_path=None, use_windows_selector_eventLoop: bool = False):
+        """
+        Start crawling using this spider.
+        使用此爬虫开始爬取。
+
+        This is a convenience method that creates a CrawlerProcess, adds the spider,
+        and starts the crawling process.
+        这是一个便捷方法,它创建一个CrawlerProcess,添加爬虫,并启动爬取过程。
+
+        Args:
+            setting_path: Path to settings module. 设置模块的路径。
+            use_windows_selector_eventLoop: Whether to use Windows selector event loop. 是否使用Windows选择器事件循环。
+        """
         from aioscrapy.crawler import CrawlerProcess
         from aioscrapy.utils.project import get_project_settings
 
@@ -119,5 +336,15 @@ class Spider(object):
         cp.start(use_windows_selector_eventLoop)
 
     def spider_idle(self):
+        """
+        Signal handler for the spider_idle signal.
+        爬虫空闲信号的处理函数。
+
+        This method is called when the spider has no more requests to process.
+        当爬虫没有更多请求要处理时,调用此方法。
+
+        If CLOSE_SPIDER_ON_IDLE is False, it raises DontCloseSpider to prevent the spider from closing.
+        如果CLOSE_SPIDER_ON_IDLE为False,它会引发DontCloseSpider以防止爬虫关闭。
+        """
         if not self.close_on_idle:
             raise DontCloseSpider