aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/process.py
CHANGED
@@ -1,3 +1,29 @@
+"""
+Process Management Module
+
+This module provides functions for running spiders in single or multiple processes.
+It handles the creation and management of processes, as well as the initialization
+of event loops appropriate for different platforms.
+
+The main functions are:
+
+1. single_process_run: Run multiple spiders in a single process
+2. multi_process_run: Run multiple spiders in separate processes
+3. loop_initializer: Initialize an appropriate event loop based on the platform
+
+This module is particularly useful for running multiple spiders concurrently,
+either in the same process or in separate processes for better isolation and
+resource utilization.
+"""
 import asyncio
 import sys
 from typing import Optional, Tuple, List, Union, Type, AnyStr
@@ -10,43 +36,217 @@ from aioscrapy.settings import Settings


 def loop_initializer():
+    """
+    Initialize and return an appropriate event loop based on the platform.
+
+    This function selects the most efficient event loop implementation available
+    for the current platform:
+
+    - On Windows, returns a ProactorEventLoop which is optimized for Windows I/O operations.
+
+    - On other platforms (Linux, macOS, etc.), tries to use uvloop if available,
+      which is a fast drop-in replacement for the standard asyncio event loop.
+
+    - If uvloop is not available, falls back to the standard asyncio event loop.
+
+    This function is used by the process management functions to ensure that
+    each process has an appropriate and efficient event loop.
+
+    Returns:
+        An event loop or event loop policy appropriate for the current platform.
+    """
+    # On Windows, use ProactorEventLoop which supports all asyncio features
     if sys.platform.startswith('win'):
         return asyncio.windows_events.ProactorEventLoop()
+
+    # On other platforms, try to use uvloop which is much faster
     try:
         import uvloop
         return uvloop.EventLoopPolicy()
     except ImportError:
+        # If uvloop is not available, use the standard event loop
         pass
+
+    # Fall back to the standard asyncio event loop
     return asyncio.new_event_loop()


 def multi_process_run(*tasks: Union[Tuple[Type[Spider], Optional[AnyStr]], List]):
+    """
+    Run multiple spiders in separate processes.
+
+    This function creates a new process for each task or list of tasks provided.
+    Each process runs independently with its own event loop, allowing for true
+    parallel execution across multiple CPU cores.
+
+    Using multiple processes provides better isolation between spiders and can
+    improve performance on multi-core systems, but comes with higher memory
+    overhead compared to running all spiders in a single process.
+
+    Args:
+        *tasks: Each task can be either a tuple of (Spider class, settings) or a list of such tuples.
+            If a task is a list, all spiders in that list will run in the same process.
+            The settings parameter can be a string (path to settings module) or None.
+
+    Example:
+        ```python
+        # Run two spiders in separate processes
+        multi_process_run(
+            (MySpider1, 'myproject.settings'),
+            (MySpider2, 'myproject.settings')
+        )
+
+        # Run two spiders in one process, and a third in another process
+        multi_process_run(
+            [(MySpider1, 'myproject.settings'), (MySpider2, 'myproject.settings')],
+            (MySpider3, 'myproject.settings')
+        )
+        ```
+    """
+    # Process each task
     for task in tasks:
         if isinstance(task, list):
+            # If task is a list, run all spiders in that list in the same process
             p = Process(target=_single_process_run_async, args=(*task,), loop_initializer=loop_initializer)
         else:
+            # If task is a single spider, run it in its own process
             p = Process(target=_single_process_run_async, args=(task,), loop_initializer=loop_initializer)
+
+        # Start the process
         p.start()


 async def _single_process_run_async(*tasks: Tuple[Type[Spider], Optional[AnyStr]]):
+    """
+    Run multiple spiders in a single process asynchronously.
+
+    This is an internal helper function used by multi_process_run. It creates a
+    CrawlerProcess, adds all the specified spiders to it, and then runs them
+    concurrently within the same process.
+
+    The function handles the conversion of settings from string paths to Settings
+    objects if needed.
+
+    Args:
+        *tasks: Tuples of (Spider class, settings).
+            Each tuple contains a Spider class and its settings.
+            The settings parameter can be a string (path to settings module) or None.
+            If it's a string, it will be converted to a Settings object.
+    """
+    # Create a crawler process to run all spiders
     cp = CrawlerProcess()
+
+    # Add each spider to the crawler process
     for spidercls, settings in tasks:
+        # Convert string settings to Settings objects if needed
         if isinstance(settings, str):
             instance = Settings()
             instance.setmodule(settings)
             settings = instance
+
+        # Add the spider to the crawler process
         cp.crawl(spidercls, settings=settings)
+
+    # Run all spiders concurrently and wait for them to finish
     await cp.run()


 def single_process_run(*tasks: Tuple[Type[Spider], Optional[AnyStr]]):
+    """
+    Run multiple spiders in a single process.
+
+    This function creates a CrawlerProcess and runs all provided spiders in it.
+    The spiders run concurrently within the same process using asyncio.
+
+    Running multiple spiders in a single process uses less memory than running
+    them in separate processes, but doesn't provide the same level of isolation
+    or parallel execution across CPU cores.
+
+    Args:
+        *tasks: Tuples of (Spider class, settings).
+            Each tuple contains a Spider class and its settings.
+            The settings parameter can be a string (path to settings module) or None.
+            If it's a string, it will be converted to a Settings object.
+
+    Example:
+        ```python
+        # Run two spiders in a single process
+        single_process_run(
+            (MySpider1, 'myproject.settings'),
+            (MySpider2, 'myproject.settings')
+        )
+        ```
+    """
+    # Create a crawler process to run all spiders
     cp = CrawlerProcess()
+
+    # Add each spider to the crawler process
     for spidercls, settings in tasks:
+        # Convert string settings to Settings objects if needed
         if isinstance(settings, str):
             instance = Settings()
             instance.setmodule(settings)
             settings = instance
+
+        # Add the spider to the crawler process
         cp.crawl(spidercls, settings=settings)
+
+    # Start the crawler process and block until all spiders are finished
     cp.start()
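The new docstrings above describe the two entry points; the sketch below strings them together into one runnable script. It is illustrative only: MySpider1/MySpider2 and the commented-out 'myproject.settings' path are hypothetical names, the spider bodies are placeholders, and it assumes Spider is exported by the aioscrapy package and the helpers are importable from aioscrapy.process, as the file path in this diff suggests.

```python
# Illustrative usage sketch; names and settings path are hypothetical.
from aioscrapy import Spider
from aioscrapy.process import multi_process_run, single_process_run


class MySpider1(Spider):
    name = "my_spider_1"
    start_urls = ["https://example.com"]

    async def parse(self, response):
        # Placeholder parse logic: just emit the crawled URL.
        yield {"url": response.url}


class MySpider2(MySpider1):
    name = "my_spider_2"


if __name__ == "__main__":
    # One process, one event loop: both spiders share memory, no CPU-level parallelism.
    single_process_run((MySpider1, None), (MySpider2, None))

    # Alternatively, one OS process per tuple; a list groups spiders into the same
    # child process (settings may be a module path string or None):
    # multi_process_run(
    #     [(MySpider1, "myproject.settings"), (MySpider2, "myproject.settings")],
    #     (MySpider2, None),
    # )
```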
aioscrapy/proxy/__init__.py
CHANGED
@@ -1,3 +1,14 @@
+"""
+Proxy module for aioscrapy.
+
+This module provides the abstract base class for proxy handlers in aioscrapy.
+It defines the interface that all proxy handlers must implement and provides
+common functionality for proxy management.
+"""
+
 from abc import ABCMeta, abstractmethod

 from aioscrapy.utils.log import logger
@@ -5,7 +16,38 @@ from aioscrapy.utils.python import global_object_name


 class AbsProxy(metaclass=ABCMeta):
+    """
+    Abstract base class for proxy handlers.
+
+    This class defines the interface that all proxy handlers must implement
+    and provides common functionality for proxy management, including adding
+    proxies to requests, removing invalid proxies, and checking proxy validity.
+
+    Attributes:
+        use_proxy (bool): Whether to use proxies.
+        max_count (int): Maximum number of proxies to maintain.
+        min_count (int): Minimum number of proxies to maintain.
+        allow_status_code (list): HTTP status codes that are allowed even with a proxy.
+        cache (list): List of available proxies.
+    """
+
     def __init__(self, settings):
+        """
+        Initialize the proxy handler.
+
+        Args:
+            settings: The aioscrapy settings object.
+        """
         self.use_proxy = settings.getbool('USE_PROXY', False)
         self.max_count = settings.getint('PROXY_MAX_COUNT', 16)
         self.min_count = settings.getint('PROXY_MIN_COUNT', 1)
@@ -13,38 +55,135 @@ class AbsProxy(metaclass=ABCMeta):
         self.cache = []

     async def add_proxy(self, request):
-        """
+        """
+        Add a proxy to the request if proxy usage is enabled.
+
+        This method checks if proxy usage is enabled both globally and for the
+        specific request. If so, it gets a proxy from the pool and adds it to
+        the request's meta. Otherwise, it removes any existing proxy from the request.
+
+        Args:
+            request: The request to add a proxy to.
+
+        Returns:
+            The modified request.
+        """
         if self.use_proxy and request.use_proxy:
+            # Get a proxy and add it to the request's meta
             request.meta['proxy'] = await self.get()
         else:
+            # Remove any existing proxy from the request
             request.meta.pop('proxy', None)
         return request

     def remove(self, proxy, reason=None):
+        """
+        Remove a proxy from the cache.
+
+        This method removes a proxy from the cache when it's determined to be invalid
+        or no longer usable. It logs the removal with the provided reason.
+
+        Args:
+            proxy: The proxy to remove.
+            reason: The reason for removing the proxy. Can be a callable, an exception,
+                or any other object that can be converted to a string.
+        """
+        # If reason is callable, call it to get the actual reason
         if callable(reason):
             reason = reason()
+
+        # If reason is an exception, use its class name
         if isinstance(reason, Exception):
             reason = global_object_name(reason.__class__)

+        # Remove the proxy if it's in the cache
         if proxy in self.cache:
             logger.info(f"remove proxy: {proxy}, reason: {reason}")
             self.cache.remove(proxy)

     def check(self, request, response=None, exception=None):
+        """
+        Check if a proxy is still valid based on response or exception.
+
+        This method checks if a proxy should be removed based on the response status code
+        or an exception that occurred during the request. If the response status code is
+        not in the allowed list or if an exception occurred, the proxy is removed.
+
+        Args:
+            request: The request that was made.
+            response: The response received, if any.
+            exception: The exception that occurred, if any.
+        """
+        # If proxy usage is disabled, do nothing
         if not self.use_proxy:
             return

+        # Check if the response status code is not allowed
         if response and response.status >= 400 and response.status not in self.allow_status_code:
             self.remove(request.meta.get('proxy'), f"Don't allow response status code:{response.status}")

+        # Check if an exception occurred
         if exception and isinstance(exception, BaseException):
             self.remove(request.meta.get('proxy'), exception)

     @classmethod
     @abstractmethod
     async def from_crawler(cls, crawler) -> "AbsProxy":
-        """
+        """
+        Create a proxy handler instance from a crawler.
+
+        This class method is used to create a proxy handler instance from a crawler.
+        It is called by the crawler when initializing the proxy handler.
+
+        Args:
+            crawler: The crawler instance.
+
+        Returns:
+            AbsProxy: A proxy handler instance.
+        """
+        pass

     @abstractmethod
     async def get(self) -> str:
-        """
+        """
+        Get a proxy from the pool.
+
+        This method is called when a proxy is needed for a request.
+        It should return a proxy in the format 'scheme://host:port'.
+
+        Returns:
+            str: A proxy string in the format 'scheme://host:port'.
+        """
+        pass
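Since the diff above only documents the AbsProxy interface, a minimal concrete subclass may help show what an implementation has to provide. This is a sketch under assumptions: StaticListProxy and the PROXY_LIST setting name are invented for illustration; only from_crawler, get, and the inherited cache/remove/check behaviour come from the class shown above.

```python
# Illustrative sketch of a static-list proxy handler built on the AbsProxy
# interface; the class name and the PROXY_LIST setting are hypothetical.
from aioscrapy.proxy import AbsProxy


class StaticListProxy(AbsProxy):
    """Serve proxies from a fixed list configured in settings."""

    def __init__(self, settings, proxies):
        super().__init__(settings)
        # Reuse the base-class cache so remove()/check() can evict bad entries.
        self.cache = list(proxies)

    @classmethod
    async def from_crawler(cls, crawler) -> "StaticListProxy":
        # Hypothetical setting: a list of 'scheme://host:port' strings.
        return cls(crawler.settings, crawler.settings.get("PROXY_LIST") or [])

    async def get(self) -> str:
        # Simple round-robin over whatever is still in the cache.
        proxy = self.cache.pop(0)
        self.cache.append(proxy)
        return proxy
```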
aioscrapy/proxy/redis.py
CHANGED
@@ -1,6 +1,16 @@
+"""
+Redis-based proxy implementation for aioscrapy.
+
+This module provides a Redis-based implementation of the proxy handler interface.
+It fetches proxies from a Redis sorted set and manages them for use in requests.
+"""
+
 import asyncio
 import time
-from typing import Optional
+from typing import Optional, Any

 from aioscrapy.db import db_manager
 from aioscrapy.exceptions import ProxyException
@@ -10,13 +20,47 @@ from aioscrapy.utils.tools import create_task


 class RedisProxy(AbsProxy):
+    """
+    Redis-based proxy handler implementation.
+
+    This class implements the AbsProxy interface using Redis as a backend.
+    It fetches proxies from a Redis sorted set and manages them for use in requests.
+
+    Attributes:
+        crawler: The crawler instance.
+        proxy_queue: The Redis client used to fetch proxies.
+        proxy_key: The key of the Redis sorted set containing proxies.
+        lock: An asyncio lock to prevent concurrent proxy fetching.
+    """
+
     def __init__(
             self,
             settings,
             crawler,
-            proxy_queue: Optional[
+            proxy_queue: Optional[Any] = None,
             proxy_key: Optional[str] = None
     ):
+        """
+        Initialize the Redis proxy handler.
+
+        Args:
+            settings: The aioscrapy settings object.
+            crawler: The crawler instance.
+            proxy_queue: The Redis client used to fetch proxies.
+            proxy_key: The key of the Redis sorted set containing proxies.
+        """
         super().__init__(settings)
         self.crawler = crawler
         self.proxy_queue = proxy_queue
@@ -25,11 +69,46 @@ class RedisProxy(AbsProxy):

     @classmethod
     async def from_crawler(cls, crawler) -> "RedisProxy":
+        """
+        Create a RedisProxy instance from a crawler.
+
+        This class method creates a RedisProxy instance from a crawler.
+        It retrieves the necessary settings and initializes the Redis client.
+
+        Args:
+            crawler: The crawler instance.
+
+        Returns:
+            RedisProxy: A RedisProxy instance.
+
+        Raises:
+            AssertionError: If PROXY_KEY is not configured in settings.
+        """
+        # Get settings from crawler
         settings = crawler.settings
+
+        # Get proxy key from settings
         proxy_key = settings.get('PROXY_KEY')
         assert proxy_key is not None, "Not configured:'PROXY_KEY'"
+
+        # Get Redis alias from settings, default to 'proxy'
         alias = settings.get("PROXY_QUEUE_ALIAS", 'proxy')
+
+        # Get Redis client
         proxy_queue = db_manager.redis(alias)
+
+        # Create and return RedisProxy instance
         return cls(
             settings,
             crawler,
@@ -38,6 +117,23 @@ class RedisProxy(AbsProxy):
         )

     async def fill_proxy(self, redis_key: str, count: int) -> None:
+        """
+        Fill the proxy cache from Redis.
+
+        This method fetches proxies from a Redis sorted set and adds them to the cache.
+        It uses a Lua script to randomly select proxies from the sorted set.
+
+        Args:
+            redis_key: The key of the Redis sorted set containing proxies.
+            count: The number of proxies to fetch.
+        """
+        # Lua script to randomly select proxies from a sorted set
         script = f"""
         local redis_key = KEYS[1]
         local min_score = ARGV[1]
@@ -50,23 +146,61 @@ class RedisProxy(AbsProxy):
         end
         return redis.call('ZRANGEBYSCORE', redis_key, min_score, max_score, 'LIMIT', start, {count})
         """
+        # Register and execute the script
         cmd_script = self.proxy_queue.register_script(script)
+
+        # Try to get proxies with score between 100 and 100 (high quality proxies)
         result = await cmd_script(keys=[redis_key], args=[100, 100])
+
+        # If no high quality proxies are available, get any proxies
         if not result:
             result = await cmd_script(keys=[redis_key], args=[0, 100])
+
+        # Format proxies and add them to the cache
         proxies = [ip.decode() if ip.decode().startswith('http') else f'http://{ip.decode()}' for ip in result]
         self.cache.extend(proxies)
         logger.info(f'Get proxy from redis: {proxies}')

     async def get(self) -> str:
+        """
+        Get a proxy from the cache.
+
+        This method returns a proxy from the cache. If the cache is running low,
+        it fills the cache with more proxies from Redis. If no proxies are available,
+        it stops the crawler and raises an exception.
+
+        Returns:
+            str: A proxy string in the format 'scheme://host:port'.
+
+        Raises:
+            ProxyException: If no proxies are available.
+        """
+        # If the cache is running low, fill it with more proxies
         if len(self.cache) < self.min_count:
             async with self.lock:
+                # Check again inside the lock to avoid race conditions
                 len(self.cache) < self.min_count and await self.fill_proxy(self.proxy_key, self.max_count - len(self.cache))
+
         try:
+            # Get a proxy from the cache and move it to the end
             proxy = self.cache.pop(0)
             self.cache.append(proxy)
             return proxy
         except IndexError:
+            # If no proxies are available, stop the crawler and raise an exception
             logger.warning("Not available proxy, Closing spider")
             create_task(self.crawler.engine.stop(reason="Not available proxy"))
             raise ProxyException("Not available proxy")