aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/process.py CHANGED
@@ -1,3 +1,29 @@
+"""
+Process Management Module
+进程管理模块
+
+This module provides functions for running spiders in single or multiple processes.
+It handles the creation and management of processes, as well as the initialization
+of event loops appropriate for different platforms.
+此模块提供了在单个或多个进程中运行爬虫的函数。它处理进程的创建和管理,
+以及适合不同平台的事件循环的初始化。
+
+The main functions are:
+主要函数包括:
+
+1. single_process_run: Run multiple spiders in a single process
+   在单个进程中运行多个爬虫
+2. multi_process_run: Run multiple spiders in separate processes
+   在单独的进程中运行多个爬虫
+3. loop_initializer: Initialize an appropriate event loop based on the platform
+   根据平台初始化适当的事件循环
+
+This module is particularly useful for running multiple spiders concurrently,
+either in the same process or in separate processes for better isolation and
+resource utilization.
+此模块对于并发运行多个爬虫特别有用,可以在同一进程中运行,也可以在单独的
+进程中运行,以获得更好的隔离和资源利用。
+"""
 import asyncio
 import sys
 from typing import Optional, Tuple, List, Union, Type, AnyStr
@@ -10,43 +36,217 @@ from aioscrapy.settings import Settings
 
 
 def loop_initializer():
+    """
+    Initialize and return an appropriate event loop based on the platform.
+    根据平台初始化并返回适当的事件循环。
+
+    This function selects the most efficient event loop implementation available
+    for the current platform:
+    此函数为当前平台选择最高效的事件循环实现:
+
+    - On Windows, returns a ProactorEventLoop which is optimized for Windows I/O operations.
+      在Windows上,返回ProactorEventLoop,它针对Windows I/O操作进行了优化。
+
+    - On other platforms (Linux, macOS, etc.), tries to use uvloop if available,
+      which is a fast drop-in replacement for the standard asyncio event loop.
+      在其他平台(Linux、macOS等)上,尝试使用uvloop(如果可用),
+      它是标准asyncio事件循环的快速替代品。
+
+    - If uvloop is not available, falls back to the standard asyncio event loop.
+      如果uvloop不可用,则回退到标准asyncio事件循环。
+
+    This function is used by the process management functions to ensure that
+    each process has an appropriate and efficient event loop.
+    进程管理函数使用此函数来确保每个进程都有适当且高效的事件循环。
+
+    Returns:
+        An event loop or event loop policy appropriate for the current platform.
+        适合当前平台的事件循环或事件循环策略。
+    """
+    # On Windows, use ProactorEventLoop which supports all asyncio features
+    # 在Windows上,使用支持所有asyncio功能的ProactorEventLoop
     if sys.platform.startswith('win'):
         return asyncio.windows_events.ProactorEventLoop()
+
+    # On other platforms, try to use uvloop which is much faster
+    # 在其他平台上,尝试使用更快的uvloop
     try:
         import uvloop
         return uvloop.EventLoopPolicy()
     except ImportError:
+        # If uvloop is not available, use the standard event loop
+        # 如果uvloop不可用,则使用标准事件循环
         pass
 
+    # Fall back to the standard asyncio event loop
+    # 回退到标准asyncio事件循环
     return asyncio.new_event_loop()
 
 
 def multi_process_run(*tasks: Union[Tuple[Type[Spider], Optional[AnyStr]], List]):
+    """
+    Run multiple spiders in separate processes.
+    在单独的进程中运行多个爬虫。
+
+    This function creates a new process for each task or list of tasks provided.
+    Each process runs independently with its own event loop, allowing for true
+    parallel execution across multiple CPU cores.
+    此函数为提供的每个任务或任务列表创建一个新进程。每个进程都有自己的事件循环
+    独立运行,允许在多个CPU核心上实现真正的并行执行。
+
+    Using multiple processes provides better isolation between spiders and can
+    improve performance on multi-core systems, but comes with higher memory
+    overhead compared to running all spiders in a single process.
+    使用多个进程可以提供更好的爬虫之间的隔离,并可以在多核系统上提高性能,
+    但与在单个进程中运行所有爬虫相比,会带来更高的内存开销。
+
+    Args:
+        *tasks: Each task can be either a tuple of (Spider class, settings) or a list of such tuples.
+            每个任务可以是(爬虫类, 设置)的元组,或者是这种元组的列表。
+
+            If a task is a list, all spiders in that list will run in the same process.
+            如果任务是列表,则该列表中的所有爬虫将在同一进程中运行。
+
+            The settings parameter can be a string (path to settings module) or None.
+            设置参数可以是字符串(设置模块的路径)或None。
+
+    Example:
+        ```python
+        # Run two spiders in separate processes
+        multi_process_run(
+            (MySpider1, 'myproject.settings'),
+            (MySpider2, 'myproject.settings')
+        )
+
+        # Run two spiders in one process, and a third in another process
+        multi_process_run(
+            [(MySpider1, 'myproject.settings'), (MySpider2, 'myproject.settings')],
+            (MySpider3, 'myproject.settings')
+        )
+        ```
+    """
+    # Process each task
+    # 处理每个任务
     for task in tasks:
         if isinstance(task, list):
+            # If task is a list, run all spiders in that list in the same process
+            # 如果任务是列表,则在同一进程中运行该列表中的所有爬虫
            p = Process(target=_single_process_run_async, args=(*task,), loop_initializer=loop_initializer)
         else:
+            # If task is a single spider, run it in its own process
+            # 如果任务是单个爬虫,则在其自己的进程中运行它
            p = Process(target=_single_process_run_async, args=(task,), loop_initializer=loop_initializer)
+
+        # Start the process
+        # 启动进程
         p.start()
 
 
 async def _single_process_run_async(*tasks: Tuple[Type[Spider], Optional[AnyStr]]):
+    """
+    Run multiple spiders in a single process asynchronously.
+    在单个进程中异步运行多个爬虫。
+
+    This is an internal helper function used by multi_process_run. It creates a
+    CrawlerProcess, adds all the specified spiders to it, and then runs them
+    concurrently within the same process.
+    这是一个由multi_process_run使用的内部辅助函数。它创建一个CrawlerProcess,
+    将所有指定的爬虫添加到其中,然后在同一进程中并发运行它们。
+
+    The function handles the conversion of settings from string paths to Settings
+    objects if needed.
+    如果需要,该函数会处理将设置从字符串路径转换为Settings对象。
+
+    Args:
+        *tasks: Tuples of (Spider class, settings).
+            (爬虫类, 设置)的元组。
+
+            Each tuple contains a Spider class and its settings.
+            每个元组包含一个爬虫类及其设置。
+
+            The settings parameter can be a string (path to settings module) or None.
+            If it's a string, it will be converted to a Settings object.
+            设置参数可以是字符串(设置模块的路径)或None。
+            如果是字符串,它将被转换为Settings对象。
+    """
+    # Create a crawler process to run all spiders
+    # 创建一个爬虫进程来运行所有爬虫
     cp = CrawlerProcess()
+
+    # Add each spider to the crawler process
+    # 将每个爬虫添加到爬虫进程
     for spidercls, settings in tasks:
+        # Convert string settings to Settings objects if needed
+        # 如果需要,将字符串设置转换为Settings对象
         if isinstance(settings, str):
             instance = Settings()
             instance.setmodule(settings)
             settings = instance
+
+        # Add the spider to the crawler process
+        # 将爬虫添加到爬虫进程
         cp.crawl(spidercls, settings=settings)
+
+    # Run all spiders concurrently and wait for them to finish
+    # 并发运行所有爬虫并等待它们完成
     await cp.run()
 
 
 def single_process_run(*tasks: Tuple[Type[Spider], Optional[AnyStr]]):
+    """
+    Run multiple spiders in a single process.
+    在单个进程中运行多个爬虫。
+
+    This function creates a CrawlerProcess and runs all provided spiders in it.
+    The spiders run concurrently within the same process using asyncio.
+    此函数创建一个CrawlerProcess并在其中运行所有提供的爬虫。
+    爬虫使用asyncio在同一进程中并发运行。
+
+    Running multiple spiders in a single process uses less memory than running
+    them in separate processes, but doesn't provide the same level of isolation
+    or parallel execution across CPU cores.
+    在单个进程中运行多个爬虫比在单独的进程中运行它们使用更少的内存,
+    但不提供相同级别的隔离或跨CPU核心的并行执行。
+
+    Args:
+        *tasks: Tuples of (Spider class, settings).
+            (爬虫类, 设置)的元组。
+
+            Each tuple contains a Spider class and its settings.
+            每个元组包含一个爬虫类及其设置。
+
+            The settings parameter can be a string (path to settings module) or None.
+            If it's a string, it will be converted to a Settings object.
+            设置参数可以是字符串(设置模块的路径)或None。
+            如果是字符串,它将被转换为Settings对象。
+
+    Example:
+        ```python
+        # Run two spiders in a single process
+        single_process_run(
+            (MySpider1, 'myproject.settings'),
+            (MySpider2, 'myproject.settings')
+        )
+        ```
+    """
+    # Create a crawler process to run all spiders
+    # 创建一个爬虫进程来运行所有爬虫
     cp = CrawlerProcess()
+
+    # Add each spider to the crawler process
+    # 将每个爬虫添加到爬虫进程
     for spidercls, settings in tasks:
+        # Convert string settings to Settings objects if needed
+        # 如果需要,将字符串设置转换为Settings对象
         if isinstance(settings, str):
             instance = Settings()
             instance.setmodule(settings)
             settings = instance
+
+        # Add the spider to the crawler process
+        # 将爬虫添加到爬虫进程
         cp.crawl(spidercls, settings=settings)
+
+    # Start the crawler process and block until all spiders are finished
+    # 启动爬虫进程并阻塞直到所有爬虫完成
     cp.start()
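For orientation, here is a minimal usage sketch of the two helpers added above. The spider classes and the `myproject.settings` module path are hypothetical placeholders; the sketch assumes `Spider` is importable from `aioscrapy.spiders` (the Scrapy-style layout used by this package) and otherwise relies only on the `aioscrapy.process` functions shown in this diff.

```python
# Illustrative sketch only (not part of the package): run two hypothetical
# spiders with the helpers from aioscrapy/process.py.
from aioscrapy.spiders import Spider
from aioscrapy.process import single_process_run, multi_process_run


class MySpider1(Spider):
    name = 'spider1'
    start_urls = ['https://example.com/']

    async def parse(self, response):
        # Yield a trivial item so the example is self-contained.
        yield {'url': response.url}


class MySpider2(MySpider1):
    name = 'spider2'


if __name__ == '__main__':
    # One process, one event loop, both spiders running concurrently.
    # The second element may be a settings module path or None.
    single_process_run((MySpider1, 'myproject.settings'), (MySpider2, None))
    # Or one OS process (and event loop) per spider instead:
    # multi_process_run((MySpider1, 'myproject.settings'), (MySpider2, None))
```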
aioscrapy/proxy/__init__.py CHANGED
@@ -1,3 +1,14 @@
+"""
+Proxy module for aioscrapy.
+aioscrapy的代理模块。
+
+This module provides the abstract base class for proxy handlers in aioscrapy.
+It defines the interface that all proxy handlers must implement and provides
+common functionality for proxy management.
+此模块提供了aioscrapy中代理处理程序的抽象基类。
+它定义了所有代理处理程序必须实现的接口,并提供了代理管理的通用功能。
+"""
+
 from abc import ABCMeta, abstractmethod
 
 from aioscrapy.utils.log import logger
@@ -5,7 +16,38 @@ from aioscrapy.utils.python import global_object_name
 
 
 class AbsProxy(metaclass=ABCMeta):
+    """
+    Abstract base class for proxy handlers.
+    代理处理程序的抽象基类。
+
+    This class defines the interface that all proxy handlers must implement
+    and provides common functionality for proxy management, including adding
+    proxies to requests, removing invalid proxies, and checking proxy validity.
+    此类定义了所有代理处理程序必须实现的接口,并提供了代理管理的通用功能,
+    包括向请求添加代理、移除无效代理和检查代理有效性。
+
+    Attributes:
+        use_proxy (bool): Whether to use proxies.
+            是否使用代理。
+        max_count (int): Maximum number of proxies to maintain.
+            要维护的最大代理数量。
+        min_count (int): Minimum number of proxies to maintain.
+            要维护的最小代理数量。
+        allow_status_code (list): HTTP status codes that are allowed even with a proxy.
+            即使使用代理也允许的HTTP状态码。
+        cache (list): List of available proxies.
+            可用代理列表。
+    """
+
     def __init__(self, settings):
+        """
+        Initialize the proxy handler.
+        初始化代理处理程序。
+
+        Args:
+            settings: The aioscrapy settings object.
+                aioscrapy设置对象。
+        """
         self.use_proxy = settings.getbool('USE_PROXY', False)
         self.max_count = settings.getint('PROXY_MAX_COUNT', 16)
         self.min_count = settings.getint('PROXY_MIN_COUNT', 1)
@@ -13,38 +55,135 @@ class AbsProxy(metaclass=ABCMeta):
         self.cache = []
 
     async def add_proxy(self, request):
-        """add proxy for request"""
+        """
+        Add a proxy to the request if proxy usage is enabled.
+        如果启用了代理使用,则向请求添加代理。
+
+        This method checks if proxy usage is enabled both globally and for the
+        specific request. If so, it gets a proxy from the pool and adds it to
+        the request's meta. Otherwise, it removes any existing proxy from the request.
+        此方法检查代理使用是否在全局和特定请求中都启用。如果是,它从池中获取代理
+        并将其添加到请求的meta中。否则,它会从请求中移除任何现有的代理。
+
+        Args:
+            request: The request to add a proxy to.
+                要添加代理的请求。
+
+        Returns:
+            The modified request.
+            修改后的请求。
+        """
         if self.use_proxy and request.use_proxy:
+            # Get a proxy and add it to the request's meta
+            # 获取代理并将其添加到请求的meta中
             request.meta['proxy'] = await self.get()
         else:
+            # Remove any existing proxy from the request
+            # 从请求中移除任何现有的代理
             request.meta.pop('proxy', None)
         return request
 
     def remove(self, proxy, reason=None):
+        """
+        Remove a proxy from the cache.
+        从缓存中移除代理。
+
+        This method removes a proxy from the cache when it's determined to be invalid
+        or no longer usable. It logs the removal with the provided reason.
+        当确定代理无效或不再可用时,此方法从缓存中移除代理。它记录移除的原因。
+
+        Args:
+            proxy: The proxy to remove.
+                要移除的代理。
+            reason: The reason for removing the proxy. Can be a callable, an exception,
+                or any other object that can be converted to a string.
+                移除代理的原因。可以是可调用对象、异常或任何其他可以转换为字符串的对象。
+        """
+        # If reason is callable, call it to get the actual reason
+        # 如果reason是可调用的,调用它以获取实际原因
         if callable(reason):
             reason = reason()
+
+        # If reason is an exception, use its class name
+        # 如果reason是异常,使用其类名
         if isinstance(reason, Exception):
             reason = global_object_name(reason.__class__)
 
+        # Remove the proxy if it's in the cache
+        # 如果代理在缓存中,则移除它
         if proxy in self.cache:
             logger.info(f"remove proxy: {proxy}, reason: {reason}")
             self.cache.remove(proxy)
 
     def check(self, request, response=None, exception=None):
+        """
+        Check if a proxy is still valid based on response or exception.
+        根据响应或异常检查代理是否仍然有效。
+
+        This method checks if a proxy should be removed based on the response status code
+        or an exception that occurred during the request. If the response status code is
+        not in the allowed list or if an exception occurred, the proxy is removed.
+        此方法根据响应状态码或请求期间发生的异常检查是否应该移除代理。
+        如果响应状态码不在允许列表中或发生异常,则移除代理。
+
+        Args:
+            request: The request that was made.
+                发出的请求。
+            response: The response received, if any.
+                收到的响应(如果有)。
+            exception: The exception that occurred, if any.
+                发生的异常(如果有)。
+        """
+        # If proxy usage is disabled, do nothing
+        # 如果禁用了代理使用,则不执行任何操作
         if not self.use_proxy:
             return
 
+        # Check if the response status code is not allowed
+        # 检查响应状态码是否不被允许
         if response and response.status >= 400 and response.status not in self.allow_status_code:
             self.remove(request.meta.get('proxy'), f"Don't allow response status code:{response.status}")
 
+        # Check if an exception occurred
+        # 检查是否发生异常
         if exception and isinstance(exception, BaseException):
             self.remove(request.meta.get('proxy'), exception)
 
     @classmethod
     @abstractmethod
     async def from_crawler(cls, crawler) -> "AbsProxy":
-        """get proxy instance from spider"""
+        """
+        Create a proxy handler instance from a crawler.
+        从爬虫创建代理处理程序实例。
+
+        This class method is used to create a proxy handler instance from a crawler.
+        It is called by the crawler when initializing the proxy handler.
+        此类方法用于从爬虫创建代理处理程序实例。
+        它在初始化代理处理程序时由爬虫调用。
+
+        Args:
+            crawler: The crawler instance.
+                爬虫实例。
+
+        Returns:
+            AbsProxy: A proxy handler instance.
+                代理处理程序实例。
+        """
+        pass
 
     @abstractmethod
     async def get(self) -> str:
-        """get proxy"""
+        """
+        Get a proxy from the pool.
+        从池中获取代理。
+
+        This method is called when a proxy is needed for a request.
+        It should return a proxy in the format 'scheme://host:port'.
+        当请求需要代理时调用此方法。
+        它应该以'scheme://host:port'格式返回代理。
+
+        Returns:
+            str: A proxy string in the format 'scheme://host:port'.
+                格式为'scheme://host:port'的代理字符串。
+        """
+        pass
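To show how this interface is meant to be extended, here is a hedged sketch of a custom handler that serves proxies from a static list instead of Redis. The `PROXY_LIST` setting name is invented for the example; only `AbsProxy`, `ProxyException`, and the standard settings getters already visible in this diff are assumed.

```python
# Illustrative sketch only: a static-list proxy handler implementing the
# AbsProxy interface above. PROXY_LIST is a made-up setting for this example.
from aioscrapy.exceptions import ProxyException
from aioscrapy.proxy import AbsProxy


class StaticListProxy(AbsProxy):
    def __init__(self, settings, proxies):
        super().__init__(settings)   # reads USE_PROXY, PROXY_MAX_COUNT, PROXY_MIN_COUNT, ...
        self.cache = list(proxies)   # AbsProxy keeps usable proxies in self.cache

    @classmethod
    async def from_crawler(cls, crawler) -> "StaticListProxy":
        # Pull the (hypothetical) PROXY_LIST setting, e.g. ['http://10.0.0.1:8888'].
        return cls(crawler.settings, crawler.settings.getlist('PROXY_LIST'))

    async def get(self) -> str:
        if not self.cache:
            raise ProxyException("Not available proxy")
        # Round-robin: hand out the first proxy and rotate it to the back,
        # mirroring what RedisProxy.get() does in the diff below.
        proxy = self.cache.pop(0)
        self.cache.append(proxy)
        return proxy
```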
aioscrapy/proxy/redis.py CHANGED
@@ -1,6 +1,16 @@
+"""
+Redis-based proxy implementation for aioscrapy.
+aioscrapy的基于Redis的代理实现。
+
+This module provides a Redis-based implementation of the proxy handler interface.
+It fetches proxies from a Redis sorted set and manages them for use in requests.
+此模块提供了代理处理程序接口的基于Redis的实现。
+它从Redis有序集合中获取代理,并管理它们以用于请求。
+"""
+
 import asyncio
 import time
-from typing import Optional
+from typing import Optional, Any
 
 from aioscrapy.db import db_manager
 from aioscrapy.exceptions import ProxyException
@@ -10,13 +20,47 @@ from aioscrapy.utils.tools import create_task
 
 
 class RedisProxy(AbsProxy):
+    """
+    Redis-based proxy handler implementation.
+    基于Redis的代理处理程序实现。
+
+    This class implements the AbsProxy interface using Redis as a backend.
+    It fetches proxies from a Redis sorted set and manages them for use in requests.
+    此类使用Redis作为后端实现AbsProxy接口。
+    它从Redis有序集合中获取代理,并管理它们以用于请求。
+
+    Attributes:
+        crawler: The crawler instance.
+            爬虫实例。
+        proxy_queue: The Redis client used to fetch proxies.
+            用于获取代理的Redis客户端。
+        proxy_key: The key of the Redis sorted set containing proxies.
+            包含代理的Redis有序集合的键。
+        lock: An asyncio lock to prevent concurrent proxy fetching.
+            防止并发代理获取的asyncio锁。
+    """
+
     def __init__(
             self,
             settings,
             crawler,
-            proxy_queue: Optional["Redis"] = None,
+            proxy_queue: Optional[Any] = None,
             proxy_key: Optional[str] = None
     ):
+        """
+        Initialize the Redis proxy handler.
+        初始化Redis代理处理程序。
+
+        Args:
+            settings: The aioscrapy settings object.
+                aioscrapy设置对象。
+            crawler: The crawler instance.
+                爬虫实例。
+            proxy_queue: The Redis client used to fetch proxies.
+                用于获取代理的Redis客户端。
+            proxy_key: The key of the Redis sorted set containing proxies.
+                包含代理的Redis有序集合的键。
+        """
         super().__init__(settings)
         self.crawler = crawler
         self.proxy_queue = proxy_queue
@@ -25,11 +69,46 @@ class RedisProxy(AbsProxy):
 
     @classmethod
     async def from_crawler(cls, crawler) -> "RedisProxy":
+        """
+        Create a RedisProxy instance from a crawler.
+        从爬虫创建RedisProxy实例。
+
+        This class method creates a RedisProxy instance from a crawler.
+        It retrieves the necessary settings and initializes the Redis client.
+        此类方法从爬虫创建RedisProxy实例。
+        它检索必要的设置并初始化Redis客户端。
+
+        Args:
+            crawler: The crawler instance.
+                爬虫实例。
+
+        Returns:
+            RedisProxy: A RedisProxy instance.
+                RedisProxy实例。
+
+        Raises:
+            AssertionError: If PROXY_KEY is not configured in settings.
+                如果在设置中未配置PROXY_KEY。
+        """
+        # Get settings from crawler
+        # 从爬虫获取设置
         settings = crawler.settings
+
+        # Get proxy key from settings
+        # 从设置获取代理键
         proxy_key = settings.get('PROXY_KEY')
         assert proxy_key is not None, "Not configured:'PROXY_KEY'"
+
+        # Get Redis alias from settings, default to 'proxy'
+        # 从设置获取Redis别名,默认为'proxy'
         alias = settings.get("PROXY_QUEUE_ALIAS", 'proxy')
+
+        # Get Redis client
+        # 获取Redis客户端
         proxy_queue = db_manager.redis(alias)
+
+        # Create and return RedisProxy instance
+        # 创建并返回RedisProxy实例
         return cls(
             settings,
             crawler,
@@ -38,6 +117,23 @@ class RedisProxy(AbsProxy):
         )
 
     async def fill_proxy(self, redis_key: str, count: int) -> None:
+        """
+        Fill the proxy cache from Redis.
+        从Redis填充代理缓存。
+
+        This method fetches proxies from a Redis sorted set and adds them to the cache.
+        It uses a Lua script to randomly select proxies from the sorted set.
+        此方法从Redis有序集合中获取代理并将它们添加到缓存中。
+        它使用Lua脚本从有序集合中随机选择代理。
+
+        Args:
+            redis_key: The key of the Redis sorted set containing proxies.
+                包含代理的Redis有序集合的键。
+            count: The number of proxies to fetch.
+                要获取的代理数量。
+        """
+        # Lua script to randomly select proxies from a sorted set
+        # Lua脚本,用于从有序集合中随机选择代理
         script = f"""
             local redis_key = KEYS[1]
             local min_score = ARGV[1]
@@ -50,23 +146,61 @@ class RedisProxy(AbsProxy):
             end
             return redis.call('ZRANGEBYSCORE', redis_key, min_score, max_score, 'LIMIT', start, {count})
         """
+        # Register and execute the script
+        # 注册并执行脚本
         cmd_script = self.proxy_queue.register_script(script)
+
+        # Try to get proxies with score between 100 and 100 (high quality proxies)
+        # 尝试获取分数在100到100之间的代理(高质量代理)
         result = await cmd_script(keys=[redis_key], args=[100, 100])
+
+        # If no high quality proxies are available, get any proxies
+        # 如果没有高质量代理可用,获取任何代理
         if not result:
             result = await cmd_script(keys=[redis_key], args=[0, 100])
+
+        # Format proxies and add them to the cache
+        # 格式化代理并将它们添加到缓存中
         proxies = [ip.decode() if ip.decode().startswith('http') else f'http://{ip.decode()}' for ip in result]
         self.cache.extend(proxies)
         logger.info(f'Get proxy from redis: {proxies}')
 
     async def get(self) -> str:
+        """
+        Get a proxy from the cache.
+        从缓存中获取代理。
+
+        This method returns a proxy from the cache. If the cache is running low,
+        it fills the cache with more proxies from Redis. If no proxies are available,
+        it stops the crawler and raises an exception.
+        此方法从缓存中返回代理。如果缓存不足,它会从Redis中填充更多代理到缓存中。
+        如果没有可用的代理,它会停止爬虫并引发异常。
+
+        Returns:
+            str: A proxy string in the format 'scheme://host:port'.
+                格式为'scheme://host:port'的代理字符串。
+
+        Raises:
+            ProxyException: If no proxies are available.
+                如果没有可用的代理。
+        """
+        # If the cache is running low, fill it with more proxies
+        # 如果缓存不足,用更多代理填充它
         if len(self.cache) < self.min_count:
             async with self.lock:
+                # Check again inside the lock to avoid race conditions
+                # 在锁内再次检查以避免竞争条件
                 len(self.cache) < self.min_count and await self.fill_proxy(self.proxy_key, self.max_count - len(self.cache))
+
         try:
+            # Get a proxy from the cache and move it to the end
+            # 从缓存中获取代理并将其移到末尾
             proxy = self.cache.pop(0)
             self.cache.append(proxy)
             return proxy
         except IndexError:
+            # If no proxies are available, stop the crawler and raise an exception
+            # 如果没有可用的代理,停止爬虫并引发异常
             logger.warning("Not available proxy, Closing spider")
             create_task(self.crawler.engine.stop(reason="Not available proxy"))
             raise ProxyException("Not available proxy")
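For context, the sketch below lists the settings RedisProxy reads (names taken from the diff above) and shows one way the backing sorted set could be seeded. The Redis address, key name, and proxy endpoints are placeholders, and the seeding snippet assumes redis-py 5.x's `redis.asyncio` client rather than anything bundled with aioscrapy.

```python
# Illustrative sketch only: seed the sorted set that RedisProxy.fill_proxy()
# reads. Host, key, and proxy endpoints are placeholders.
import asyncio
import redis.asyncio as aioredis

# Settings consumed by AbsProxy/RedisProxy, per the code above:
#   USE_PROXY = True               # enable proxying (AbsProxy.__init__)
#   PROXY_KEY = 'my:proxies'       # required; name of the Redis sorted set
#   PROXY_QUEUE_ALIAS = 'proxy'    # db_manager redis alias, defaults to 'proxy'
#   PROXY_MAX_COUNT = 16           # upper bound of the in-memory proxy cache
#   PROXY_MIN_COUNT = 1            # refill threshold for the in-memory cache


async def seed() -> None:
    r = aioredis.Redis(host='127.0.0.1', port=6379)
    # Members may be 'host:port' or full URLs; fill_proxy() prefixes 'http://'
    # when the scheme is missing. A score of 100 marks a preferred proxy.
    await r.zadd('my:proxies', {
        'http://10.0.0.1:8888': 100,
        '10.0.0.2:8888': 60,
    })
    await r.aclose()


if __name__ == '__main__':
    asyncio.run(seed())
```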