aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
  2. aio_scrapy-2.1.7.dist-info/METADATA +147 -0
  3. aio_scrapy-2.1.7.dist-info/RECORD +134 -0
  4. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
  5. aioscrapy/VERSION +1 -1
  6. aioscrapy/cmdline.py +438 -5
  7. aioscrapy/core/downloader/__init__.py +522 -17
  8. aioscrapy/core/downloader/handlers/__init__.py +187 -5
  9. aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
  10. aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
  11. aioscrapy/core/downloader/handlers/httpx.py +135 -5
  12. aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
  13. aioscrapy/core/downloader/handlers/requests.py +120 -2
  14. aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
  15. aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
  16. aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
  17. aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
  18. aioscrapy/core/engine.py +381 -20
  19. aioscrapy/core/scheduler.py +350 -36
  20. aioscrapy/core/scraper.py +509 -33
  21. aioscrapy/crawler.py +392 -10
  22. aioscrapy/db/__init__.py +149 -0
  23. aioscrapy/db/absmanager.py +212 -6
  24. aioscrapy/db/aiomongo.py +292 -10
  25. aioscrapy/db/aiomysql.py +363 -10
  26. aioscrapy/db/aiopg.py +299 -2
  27. aioscrapy/db/aiorabbitmq.py +444 -4
  28. aioscrapy/db/aioredis.py +260 -11
  29. aioscrapy/dupefilters/__init__.py +110 -5
  30. aioscrapy/dupefilters/disk.py +124 -2
  31. aioscrapy/dupefilters/redis.py +598 -32
  32. aioscrapy/exceptions.py +151 -13
  33. aioscrapy/http/__init__.py +1 -1
  34. aioscrapy/http/headers.py +237 -3
  35. aioscrapy/http/request/__init__.py +257 -11
  36. aioscrapy/http/request/form.py +83 -3
  37. aioscrapy/http/request/json_request.py +121 -9
  38. aioscrapy/http/response/__init__.py +306 -33
  39. aioscrapy/http/response/html.py +42 -3
  40. aioscrapy/http/response/text.py +496 -49
  41. aioscrapy/http/response/web_driver.py +144 -0
  42. aioscrapy/http/response/xml.py +45 -3
  43. aioscrapy/libs/downloader/defaultheaders.py +66 -2
  44. aioscrapy/libs/downloader/downloadtimeout.py +91 -2
  45. aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
  46. aioscrapy/libs/downloader/retry.py +192 -6
  47. aioscrapy/libs/downloader/stats.py +142 -0
  48. aioscrapy/libs/downloader/useragent.py +93 -2
  49. aioscrapy/libs/extensions/closespider.py +166 -4
  50. aioscrapy/libs/extensions/corestats.py +151 -1
  51. aioscrapy/libs/extensions/logstats.py +145 -1
  52. aioscrapy/libs/extensions/metric.py +370 -1
  53. aioscrapy/libs/extensions/throttle.py +235 -1
  54. aioscrapy/libs/pipelines/__init__.py +345 -2
  55. aioscrapy/libs/pipelines/csv.py +242 -0
  56. aioscrapy/libs/pipelines/excel.py +545 -0
  57. aioscrapy/libs/pipelines/mongo.py +132 -0
  58. aioscrapy/libs/pipelines/mysql.py +67 -0
  59. aioscrapy/libs/pipelines/pg.py +67 -0
  60. aioscrapy/libs/spider/depth.py +141 -3
  61. aioscrapy/libs/spider/httperror.py +144 -4
  62. aioscrapy/libs/spider/offsite.py +202 -2
  63. aioscrapy/libs/spider/referer.py +396 -21
  64. aioscrapy/libs/spider/urllength.py +97 -1
  65. aioscrapy/link.py +115 -8
  66. aioscrapy/logformatter.py +199 -8
  67. aioscrapy/middleware/absmanager.py +328 -2
  68. aioscrapy/middleware/downloader.py +218 -0
  69. aioscrapy/middleware/extension.py +50 -1
  70. aioscrapy/middleware/itempipeline.py +96 -0
  71. aioscrapy/middleware/spider.py +360 -7
  72. aioscrapy/process.py +200 -0
  73. aioscrapy/proxy/__init__.py +142 -3
  74. aioscrapy/proxy/redis.py +136 -2
  75. aioscrapy/queue/__init__.py +168 -16
  76. aioscrapy/scrapyd/runner.py +124 -3
  77. aioscrapy/serializer.py +182 -2
  78. aioscrapy/settings/__init__.py +610 -128
  79. aioscrapy/settings/default_settings.py +314 -14
  80. aioscrapy/signalmanager.py +151 -20
  81. aioscrapy/signals.py +183 -1
  82. aioscrapy/spiderloader.py +165 -12
  83. aioscrapy/spiders/__init__.py +233 -6
  84. aioscrapy/statscollectors.py +312 -1
  85. aioscrapy/utils/conf.py +345 -17
  86. aioscrapy/utils/curl.py +168 -16
  87. aioscrapy/utils/decorators.py +76 -6
  88. aioscrapy/utils/deprecate.py +212 -19
  89. aioscrapy/utils/httpobj.py +55 -3
  90. aioscrapy/utils/log.py +79 -0
  91. aioscrapy/utils/misc.py +189 -21
  92. aioscrapy/utils/ossignal.py +67 -5
  93. aioscrapy/utils/project.py +165 -3
  94. aioscrapy/utils/python.py +254 -44
  95. aioscrapy/utils/reqser.py +75 -1
  96. aioscrapy/utils/request.py +173 -12
  97. aioscrapy/utils/response.py +91 -6
  98. aioscrapy/utils/signal.py +196 -14
  99. aioscrapy/utils/spider.py +51 -4
  100. aioscrapy/utils/template.py +93 -6
  101. aioscrapy/utils/tools.py +191 -17
  102. aioscrapy/utils/trackref.py +198 -12
  103. aioscrapy/utils/url.py +341 -36
  104. aio_scrapy-2.1.4.dist-info/METADATA +0 -239
  105. aio_scrapy-2.1.4.dist-info/RECORD +0 -133
  106. aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
  107. aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
  108. aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
  109. aioscrapy/http/response/playwright.py +0 -36
  110. aioscrapy/libs/pipelines/execl.py +0 -169
  111. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
  112. {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
aioscrapy/spiders/__init__.py
@@ -1,8 +1,14 @@
+
 """
-Base class for Scrapy spiders
+Spider module for aioscrapy.
+aioscrapy的爬虫模块。
 
-See documentation in docs/topics/spiders.rst
+This module contains the base Spider class that all spiders must inherit from.
+It provides the core functionality for creating and managing spiders.
+此模块包含所有爬虫必须继承的基础Spider类。
+它提供了创建和管理爬虫的核心功能。
 """
+
 import time
 from typing import Optional, Union
 
@@ -16,8 +22,22 @@ from aioscrapy.utils.url import url_is_from_spider
 
 
 class Spider(object):
-    """Base class for scrapy spiders. All spiders must inherit from this
-    class.
+    """
+    Base class for aioscrapy spiders. All spiders must inherit from this class.
+    aioscrapy爬虫的基类。所有爬虫必须继承自此类。
+
+    This class provides the core functionality for creating and managing spiders,
+    including request generation, response parsing, and signal handling.
+    此类提供了创建和管理爬虫的核心功能,包括请求生成、响应解析和信号处理。
+
+    Attributes:
+        name: The name of the spider. Must be unique. 爬虫的名称,必须唯一。
+        proxy: Optional proxy handler. 可选的代理处理器。
+        dupefilter: Optional duplicate filter. 可选的重复过滤器。
+        custom_settings: Dictionary of settings to override project settings. 用于覆盖项目设置的设置字典。
+        stats: Statistics collector. 统计收集器。
+        pause: Whether the spider is paused. 爬虫是否暂停。
+        start_urls: List of URLs to start crawling from. 开始爬取的URL列表。
     """
 
     name: Optional[str] = None
@@ -30,6 +50,14 @@ class Spider(object):
     _pause_time: Optional[Union[int, float]] = None
 
     def __init__(self, name=None, **kwargs):
+        """
+        Initialize the spider.
+        初始化爬虫。
+
+        Args:
+            name: Spider name. 爬虫名称。
+            **kwargs: Additional arguments. 额外参数。
+        """
         if name is not None:
             self.name = name
         elif not getattr(self, 'name', None):
@@ -40,12 +68,37 @@ class Spider(object):
 
     @property
     def pause_time(self) -> int:
+        """
+        Get the time until which the spider is paused.
+        获取爬虫暂停的时间点。
+
+        If not set, defaults to current time + 600 seconds.
+        如果未设置,默认为当前时间 + 600秒。
+
+        Returns:
+            int: Unix timestamp when the pause ends.
+                 暂停结束的Unix时间戳。
+        """
         if self._pause_time is None:
             self._pause_time = 600 + int(time.time())
         return self._pause_time
 
     @pause_time.setter
     def pause_time(self, value: Union[int, float]):
+        """
+        Set the time until which the spider is paused.
+        设置爬虫暂停的时间点。
+
+        Args:
+            value: Unix timestamp or duration in seconds.
+                   Unix时间戳或持续时间(秒)。
+                - If None, pause indefinitely.
+                  如果为None,则无限期暂停。
+                - If less than current time, treated as duration.
+                  如果小于当前时间,则视为持续时间。
+                - Otherwise, treated as absolute timestamp.
+                  否则,视为绝对时间戳。
+        """
         self.pause = True
         if value is None:
             self._pause_time = float('inf')
@@ -56,18 +109,67 @@ class Spider(object):
 
     @classmethod
     async def from_crawler(cls, crawler, *args, **kwargs):
+        """
+        Create a spider instance from a crawler.
+        从爬虫引擎创建爬虫实例。
+
+        Args:
+            crawler: The crawler instance. 爬虫引擎实例。
+            *args: Additional arguments. 额外参数。
+            **kwargs: Additional keyword arguments. 额外关键字参数。
+
+        Returns:
+            Spider instance. 爬虫实例。
+        """
         spider = cls(*args, **kwargs)
         spider._set_crawler(crawler)
         return spider
 
     def _set_crawler(self, crawler):
+        """
+        Set the crawler for this spider.
+        为此爬虫设置爬虫引擎。
+
+        This method is called by the from_crawler class method to set up the crawler
+        for this spider. It connects signal handlers and sets up settings.
+        此方法由from_crawler类方法调用,为此爬虫设置爬虫引擎。
+        它连接信号处理程序并设置配置。
+
+        Args:
+            crawler: The crawler instance to use.
+                     要使用的爬虫引擎实例。
+        """
+        # Store the crawler instance
+        # 存储爬虫引擎实例
         self.crawler = crawler
+
+        # Get settings from the crawler
+        # 从爬虫引擎获取设置
         self.settings = crawler.settings
+
+        # Determine if the spider should close when idle
+        # 确定爬虫在空闲时是否应该关闭
         self.close_on_idle = self.settings.get("CLOSE_SPIDER_ON_IDLE", True)
+
+        # Connect signal handlers
+        # 连接信号处理程序
         crawler.signals.connect(self.close, signals.spider_closed)
         crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
 
     async def start_requests(self):
+        """
+        Generate initial requests for the spider.
+        生成爬虫的初始请求。
+
+        This method must return an iterable of Request objects.
+        此方法必须返回一个包含Request对象的可迭代对象。
+
+        By default, it generates Request objects from the spider's start_urls.
+        默认情况下,它从爬虫的start_urls生成Request对象。
+
+        Returns:
+            An iterable of Request objects. 包含Request对象的可迭代对象。
+        """
         if not self.start_urls and hasattr(self, 'start_url'):
             raise AttributeError(
                 "Crawling could not start: 'start_urls' not found "
@@ -78,36 +180,151 @@ class Spider(object):
             yield Request(url)
 
     async def request_from_dict(self, d: dict):
-        """继承成后重写改方法,将队列中的json根据情况构建成Request对象"""
-        pass
+        """
+        Create a Request object from a dictionary.
+        从字典创建Request对象。
+
+        This method can be overridden in subclasses to customize the request creation process.
+        It is typically used for deserializing requests from storage or message queues.
+        可以在子类中重写此方法以自定义请求创建过程。
+        它通常用于从存储或消息队列中反序列化请求。
+
+        Args:
+            d: Dictionary containing request data. 包含请求数据的字典。
+               Expected keys include:
+               预期的键包括:
+               - url: The URL to request (required)
+                 要请求的URL(必需)
+               - callback: Name of the callback method (optional)
+                 回调方法的名称(可选)
+               - method: HTTP method (optional, default: 'GET')
+                 HTTP方法(可选,默认:'GET')
+               - headers: HTTP headers (optional)
+                 HTTP头(可选)
+               - body: Request body (optional)
+                 请求体(可选)
+               - cookies: Cookies (optional)
+                 Cookie(可选)
+               - meta: Request metadata (optional)
+                 请求元数据(可选)
+
+        Returns:
+            Request: A Request object, or None if the request cannot be created.
+                     Request对象,如果无法创建请求则为None。
+        """
+        # This is a placeholder implementation that should be overridden in subclasses
+        # 这是一个应该在子类中重写的占位符实现
+        return None
 
     async def _parse(self, response: Response, **kwargs):
+        """
+        Internal parse method that calls the user-defined parse method.
+        调用用户定义的parse方法的内部解析方法。
+
+        This method is used internally by the crawler to call the spider's parse method.
+        It uses call_helper to handle the parse method call, which supports both
+        async and non-async parse methods.
+        此方法由爬虫引擎内部使用,用于调用爬虫的parse方法。
+        它使用call_helper来处理parse方法调用,支持异步和非异步parse方法。
+
+        Args:
+            response: The response to parse.
+                      要解析的响应。
+            **kwargs: Additional keyword arguments to pass to the parse method.
+                      传递给parse方法的额外关键字参数。
+
+        Returns:
+            The result of the parse method.
+            parse方法的结果。
+        """
         return await call_helper(self.parse, response)
 
     async def parse(self, response: Response):
+        """
+        Default callback used to process downloaded responses.
+        用于处理下载响应的默认回调方法。
+
+        This method must be implemented in subclasses.
+        必须在子类中实现此方法。
+
+        Args:
+            response: The response to process. 要处理的响应。
+
+        Returns:
+            An iterable of Request and/or item objects. 包含Request和/或数据项对象的可迭代对象。
+        """
         raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')
 
     @classmethod
     def update_settings(cls, settings):
+        """
+        Update settings with spider custom settings.
+        使用爬虫自定义设置更新设置。
+
+        Args:
+            settings: The settings to update. 要更新的设置。
+        """
         settings.setdict(cls.custom_settings or {}, priority='spider')
 
     @classmethod
     def handles_request(cls, request):
+        """
+        Check if this spider can handle the given request.
+        检查此爬虫是否可以处理给定的请求。
+
+        Args:
+            request: The request to check. 要检查的请求。
+
+        Returns:
+            True if this spider can handle the request, False otherwise.
+            如果此爬虫可以处理该请求,则返回True,否则返回False。
+        """
         return url_is_from_spider(request.url, cls)
 
     @staticmethod
     def close(spider, reason):
+        """
+        Signal handler for the spider_closed signal.
+        爬虫关闭信号的处理函数。
+
+        Args:
+            spider: The spider being closed. 正在关闭的爬虫。
+            reason: The reason for closing the spider. 关闭爬虫的原因。
+        """
         closed = getattr(spider, 'closed', None)
         if callable(closed):
             return closed(reason)
 
     def __str__(self):
+        """
+        Return a string representation of the spider.
+        返回爬虫的字符串表示。
+
+        Returns:
+            str: A string representation of the spider, including its class name,
+                 name, and memory address.
+                 爬虫的字符串表示,包括其类名、名称和内存地址。
+        """
         return f"<{type(self).__name__} {self.name!r} at 0x{id(self):0x}>"
 
+    # Make __repr__ use the same implementation as __str__
+    # 使__repr__使用与__str__相同的实现
     __repr__ = __str__
 
     @classmethod
     def start(cls, setting_path=None, use_windows_selector_eventLoop: bool = False):
+        """
+        Start crawling using this spider.
+        使用此爬虫开始爬取。
+
+        This is a convenience method that creates a CrawlerProcess, adds the spider,
+        and starts the crawling process.
+        这是一个便捷方法,它创建一个CrawlerProcess,添加爬虫,并启动爬取过程。
+
+        Args:
+            setting_path: Path to settings module. 设置模块的路径。
+            use_windows_selector_eventLoop: Whether to use Windows selector event loop. 是否使用Windows选择器事件循环。
+        """
         from aioscrapy.crawler import CrawlerProcess
         from aioscrapy.utils.project import get_project_settings
 
@@ -119,5 +336,15 @@ class Spider(object):
         cp.start(use_windows_selector_eventLoop)
 
     def spider_idle(self):
+        """
+        Signal handler for the spider_idle signal.
+        爬虫空闲信号的处理函数。
+
+        This method is called when the spider has no more requests to process.
+        当爬虫没有更多请求要处理时,调用此方法。
+
+        If CLOSE_SPIDER_ON_IDLE is False, it raises DontCloseSpider to prevent the spider from closing.
+        如果CLOSE_SPIDER_ON_IDLE为False,它会引发DontCloseSpider以防止爬虫关闭。
+        """
         if not self.close_on_idle:
             raise DontCloseSpider
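
For orientation, the sketch below exercises the Spider API that the new docstrings describe (name, start_urls, custom_settings, parse, the pause_time setter, and the Spider.start() helper). It is illustrative only, not code shipped in the package; the spider name, URL, and yielded item fields are placeholders.

# Minimal, hypothetical spider based on the API documented in the diff above.
# The name, URL, and item fields are placeholders, not values from the package.
from aioscrapy.spiders import Spider


class DemoSpider(Spider):
    name = 'demo'
    start_urls = ['https://example.com/']  # start_requests() yields one Request per URL
    custom_settings = {
        'CLOSE_SPIDER_ON_IDLE': True,  # read in _set_crawler(); governs spider_idle()
    }

    async def parse(self, response):
        # parse() is the default callback; the base class raises NotImplementedError
        # if it is not overridden.
        yield {'url': response.url}
        # Pausing, per the pause_time setter docstring:
        #   self.pause_time = 300    -> below the current timestamp, so a ~300 s pause
        #   self.pause_time = None   -> pause indefinitely (float('inf'))


if __name__ == '__main__':
    # Spider.start() builds a CrawlerProcess from the project settings and runs the crawl.
    DemoSpider.start()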