aio-scrapy 2.1.3__py3-none-any.whl → 2.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: aio-scrapy
3
- Version: 2.1.3
3
+ Version: 2.1.4
4
4
  Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
5
5
  Home-page: https://github.com/conlin-huang/aio-scrapy.git
6
6
  Author: conlin
@@ -28,6 +28,7 @@ Requires-Dist: zope.interface >=5.1.0
28
28
  Requires-Dist: redis >=4.3.1
29
29
  Requires-Dist: aiomultiprocess >=0.9.0
30
30
  Requires-Dist: loguru >=0.7.0
31
+ Requires-Dist: anyio >=3.6.2
31
32
  Provides-Extra: aio-pika
32
33
  Requires-Dist: aio-pika >=8.1.1 ; extra == 'aio-pika'
33
34
  Provides-Extra: aiomysql
@@ -1,9 +1,9 @@
1
- aioscrapy/VERSION,sha256=BuGd6tadzBa8VzzJ8ktotpkH9M_Ur7nXA3LMa0YqmGI,5
1
+ aioscrapy/VERSION,sha256=Z7BD32ByWBAJNuaSjQBe7W_NFoIm-41YzXKXt3z-bUI,5
2
2
  aioscrapy/__init__.py,sha256=esJeH66Mz9WV7XbotvZEjNn49jc589YZ_L2DKoD0JvA,858
3
3
  aioscrapy/__main__.py,sha256=rvTdJ0cQwbi29aucPj3jJRpccx5SBzvRcV7qvxvX2NQ,80
4
4
  aioscrapy/cmdline.py,sha256=1qhNg2Edl-Obmf2re2K4V8pJG7ubGfZZCzcHdKtdE_s,5159
5
5
  aioscrapy/crawler.py,sha256=6-ptivIjIGKdojOlZqXV0hV3x1Gont81tOC5u5JqIME,10330
6
- aioscrapy/exceptions.py,sha256=k1daw1hV_aqsaIKKibdyqcNPyVn5oUb07wmB2DRxfjs,2111
6
+ aioscrapy/exceptions.py,sha256=B1UZUXF_dZNJ5b1wltDemijK8iCNpH-EF2sOooH9AsA,2628
7
7
  aioscrapy/link.py,sha256=fXMqsHvYEzsuYi-sNDcElS7jV6Lusq0tjPkPUGOlyZw,1867
8
8
  aioscrapy/logformatter.py,sha256=y3etd28ACbpTbcGprJ_cQ086gxQY3k_QX_yxYFoF1AU,3028
9
9
  aioscrapy/process.py,sha256=uFkj2wzaBu0Vs3pGFKdJ4R-0Gn7hROX6EU-B5zddnyQ,1603
@@ -24,15 +24,15 @@ aioscrapy/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
24
24
  aioscrapy/core/engine.py,sha256=h02-K2lQqlCxvNIlURgPpnhHCbyiJRIWrFJt5Ys7vZY,9843
25
25
  aioscrapy/core/scheduler.py,sha256=czCx5oHknXuHadpISTfoEMSKXXrlwJTmLTUQtHdtaTc,7407
26
26
  aioscrapy/core/scraper.py,sha256=eS_qEX_Q9fXZnK8Ou1wDtBJhRKk9JoUSnbn4c04u1cA,10750
27
- aioscrapy/core/downloader/__init__.py,sha256=2EUQHGS6Q8fy9eoDT_kmA-eYZEUBV9OkmTppfG05AFA,10039
28
- aioscrapy/core/downloader/handlers/__init__.py,sha256=CriaX2Cp4jUqzDDGZDB7HiIEgUWt2pnYVho6HMV6sJ0,3198
29
- aioscrapy/core/downloader/handlers/aiohttp.py,sha256=KY04ATlu2cTVF3Uxtvqpx1cGxOn2QVIGWoj7S3_UwF4,4266
30
- aioscrapy/core/downloader/handlers/curl_cffi.py,sha256=LpwWdHxctDOxVvhzYcgG1qGhPtw9DoqjUKEdrZlerAQ,2579
31
- aioscrapy/core/downloader/handlers/httpx.py,sha256=4XyLpiaXbO0AtPqctFJgYFX-5rJrKyf469YFpVJdcRY,3387
32
- aioscrapy/core/downloader/handlers/pyhttpx.py,sha256=phpWXtuvP-9tve3MDnWeto2Dmo25UTxgLYc12QEvMt4,2562
33
- aioscrapy/core/downloader/handlers/requests.py,sha256=UnV1WDyET8WMwyYYU0DcL_r420uz_0dK1ej5xAl2fwk,2317
34
- aioscrapy/core/downloader/handlers/playwright/__init__.py,sha256=PXS40Vv3KsV77QoyWCrWcHL6mItXxpTzwaCATB6RXiQ,4504
35
- aioscrapy/core/downloader/handlers/playwright/driverpool.py,sha256=qfIdGjORdn1MookO-ucIJ8NOeLrIQ0y0UJY_xuMzM_8,1374
27
+ aioscrapy/core/downloader/__init__.py,sha256=OCg21payZbmQPcZ1_Wrhhgos7angRB-w9qya3CxrmSU,10040
28
+ aioscrapy/core/downloader/handlers/__init__.py,sha256=KwID2qt3dhFvvBIF3CJnPR4w4a4_qz4uKaXgQI5b59o,3199
29
+ aioscrapy/core/downloader/handlers/aiohttp.py,sha256=qt8Wys8NrbLatBqEob5lzjKmy_C2Nl9XxLyA2npdv6A,4277
30
+ aioscrapy/core/downloader/handlers/curl_cffi.py,sha256=hYlUf2BzS6GrWaPKLJhuqj8fxOt9AANBoeAp9vx7-KU,2590
31
+ aioscrapy/core/downloader/handlers/httpx.py,sha256=HqidohwQr8G7GNhrS1v23rYmD2dzNW69bObcO0X_6Qs,3398
32
+ aioscrapy/core/downloader/handlers/pyhttpx.py,sha256=djxaNoYVD6TJSN3UruviQBx8_oLVtCn4d__qwsoxRJA,2573
33
+ aioscrapy/core/downloader/handlers/requests.py,sha256=RdRi6Izj-jvWa_8T8axW9EzcUfMqfman7eFKTFjOro4,2328
34
+ aioscrapy/core/downloader/handlers/playwright/__init__.py,sha256=xjPNlvM0zzR8lOIzgJeDnq1p0x1VHGhGiyMQmihdkmM,4676
35
+ aioscrapy/core/downloader/handlers/playwright/driverpool.py,sha256=IlkYB8TlSuDq7-sTLlGvtAsFMalNvzpTJR7wEMYe2jE,1595
36
36
  aioscrapy/core/downloader/handlers/playwright/webdriver.py,sha256=QFtAT--2Ea_Gg4x1EhMidyOwQjbqljUl4sKGB_hAA00,3530
37
37
  aioscrapy/db/__init__.py,sha256=ISBXM_-cCf5CgTLc3i_emLxV163-ZAbgttkQiRxokD0,2456
38
38
  aioscrapy/db/absmanager.py,sha256=6vlPcjDHOtZCHePiUYPe6ezRnM-TB4XLhmuw7APaWDk,1162
@@ -59,7 +59,7 @@ aioscrapy/libs/downloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
59
59
  aioscrapy/libs/downloader/defaultheaders.py,sha256=tg_ULA0Y-41bZKG607mowFJQGVfnZ45LdR044DsjA_A,563
60
60
  aioscrapy/libs/downloader/downloadtimeout.py,sha256=hNh3OEj7rC0ceQrv_yrhR5lb5AvfxJ6cspj3qsQWj4o,704
61
61
  aioscrapy/libs/downloader/ja3fingerprint.py,sha256=DgTw74GXC_Bp94eD_bwoG6A_DphUHTt7bH4glBNXyV8,1058
62
- aioscrapy/libs/downloader/retry.py,sha256=0670bPz5lc4wUsWmYlhYdGZdeflsQdFhJbnwK1g0c84,4441
62
+ aioscrapy/libs/downloader/retry.py,sha256=uKU8XuPya8Co6vTTTgs1-rFtMsZreSwz0Zo1ErgaA6I,4482
63
63
  aioscrapy/libs/downloader/stats.py,sha256=FlkS8Zm4j3SBjHb6caXwq08HvvZ37VKORGCAjlA2U38,1376
64
64
  aioscrapy/libs/downloader/useragent.py,sha256=E5x5dk9AxsSCGDDICJlTXwWXRkqAibWgesqG0VhAG8M,743
65
65
  aioscrapy/libs/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -125,9 +125,9 @@ aioscrapy/utils/template.py,sha256=HR97X4lpv2WuqhuPfzTgaBN66fYnzHVpP6zQ5IoTwcI,8
125
125
  aioscrapy/utils/tools.py,sha256=WJowViZB8XEs2CFqjVvbqXK3H5Uvf4BgWgBD_RcHMaM,2319
126
126
  aioscrapy/utils/trackref.py,sha256=0nIpelT1d5WYxALl8SGA8vHNYsh-jS0Z2lwVEAhwx8E,2019
127
127
  aioscrapy/utils/url.py,sha256=8W8tAhU7lgfPOfzKp3ejJGEcLj1i_PnA_53Jv5LpxiY,5464
128
- aio_scrapy-2.1.3.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
129
- aio_scrapy-2.1.3.dist-info/METADATA,sha256=ldFQ8PbukunEAMxkUNbPQsA2Iuo6VfyYwPQbjc7KETk,6506
130
- aio_scrapy-2.1.3.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
131
- aio_scrapy-2.1.3.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
132
- aio_scrapy-2.1.3.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
133
- aio_scrapy-2.1.3.dist-info/RECORD,,
128
+ aio_scrapy-2.1.4.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
129
+ aio_scrapy-2.1.4.dist-info/METADATA,sha256=9R1Kw1XYe7yrLJ3h4SeiV69tPphz8sTaIf2Sizfh0GU,6536
130
+ aio_scrapy-2.1.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
131
+ aio_scrapy-2.1.4.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
132
+ aio_scrapy-2.1.4.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
133
+ aio_scrapy-2.1.4.dist-info/RECORD,,
aioscrapy/VERSION CHANGED
@@ -1 +1 @@
1
- 2.1.3
1
+ 2.1.4
@@ -143,7 +143,7 @@ class Downloader(BaseDownloader):
143
143
  crawler.spider.dupefilter = df # 将指纹绑定到Spider 在解析成功的时候 调用DUPEFILTER_CLASS的success方法
144
144
  return cls(
145
145
  crawler,
146
- await call_helper(DownloadHandlerManager.for_crawler, crawler),
146
+ await call_helper(DownloadHandlerManager.from_crawler, crawler),
147
147
  await call_helper(DownloaderMiddlewareManager.from_crawler, crawler),
148
148
  proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"],
149
149
  crawler=crawler),
@@ -37,7 +37,7 @@ class DownloadHandlerManager:
37
37
  crawler.signals.connect(self._close, signals.engine_stopped)
38
38
 
39
39
  @classmethod
40
- def for_crawler(cls, crawler) -> "DownloadHandlerManager":
40
+ def from_crawler(cls, crawler) -> "DownloadHandlerManager":
41
41
  return cls(crawler)
42
42
 
43
43
  async def _get_handler(self, scheme: str) -> Optional[BaseDownloadHandler]:
@@ -37,7 +37,7 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
37
37
  try:
38
38
  return await self._download_request(request)
39
39
  except ClientError as e:
40
- raise DownloadError(e) from e
40
+ raise DownloadError(real_error=e) from e
41
41
 
42
42
  async def _download_request(self, request: Request) -> HtmlResponse:
43
43
  kwargs = {
@@ -24,7 +24,7 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
24
24
  try:
25
25
  return await self._download_request(request)
26
26
  except CurlError as e:
27
- raise DownloadError(e) from e
27
+ raise DownloadError(real_error=e) from e
28
28
 
29
29
  async def _download_request(self, request: Request) -> HtmlResponse:
30
30
  kwargs = {
@@ -32,7 +32,7 @@ class HttpxDownloadHandler(BaseDownloadHandler):
32
32
  try:
33
33
  return await self._download_request(request)
34
34
  except HttpxError as e:
35
- raise DownloadError(e) from e
35
+ raise DownloadError(real_error=e) from e
36
36
 
37
37
  async def _download_request(self, request: Request) -> HtmlResponse:
38
38
  kwargs = {
@@ -1,6 +1,10 @@
1
1
  from functools import wraps
2
2
 
3
- from playwright._impl._api_types import Error
3
+ try:
4
+ from playwright._impl._errors import Error
5
+ except ImportError:
6
+ from playwright._impl._api_types import Error
7
+
4
8
  from playwright.async_api._generated import Response as EventResponse
5
9
 
6
10
  from aioscrapy import Request, Spider
@@ -17,10 +21,11 @@ class PlaywrightHandler(BaseDownloadHandler):
17
21
  def __init__(self, settings: Settings):
18
22
  self.settings = settings
19
23
  playwright_client_args = settings.getdict('PLAYWRIGHT_CLIENT_ARGS')
24
+ use_pool = settings.getbool('PLAYWRIGHT_USE_POOL', True)
20
25
  self.wait_until = playwright_client_args.get('wait_until', 'domcontentloaded')
21
26
  self.url_regexes = playwright_client_args.pop('url_regexes', [])
22
27
  pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
23
- self._webdriver_pool = WebDriverPool(pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)
28
+ self._webdriver_pool = WebDriverPool(use_pool=use_pool, pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)
24
29
 
25
30
  @classmethod
26
31
  def from_settings(cls, settings: Settings):
@@ -30,7 +35,7 @@ class PlaywrightHandler(BaseDownloadHandler):
30
35
  try:
31
36
  return await self._download_request(request, spider)
32
37
  except Error as e:
33
- raise DownloadError(e) from e
38
+ raise DownloadError(real_error=e) from e
34
39
 
35
40
  async def _download_request(self, request: Request, spider) -> PlaywrightResponse:
36
41
  cookies = dict(request.cookies)
@@ -9,8 +9,9 @@ from aioscrapy.utils.tools import singleton
9
9
  @singleton
10
10
  class WebDriverPool:
11
11
  def __init__(
12
- self, pool_size=5, driver_cls=None, **kwargs
12
+ self, use_pool=True, pool_size=5, driver_cls=None, **kwargs
13
13
  ):
14
+ self.use_pool = use_pool
14
15
  self.pool_size = pool_size
15
16
  self.driver_cls = driver_cls
16
17
  self.kwargs = kwargs
@@ -32,6 +33,8 @@ class WebDriverPool:
32
33
 
33
34
  async def get(self, **kwargs):
34
35
  async with self.lock:
36
+ if not self.use_pool:
37
+ return await self.create_driver(**kwargs)
35
38
  if not self.is_full:
36
39
  driver = await self.create_driver(**kwargs)
37
40
  self.driver_count += 1
@@ -40,6 +43,9 @@ class WebDriverPool:
40
43
  return driver
41
44
 
42
45
  async def release(self, driver):
46
+ if not self.use_pool:
47
+ await driver.quit()
48
+ return
43
49
  await self.queue.put(driver)
44
50
 
45
51
  async def remove(self, driver):
@@ -27,7 +27,7 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
27
27
  try:
28
28
  return await self._download_request(request)
29
29
  except PyHttpxError as e:
30
- raise DownloadError(e) from e
30
+ raise DownloadError(real_error=e) from e
31
31
 
32
32
  async def _download_request(self, request: Request) -> HtmlResponse:
33
33
  kwargs = {
@@ -25,7 +25,7 @@ class RequestsDownloadHandler(BaseDownloadHandler):
25
25
  try:
26
26
  return await self._download_request(request)
27
27
  except RequestsError as e:
28
- raise DownloadError(e) from e
28
+ raise DownloadError(real_error=e) from e
29
29
 
30
30
  async def _download_request(self, request: Request) -> HtmlResponse:
31
31
  kwargs = {
aioscrapy/exceptions.py CHANGED
@@ -5,6 +5,7 @@ These exceptions are documented in docs/topics/exceptions.rst. Please don't add
5
5
  new exceptions here without documenting them there.
6
6
  """
7
7
 
8
+
8
9
  # Internal
9
10
 
10
11
 
@@ -95,4 +96,21 @@ class ProxyException(Exception):
95
96
 
96
97
  class DownloadError(Exception):
97
98
  """下载页面时发生的错误"""
98
- pass
99
+
100
+ def __init__(self, *args, real_error=None):
101
+ self.real_error = real_error
102
+ super().__init__(*args)
103
+
104
+ def __str__(self):
105
+ if not self.real_error:
106
+ return "DownloadError"
107
+
108
+ return f"{self.real_error.__class__.__module__}.{self.real_error.__class__.__name__}: {str(self.real_error)}"
109
+
110
+
111
+ if __name__ == '__main__':
112
+ e = Exception("xxx")
113
+ reason = DownloadError(real_error=e)
114
+ print(reason)
115
+ obj = reason.real_error.__class__
116
+ print(f"{obj.__module__}.{obj.__name__}")
@@ -51,7 +51,7 @@ def get_retry_request(
51
51
  if callable(reason):
52
52
  reason = reason()
53
53
  if isinstance(reason, Exception):
54
- reason = global_object_name(reason.__class__)
54
+ reason = global_object_name((getattr(reason, "real_error", None) or reason).__class__)
55
55
 
56
56
  logger.info(
57
57
  "Retrying %(request)s (failed %(retry_times)d times): %(reason)s" % {