aio-scrapy 2.1.3__py3-none-any.whl → 2.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.4.dist-info}/METADATA +2 -1
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.4.dist-info}/RECORD +18 -18
- aioscrapy/VERSION +1 -1
- aioscrapy/core/downloader/__init__.py +1 -1
- aioscrapy/core/downloader/handlers/__init__.py +1 -1
- aioscrapy/core/downloader/handlers/aiohttp.py +1 -1
- aioscrapy/core/downloader/handlers/curl_cffi.py +1 -1
- aioscrapy/core/downloader/handlers/httpx.py +1 -1
- aioscrapy/core/downloader/handlers/playwright/__init__.py +8 -3
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +7 -1
- aioscrapy/core/downloader/handlers/pyhttpx.py +1 -1
- aioscrapy/core/downloader/handlers/requests.py +1 -1
- aioscrapy/exceptions.py +19 -1
- aioscrapy/libs/downloader/retry.py +1 -1
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.4.dist-info}/LICENSE +0 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.4.dist-info}/WHEEL +0 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.4.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.3.dist-info → aio_scrapy-2.1.4.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: aio-scrapy
|
|
3
|
-
Version: 2.1.3
|
|
3
|
+
Version: 2.1.4
|
|
4
4
|
Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
|
|
5
5
|
Home-page: https://github.com/conlin-huang/aio-scrapy.git
|
|
6
6
|
Author: conlin
|
|
@@ -28,6 +28,7 @@ Requires-Dist: zope.interface >=5.1.0
|
|
|
28
28
|
Requires-Dist: redis >=4.3.1
|
|
29
29
|
Requires-Dist: aiomultiprocess >=0.9.0
|
|
30
30
|
Requires-Dist: loguru >=0.7.0
|
|
31
|
+
Requires-Dist: anyio >=3.6.2
|
|
31
32
|
Provides-Extra: aio-pika
|
|
32
33
|
Requires-Dist: aio-pika >=8.1.1 ; extra == 'aio-pika'
|
|
33
34
|
Provides-Extra: aiomysql
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
aioscrapy/VERSION,sha256=
|
|
1
|
+
aioscrapy/VERSION,sha256=Z7BD32ByWBAJNuaSjQBe7W_NFoIm-41YzXKXt3z-bUI,5
|
|
2
2
|
aioscrapy/__init__.py,sha256=esJeH66Mz9WV7XbotvZEjNn49jc589YZ_L2DKoD0JvA,858
|
|
3
3
|
aioscrapy/__main__.py,sha256=rvTdJ0cQwbi29aucPj3jJRpccx5SBzvRcV7qvxvX2NQ,80
|
|
4
4
|
aioscrapy/cmdline.py,sha256=1qhNg2Edl-Obmf2re2K4V8pJG7ubGfZZCzcHdKtdE_s,5159
|
|
5
5
|
aioscrapy/crawler.py,sha256=6-ptivIjIGKdojOlZqXV0hV3x1Gont81tOC5u5JqIME,10330
|
|
6
|
-
aioscrapy/exceptions.py,sha256=
|
|
6
|
+
aioscrapy/exceptions.py,sha256=B1UZUXF_dZNJ5b1wltDemijK8iCNpH-EF2sOooH9AsA,2628
|
|
7
7
|
aioscrapy/link.py,sha256=fXMqsHvYEzsuYi-sNDcElS7jV6Lusq0tjPkPUGOlyZw,1867
|
|
8
8
|
aioscrapy/logformatter.py,sha256=y3etd28ACbpTbcGprJ_cQ086gxQY3k_QX_yxYFoF1AU,3028
|
|
9
9
|
aioscrapy/process.py,sha256=uFkj2wzaBu0Vs3pGFKdJ4R-0Gn7hROX6EU-B5zddnyQ,1603
|
|
@@ -24,15 +24,15 @@ aioscrapy/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
|
24
24
|
aioscrapy/core/engine.py,sha256=h02-K2lQqlCxvNIlURgPpnhHCbyiJRIWrFJt5Ys7vZY,9843
|
|
25
25
|
aioscrapy/core/scheduler.py,sha256=czCx5oHknXuHadpISTfoEMSKXXrlwJTmLTUQtHdtaTc,7407
|
|
26
26
|
aioscrapy/core/scraper.py,sha256=eS_qEX_Q9fXZnK8Ou1wDtBJhRKk9JoUSnbn4c04u1cA,10750
|
|
27
|
-
aioscrapy/core/downloader/__init__.py,sha256=
|
|
28
|
-
aioscrapy/core/downloader/handlers/__init__.py,sha256=
|
|
29
|
-
aioscrapy/core/downloader/handlers/aiohttp.py,sha256=
|
|
30
|
-
aioscrapy/core/downloader/handlers/curl_cffi.py,sha256=
|
|
31
|
-
aioscrapy/core/downloader/handlers/httpx.py,sha256=
|
|
32
|
-
aioscrapy/core/downloader/handlers/pyhttpx.py,sha256=
|
|
33
|
-
aioscrapy/core/downloader/handlers/requests.py,sha256=
|
|
34
|
-
aioscrapy/core/downloader/handlers/playwright/__init__.py,sha256=
|
|
35
|
-
aioscrapy/core/downloader/handlers/playwright/driverpool.py,sha256=
|
|
27
|
+
aioscrapy/core/downloader/__init__.py,sha256=OCg21payZbmQPcZ1_Wrhhgos7angRB-w9qya3CxrmSU,10040
|
|
28
|
+
aioscrapy/core/downloader/handlers/__init__.py,sha256=KwID2qt3dhFvvBIF3CJnPR4w4a4_qz4uKaXgQI5b59o,3199
|
|
29
|
+
aioscrapy/core/downloader/handlers/aiohttp.py,sha256=qt8Wys8NrbLatBqEob5lzjKmy_C2Nl9XxLyA2npdv6A,4277
|
|
30
|
+
aioscrapy/core/downloader/handlers/curl_cffi.py,sha256=hYlUf2BzS6GrWaPKLJhuqj8fxOt9AANBoeAp9vx7-KU,2590
|
|
31
|
+
aioscrapy/core/downloader/handlers/httpx.py,sha256=HqidohwQr8G7GNhrS1v23rYmD2dzNW69bObcO0X_6Qs,3398
|
|
32
|
+
aioscrapy/core/downloader/handlers/pyhttpx.py,sha256=djxaNoYVD6TJSN3UruviQBx8_oLVtCn4d__qwsoxRJA,2573
|
|
33
|
+
aioscrapy/core/downloader/handlers/requests.py,sha256=RdRi6Izj-jvWa_8T8axW9EzcUfMqfman7eFKTFjOro4,2328
|
|
34
|
+
aioscrapy/core/downloader/handlers/playwright/__init__.py,sha256=xjPNlvM0zzR8lOIzgJeDnq1p0x1VHGhGiyMQmihdkmM,4676
|
|
35
|
+
aioscrapy/core/downloader/handlers/playwright/driverpool.py,sha256=IlkYB8TlSuDq7-sTLlGvtAsFMalNvzpTJR7wEMYe2jE,1595
|
|
36
36
|
aioscrapy/core/downloader/handlers/playwright/webdriver.py,sha256=QFtAT--2Ea_Gg4x1EhMidyOwQjbqljUl4sKGB_hAA00,3530
|
|
37
37
|
aioscrapy/db/__init__.py,sha256=ISBXM_-cCf5CgTLc3i_emLxV163-ZAbgttkQiRxokD0,2456
|
|
38
38
|
aioscrapy/db/absmanager.py,sha256=6vlPcjDHOtZCHePiUYPe6ezRnM-TB4XLhmuw7APaWDk,1162
|
|
@@ -59,7 +59,7 @@ aioscrapy/libs/downloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
|
|
|
59
59
|
aioscrapy/libs/downloader/defaultheaders.py,sha256=tg_ULA0Y-41bZKG607mowFJQGVfnZ45LdR044DsjA_A,563
|
|
60
60
|
aioscrapy/libs/downloader/downloadtimeout.py,sha256=hNh3OEj7rC0ceQrv_yrhR5lb5AvfxJ6cspj3qsQWj4o,704
|
|
61
61
|
aioscrapy/libs/downloader/ja3fingerprint.py,sha256=DgTw74GXC_Bp94eD_bwoG6A_DphUHTt7bH4glBNXyV8,1058
|
|
62
|
-
aioscrapy/libs/downloader/retry.py,sha256=
|
|
62
|
+
aioscrapy/libs/downloader/retry.py,sha256=uKU8XuPya8Co6vTTTgs1-rFtMsZreSwz0Zo1ErgaA6I,4482
|
|
63
63
|
aioscrapy/libs/downloader/stats.py,sha256=FlkS8Zm4j3SBjHb6caXwq08HvvZ37VKORGCAjlA2U38,1376
|
|
64
64
|
aioscrapy/libs/downloader/useragent.py,sha256=E5x5dk9AxsSCGDDICJlTXwWXRkqAibWgesqG0VhAG8M,743
|
|
65
65
|
aioscrapy/libs/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -125,9 +125,9 @@ aioscrapy/utils/template.py,sha256=HR97X4lpv2WuqhuPfzTgaBN66fYnzHVpP6zQ5IoTwcI,8
|
|
|
125
125
|
aioscrapy/utils/tools.py,sha256=WJowViZB8XEs2CFqjVvbqXK3H5Uvf4BgWgBD_RcHMaM,2319
|
|
126
126
|
aioscrapy/utils/trackref.py,sha256=0nIpelT1d5WYxALl8SGA8vHNYsh-jS0Z2lwVEAhwx8E,2019
|
|
127
127
|
aioscrapy/utils/url.py,sha256=8W8tAhU7lgfPOfzKp3ejJGEcLj1i_PnA_53Jv5LpxiY,5464
|
|
128
|
-
aio_scrapy-2.1.
|
|
129
|
-
aio_scrapy-2.1.
|
|
130
|
-
aio_scrapy-2.1.
|
|
131
|
-
aio_scrapy-2.1.
|
|
132
|
-
aio_scrapy-2.1.
|
|
133
|
-
aio_scrapy-2.1.
|
|
128
|
+
aio_scrapy-2.1.4.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
|
|
129
|
+
aio_scrapy-2.1.4.dist-info/METADATA,sha256=9R1Kw1XYe7yrLJ3h4SeiV69tPphz8sTaIf2Sizfh0GU,6536
|
|
130
|
+
aio_scrapy-2.1.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
|
131
|
+
aio_scrapy-2.1.4.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
|
|
132
|
+
aio_scrapy-2.1.4.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
|
|
133
|
+
aio_scrapy-2.1.4.dist-info/RECORD,,
|
aioscrapy/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
2.1.3
|
|
1
|
+
2.1.4
|
|
@@ -143,7 +143,7 @@ class Downloader(BaseDownloader):
|
|
|
143
143
|
crawler.spider.dupefilter = df # 将指纹绑定到Spider 在解析成功的时候 调用DUPEFILTER_CLASS的success方法
|
|
144
144
|
return cls(
|
|
145
145
|
crawler,
|
|
146
|
-
await call_helper(DownloadHandlerManager.
|
|
146
|
+
await call_helper(DownloadHandlerManager.from_crawler, crawler),
|
|
147
147
|
await call_helper(DownloaderMiddlewareManager.from_crawler, crawler),
|
|
148
148
|
proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"],
|
|
149
149
|
crawler=crawler),
|
|
@@ -37,7 +37,7 @@ class DownloadHandlerManager:
|
|
|
37
37
|
crawler.signals.connect(self._close, signals.engine_stopped)
|
|
38
38
|
|
|
39
39
|
@classmethod
|
|
40
|
-
def
|
|
40
|
+
def from_crawler(cls, crawler) -> "DownloadHandlerManager":
|
|
41
41
|
return cls(crawler)
|
|
42
42
|
|
|
43
43
|
async def _get_handler(self, scheme: str) -> Optional[BaseDownloadHandler]:
|
|
@@ -37,7 +37,7 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
|
|
|
37
37
|
try:
|
|
38
38
|
return await self._download_request(request)
|
|
39
39
|
except ClientError as e:
|
|
40
|
-
raise DownloadError(e) from e
|
|
40
|
+
raise DownloadError(real_error=e) from e
|
|
41
41
|
|
|
42
42
|
async def _download_request(self, request: Request) -> HtmlResponse:
|
|
43
43
|
kwargs = {
|
|
@@ -24,7 +24,7 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
|
|
|
24
24
|
try:
|
|
25
25
|
return await self._download_request(request)
|
|
26
26
|
except CurlError as e:
|
|
27
|
-
raise DownloadError(e) from e
|
|
27
|
+
raise DownloadError(real_error=e) from e
|
|
28
28
|
|
|
29
29
|
async def _download_request(self, request: Request) -> HtmlResponse:
|
|
30
30
|
kwargs = {
|
|
@@ -32,7 +32,7 @@ class HttpxDownloadHandler(BaseDownloadHandler):
|
|
|
32
32
|
try:
|
|
33
33
|
return await self._download_request(request)
|
|
34
34
|
except HttpxError as e:
|
|
35
|
-
raise DownloadError(e) from e
|
|
35
|
+
raise DownloadError(real_error=e) from e
|
|
36
36
|
|
|
37
37
|
async def _download_request(self, request: Request) -> HtmlResponse:
|
|
38
38
|
kwargs = {
|
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
from functools import wraps
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
try:
|
|
4
|
+
from playwright._impl._errors import Error
|
|
5
|
+
except ImportError:
|
|
6
|
+
from playwright._impl._api_types import Error
|
|
7
|
+
|
|
4
8
|
from playwright.async_api._generated import Response as EventResponse
|
|
5
9
|
|
|
6
10
|
from aioscrapy import Request, Spider
|
|
@@ -17,10 +21,11 @@ class PlaywrightHandler(BaseDownloadHandler):
|
|
|
17
21
|
def __init__(self, settings: Settings):
|
|
18
22
|
self.settings = settings
|
|
19
23
|
playwright_client_args = settings.getdict('PLAYWRIGHT_CLIENT_ARGS')
|
|
24
|
+
use_pool = settings.getbool('PLAYWRIGHT_USE_POOL', True)
|
|
20
25
|
self.wait_until = playwright_client_args.get('wait_until', 'domcontentloaded')
|
|
21
26
|
self.url_regexes = playwright_client_args.pop('url_regexes', [])
|
|
22
27
|
pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
|
|
23
|
-
self._webdriver_pool = WebDriverPool(pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)
|
|
28
|
+
self._webdriver_pool = WebDriverPool(use_pool=use_pool, pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)
|
|
24
29
|
|
|
25
30
|
@classmethod
|
|
26
31
|
def from_settings(cls, settings: Settings):
|
|
@@ -30,7 +35,7 @@ class PlaywrightHandler(BaseDownloadHandler):
|
|
|
30
35
|
try:
|
|
31
36
|
return await self._download_request(request, spider)
|
|
32
37
|
except Error as e:
|
|
33
|
-
raise DownloadError(e) from e
|
|
38
|
+
raise DownloadError(real_error=e) from e
|
|
34
39
|
|
|
35
40
|
async def _download_request(self, request: Request, spider) -> PlaywrightResponse:
|
|
36
41
|
cookies = dict(request.cookies)
|
|
@@ -9,8 +9,9 @@ from aioscrapy.utils.tools import singleton
|
|
|
9
9
|
@singleton
|
|
10
10
|
class WebDriverPool:
|
|
11
11
|
def __init__(
|
|
12
|
-
self, pool_size=5, driver_cls=None, **kwargs
|
|
12
|
+
self, use_pool=True, pool_size=5, driver_cls=None, **kwargs
|
|
13
13
|
):
|
|
14
|
+
self.use_pool = use_pool
|
|
14
15
|
self.pool_size = pool_size
|
|
15
16
|
self.driver_cls = driver_cls
|
|
16
17
|
self.kwargs = kwargs
|
|
@@ -32,6 +33,8 @@ class WebDriverPool:
|
|
|
32
33
|
|
|
33
34
|
async def get(self, **kwargs):
|
|
34
35
|
async with self.lock:
|
|
36
|
+
if not self.use_pool:
|
|
37
|
+
return await self.create_driver(**kwargs)
|
|
35
38
|
if not self.is_full:
|
|
36
39
|
driver = await self.create_driver(**kwargs)
|
|
37
40
|
self.driver_count += 1
|
|
@@ -40,6 +43,9 @@ class WebDriverPool:
|
|
|
40
43
|
return driver
|
|
41
44
|
|
|
42
45
|
async def release(self, driver):
|
|
46
|
+
if not self.use_pool:
|
|
47
|
+
await driver.quit()
|
|
48
|
+
return
|
|
43
49
|
await self.queue.put(driver)
|
|
44
50
|
|
|
45
51
|
async def remove(self, driver):
|
|
@@ -27,7 +27,7 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
|
|
|
27
27
|
try:
|
|
28
28
|
return await self._download_request(request)
|
|
29
29
|
except PyHttpxError as e:
|
|
30
|
-
raise DownloadError(e) from e
|
|
30
|
+
raise DownloadError(real_error=e) from e
|
|
31
31
|
|
|
32
32
|
async def _download_request(self, request: Request) -> HtmlResponse:
|
|
33
33
|
kwargs = {
|
|
@@ -25,7 +25,7 @@ class RequestsDownloadHandler(BaseDownloadHandler):
|
|
|
25
25
|
try:
|
|
26
26
|
return await self._download_request(request)
|
|
27
27
|
except RequestsError as e:
|
|
28
|
-
raise DownloadError(e) from e
|
|
28
|
+
raise DownloadError(real_error=e) from e
|
|
29
29
|
|
|
30
30
|
async def _download_request(self, request: Request) -> HtmlResponse:
|
|
31
31
|
kwargs = {
|
aioscrapy/exceptions.py
CHANGED
|
@@ -5,6 +5,7 @@ These exceptions are documented in docs/topics/exceptions.rst. Please don't add
|
|
|
5
5
|
new exceptions here without documenting them there.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
|
+
|
|
8
9
|
# Internal
|
|
9
10
|
|
|
10
11
|
|
|
@@ -95,4 +96,21 @@ class ProxyException(Exception):
|
|
|
95
96
|
|
|
96
97
|
class DownloadError(Exception):
|
|
97
98
|
"""下载页面时发生的错误"""
|
|
98
|
-
|
|
99
|
+
|
|
100
|
+
def __init__(self, *args, real_error=None):
|
|
101
|
+
self.real_error = real_error
|
|
102
|
+
super().__init__(*args)
|
|
103
|
+
|
|
104
|
+
def __str__(self):
|
|
105
|
+
if not self.real_error:
|
|
106
|
+
return "DownloadError"
|
|
107
|
+
|
|
108
|
+
return f"{self.real_error.__class__.__module__}.{self.real_error.__class__.__name__}: {str(self.real_error)}"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
if __name__ == '__main__':
|
|
112
|
+
e = Exception("xxx")
|
|
113
|
+
reason = DownloadError(real_error=e)
|
|
114
|
+
print(reason)
|
|
115
|
+
obj = reason.real_error.__class__
|
|
116
|
+
print(f"{obj.__module__}.{obj.__name__}")
|
|
@@ -51,7 +51,7 @@ def get_retry_request(
|
|
|
51
51
|
if callable(reason):
|
|
52
52
|
reason = reason()
|
|
53
53
|
if isinstance(reason, Exception):
|
|
54
|
-
reason = global_object_name(reason.__class__)
|
|
54
|
+
reason = global_object_name((getattr(reason, "real_error", None) or reason).__class__)
|
|
55
55
|
|
|
56
56
|
logger.info(
|
|
57
57
|
"Retrying %(request)s (failed %(retry_times)d times): %(reason)s" % {
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|