aio-scrapy 2.0.7__py3-none-any.whl → 2.0.9__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- {aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/METADATA +27 -27
- {aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/RECORD +19 -19
- {aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/dupefilters/__init__.py +18 -3
- aioscrapy/dupefilters/disk.py +5 -19
- aioscrapy/dupefilters/redis.py +5 -15
- aioscrapy/libs/downloader/retry.py +7 -0
- aioscrapy/queue/__init__.py +2 -2
- aioscrapy/queue/memory.py +2 -2
- aioscrapy/queue/rabbitmq.py +1 -1
- aioscrapy/queue/redis.py +3 -3
- aioscrapy/spiders/__init__.py +1 -1
- aioscrapy/utils/log.py +3 -1
- aioscrapy/utils/reqser.py +2 -2
- aioscrapy/utils/request.py +2 -2
- {aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/LICENSE +0 -0
- {aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/top_level.txt +0 -0
{aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: aio-scrapy
-Version: 2.0.7
+Version: 2.0.9
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
@@ -21,44 +21,44 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: aiohttp
 Requires-Dist: ujson
-Requires-Dist: w3lib
-Requires-Dist: parsel
-Requires-Dist: PyDispatcher
-Requires-Dist: zope.interface
-Requires-Dist: redis
-Requires-Dist: aiomultiprocess
-Requires-Dist: loguru
+Requires-Dist: w3lib >=1.17.0
+Requires-Dist: parsel >=1.5.0
+Requires-Dist: PyDispatcher >=2.0.5
+Requires-Dist: zope.interface >=5.1.0
+Requires-Dist: redis >=4.3.1
+Requires-Dist: aiomultiprocess >=0.9.0
+Requires-Dist: loguru >=0.7.0
 Provides-Extra: aio-pika
-Requires-Dist: aio-pika
+Requires-Dist: aio-pika >=8.1.1 ; extra == 'aio-pika'
 Provides-Extra: aiomysql
-Requires-Dist: aiomysql
+Requires-Dist: aiomysql >=0.1.1 ; extra == 'aiomysql'
 Requires-Dist: cryptography ; extra == 'aiomysql'
 Provides-Extra: all
-Requires-Dist: aiomysql
-Requires-Dist: httpx[http2]
-Requires-Dist: aio-pika
+Requires-Dist: aiomysql >=0.1.1 ; extra == 'all'
+Requires-Dist: httpx[http2] >=0.23.0 ; extra == 'all'
+Requires-Dist: aio-pika >=8.1.1 ; extra == 'all'
 Requires-Dist: cryptography ; extra == 'all'
-Requires-Dist: motor
-Requires-Dist: pyhttpx
-Requires-Dist: asyncpg
-Requires-Dist: XlsxWriter
-Requires-Dist: pillow
-Requires-Dist: requests
+Requires-Dist: motor >=3.1.1 ; extra == 'all'
+Requires-Dist: pyhttpx >=2.10.1 ; extra == 'all'
+Requires-Dist: asyncpg >=0.27.0 ; extra == 'all'
+Requires-Dist: XlsxWriter >=3.1.2 ; extra == 'all'
+Requires-Dist: pillow >=9.4.0 ; extra == 'all'
+Requires-Dist: requests >=2.28.2 ; extra == 'all'
 Provides-Extra: execl
-Requires-Dist: XlsxWriter
-Requires-Dist: pillow
+Requires-Dist: XlsxWriter >=3.1.2 ; extra == 'execl'
+Requires-Dist: pillow >=9.4.0 ; extra == 'execl'
 Provides-Extra: httpx
-Requires-Dist: httpx[http2]
+Requires-Dist: httpx[http2] >=0.23.0 ; extra == 'httpx'
 Provides-Extra: mongo
-Requires-Dist: motor
+Requires-Dist: motor >=3.1.1 ; extra == 'mongo'
 Provides-Extra: pg
-Requires-Dist: asyncpg
+Requires-Dist: asyncpg >=0.27.0 ; extra == 'pg'
 Provides-Extra: playwright
-Requires-Dist: playwright
+Requires-Dist: playwright >=1.31.1 ; extra == 'playwright'
 Provides-Extra: pyhttpx
-Requires-Dist: pyhttpx
+Requires-Dist: pyhttpx >=2.10.4 ; extra == 'pyhttpx'
 Provides-Extra: requests
-Requires-Dist: requests
+Requires-Dist: requests >=2.28.2 ; extra == 'requests'
 
 <!--
{aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-aioscrapy/VERSION,sha256=
+aioscrapy/VERSION,sha256=Qd60-DGk0CyAsfZTOK4DTPjIJ6aXFjyqch4-b7ff6f0,5
 aioscrapy/__init__.py,sha256=esJeH66Mz9WV7XbotvZEjNn49jc589YZ_L2DKoD0JvA,858
 aioscrapy/__main__.py,sha256=rvTdJ0cQwbi29aucPj3jJRpccx5SBzvRcV7qvxvX2NQ,80
 aioscrapy/cmdline.py,sha256=1qhNg2Edl-Obmf2re2K4V8pJG7ubGfZZCzcHdKtdE_s,5159
@@ -40,9 +40,9 @@ aioscrapy/db/aiomysql.py,sha256=-xCLfeH7RzvghY1jqREAb_Qnz9q_dVjxoHGfz7sCqbU,3799
 aioscrapy/db/aiopg.py,sha256=WG4s_2X0b8LQHbZpoIrwZeuGHNolKj-SvmvAZQlCk00,3213
 aioscrapy/db/aiorabbitmq.py,sha256=tNKl4Kx7KM7H_lOj8xfeA0uD8PuBTVzySApTEn5TyAE,5583
 aioscrapy/db/aioredis.py,sha256=UOoTRTQUvghnq29bVL8v1HvksMXYOzHaS8Btgbpn0bY,2966
-aioscrapy/dupefilters/__init__.py,sha256=
-aioscrapy/dupefilters/disk.py,sha256=
-aioscrapy/dupefilters/redis.py,sha256=
+aioscrapy/dupefilters/__init__.py,sha256=17s6Hyr_lWDFPto6wLEvRfT2TbGU2RIssTDuChzrDNA,1498
+aioscrapy/dupefilters/disk.py,sha256=EMgxeC2a6aYCGKgp4QOs5xwHp33LUsOZ8pliKBTFx1c,1551
+aioscrapy/dupefilters/redis.py,sha256=1bDqB1avfDRR9b9doDXAyxwL1Fa8LEpMYlV7YoBXBvw,4723
 aioscrapy/http/__init__.py,sha256=yeQTT5W1iwr6dKznTS5d9vnx2hsB47i9roPM57wQp_0,597
 aioscrapy/http/headers.py,sha256=H-RJ6KqOsFFFAXORfvoyz3V-ud0I8TAj5Jt5fAACcLc,1573
 aioscrapy/http/request/__init__.py,sha256=PFoFU3ncTN-gj6Rx01rjVa_744Qfv3EH29mooW6JX9U,7121
@@ -58,7 +58,7 @@ aioscrapy/libs/downloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 aioscrapy/libs/downloader/defaultheaders.py,sha256=tg_ULA0Y-41bZKG607mowFJQGVfnZ45LdR044DsjA_A,563
 aioscrapy/libs/downloader/downloadtimeout.py,sha256=hNh3OEj7rC0ceQrv_yrhR5lb5AvfxJ6cspj3qsQWj4o,704
 aioscrapy/libs/downloader/ja3fingerprint.py,sha256=DgTw74GXC_Bp94eD_bwoG6A_DphUHTt7bH4glBNXyV8,1058
-aioscrapy/libs/downloader/retry.py,sha256=
+aioscrapy/libs/downloader/retry.py,sha256=eaMig7JpSyr6QQBD6FNYpcttuGK811Dm4tJGTUIi3q8,5191
 aioscrapy/libs/downloader/stats.py,sha256=FlkS8Zm4j3SBjHb6caXwq08HvvZ37VKORGCAjlA2U38,1376
 aioscrapy/libs/downloader/useragent.py,sha256=E5x5dk9AxsSCGDDICJlTXwWXRkqAibWgesqG0VhAG8M,743
 aioscrapy/libs/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -87,15 +87,15 @@ aioscrapy/middleware/itempipeline.py,sha256=_Htfrs3vzIUfajTOzQLdGaX_4xTFtSSoFzyh
 aioscrapy/middleware/spider.py,sha256=QvV5dchOlskBn1sXKd5dj6s9zSZmlT6LibydCjfmYjU,6361
 aioscrapy/proxy/__init__.py,sha256=Cwua97Z-ezxtDSlud7mCOAV-iExY7RX_8O1oP5PS__k,1807
 aioscrapy/proxy/redis.py,sha256=LFfnnkihf6Wq1-HeRzPLVEiy5e5wxJbMY7htU-C_Pd8,2711
-aioscrapy/queue/__init__.py,sha256
-aioscrapy/queue/memory.py,sha256=
-aioscrapy/queue/rabbitmq.py,sha256=
-aioscrapy/queue/redis.py,sha256=
+aioscrapy/queue/__init__.py,sha256=MKHNOgcZRAjFHAxoKLujvsBCkB_Ne1-gz5DqNbdTYNA,2037
+aioscrapy/queue/memory.py,sha256=_Dvkd-HXwdR8B-wsEPlHNWbAgaMH9M_UGZhF21LbnHA,3140
+aioscrapy/queue/rabbitmq.py,sha256=rE1GCoIGxoaV4KAi_9Umt623A00FaHHIFV5ZH_nypzY,2516
+aioscrapy/queue/redis.py,sha256=KU31ZNciLI9xxZDxsDhtOPLtmkxZQlRPOx_1z8afdwY,4788
 aioscrapy/scrapyd/__init__.py,sha256=Ey14RVLUP7typ2XqP8RWcUum2fuFyigdhuhBBiEheIo,68
 aioscrapy/scrapyd/runner.py,sha256=tewEkdNTMrBoredCbhmdrswSrF-GWsU3MLgC__ntnzQ,1777
 aioscrapy/settings/__init__.py,sha256=GuiVhezV8U2J1B-WJwSvxxeH_1YWYD_Wighr9owC4HU,15781
 aioscrapy/settings/default_settings.py,sha256=ffGA1SKEBQtmRC7UaFcNBlZrVW9PjUwukDiARqVfTXs,5432
-aioscrapy/spiders/__init__.py,sha256=
+aioscrapy/spiders/__init__.py,sha256=vAfod_sqXs85E-QRNji_Qhf7SyWx1kXgJD8n3AhAj1g,3934
 aioscrapy/templates/project/aioscrapy.cfg,sha256=_nRHP5wtPnZaBi7wCmjWv5BgUu5NYFJZhvCTRVSipyM,112
 aioscrapy/templates/project/module/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 aioscrapy/templates/project/module/middlewares.py.tmpl,sha256=0eEf2LC0vYcWPH82HNqieYSORyUuIo3Bgl5t-neRAJ4,3469
@@ -110,13 +110,13 @@ aioscrapy/utils/curl.py,sha256=I8eZWFNgvyUiJ2YS9-s3HltGNVG8XMMU0HPhlMxuxdA,3295
 aioscrapy/utils/decorators.py,sha256=gMQArNxF9QQc1bENA0IqDchAjqmfWvHGKOyUdjXdg6A,794
 aioscrapy/utils/deprecate.py,sha256=STy55Q8kZI8q8CQUfxK4QQVu1Rs4En3rlhzWc7p7T00,5467
 aioscrapy/utils/httpobj.py,sha256=ytec7IZzsQY_GwR___051hdbOWs1ZM6S57HwcNiu2es,708
-aioscrapy/utils/log.py,sha256=
+aioscrapy/utils/log.py,sha256=NRDivw8w21J77qEUeqqLdC4sgdIKaj2UAP6lDvWGotM,1697
 aioscrapy/utils/misc.py,sha256=9NOssEl7CP_c6R9skxyXwmz4bd-nZ_gkw6F0EybeLTQ,3509
 aioscrapy/utils/ossignal.py,sha256=jAsCIKu17KV45-9dZwEkFJHF31Y13KP_zxY0x49j1jo,896
 aioscrapy/utils/project.py,sha256=cT98HaR5JaNmm-Y1UzSuzXj6B5S7GlmMshUfMhjpjJY,2905
 aioscrapy/utils/python.py,sha256=fMV3Y2s7AnbQ7TChBoQodqPNzGEdVA3J89W-arwswd4,4577
-aioscrapy/utils/reqser.py,sha256=
-aioscrapy/utils/request.py,sha256=
+aioscrapy/utils/reqser.py,sha256=qjrYut6KtvGpLLd-HDM0cncNzWCtXgpH6NyERu_5A9g,487
+aioscrapy/utils/request.py,sha256=bkFaLDeebAOp7pF-7vta9LKOB2OR2s7V9jVKfA-XlqA,2418
 aioscrapy/utils/response.py,sha256=UPR1wTTAYZkLGiiIs28kJLhlF7WPrgLuW31l9LZuYKM,1341
 aioscrapy/utils/signal.py,sha256=bkqRgGMqQ82dly_D4tDe_0pHBbc9QUxBJqSsH9RSQf0,2282
 aioscrapy/utils/spider.py,sha256=Usq3UlCaDUvXGp0ojFt39UPKFrR2rbInlJc_q0Xk7Qc,610
@@ -124,9 +124,9 @@ aioscrapy/utils/template.py,sha256=HR97X4lpv2WuqhuPfzTgaBN66fYnzHVpP6zQ5IoTwcI,8
 aioscrapy/utils/tools.py,sha256=WJowViZB8XEs2CFqjVvbqXK3H5Uvf4BgWgBD_RcHMaM,2319
 aioscrapy/utils/trackref.py,sha256=0nIpelT1d5WYxALl8SGA8vHNYsh-jS0Z2lwVEAhwx8E,2019
 aioscrapy/utils/url.py,sha256=8W8tAhU7lgfPOfzKp3ejJGEcLj1i_PnA_53Jv5LpxiY,5464
-aio_scrapy-2.0.
-aio_scrapy-2.0.
-aio_scrapy-2.0.
-aio_scrapy-2.0.
-aio_scrapy-2.0.
-aio_scrapy-2.0.
+aio_scrapy-2.0.9.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
+aio_scrapy-2.0.9.dist-info/METADATA,sha256=AzYfL1fSr0PvPyJDV7QgQ2rm3DzWaIN5WwRhtNsT8Ik,6384
+aio_scrapy-2.0.9.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+aio_scrapy-2.0.9.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
+aio_scrapy-2.0.9.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
+aio_scrapy-2.0.9.dist-info/RECORD,,
aioscrapy/VERSION
CHANGED

@@ -1 +1 @@
-2.0.7
+2.0.9
aioscrapy/dupefilters/__init__.py
CHANGED

@@ -1,6 +1,7 @@
 from abc import ABCMeta, abstractmethod
 
 from aioscrapy import Request, Spider
+from aioscrapy.utils.log import logger
 
 
 class DupeFilterBase(metaclass=ABCMeta):
@@ -19,6 +20,20 @@ class DupeFilterBase(metaclass=ABCMeta):
     async def close(self, reason: str = '') -> None:
         """ Delete data on close """
 
-
-
-
+    def log(self, request: Request, spider: Spider):
+        if self.info:
+            logger.info("Filtered duplicate request: %(request)s" % {
+                'request': request.meta.get('dupefilter_msg') or request
+            })
+        elif self.debug:
+            logger.debug("Filtered duplicate request: %(request)s" % {
+                'request': request.meta.get('dupefilter_msg') or request
+            })
+        elif self.logdupes:
+            msg = ("Filtered duplicate request: %(request)s"
+                   " - no more duplicates will be shown"
+                   " (see DUPEFILTER_DEBUG to show all duplicates)")
+            logger.debug(msg % {'request': request.meta.get('dupefilter_msg') or request})
+            self.logdupes = False
+
+        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
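The consolidated `log()` method on `DupeFilterBase` prefers a `dupefilter_msg` entry in `request.meta` over the request itself when reporting a filtered duplicate. A minimal sketch of supplying that message when building a request (URL and message text are illustrative, not taken from the package):

```python
from aioscrapy import Request

# If the scheduler's dupefilter rejects this request, the base-class log()
# reports the short message below instead of the full Request object.
req = Request(
    'https://example.com/item/1',
    meta={'dupefilter_msg': 'item 1 already queued'},
)
```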
aioscrapy/dupefilters/disk.py
CHANGED

@@ -1,20 +1,19 @@
 import os
 from typing import Optional, Set
 
-from aioscrapy import Request
+from aioscrapy import Request
 from aioscrapy.dupefilters import DupeFilterBase
-from aioscrapy.utils.log import logger
-from aioscrapy.utils.request import referer_str
 
 
 class DiskRFPDupeFilter(DupeFilterBase):
     """Request Fingerprint duplicates filter built with Disk storage"""
 
-    def __init__(self, path: Optional[str] = None, debug: bool = False):
+    def __init__(self, path: Optional[str] = None, debug: bool = False, info: bool = False):
         self.file: Optional["File object"] = None
         self.debug = debug
         self.fingerprints: Set = set()
         self.logdupes: bool = True
+        self.info: bool = info
         if path:
             self.file = open(os.path.join(path, 'requests.seen'), 'a+')
             self.file.seek(0)
@@ -23,10 +22,11 @@ class DiskRFPDupeFilter(DupeFilterBase):
     @classmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
+        info = crawler.settings.getbool('DUPEFILTER_INFO')
         path = crawler.settings.get('JOBDIR', './job_dir')
         if path and not os.path.exists(path):
             os.makedirs(path)
-        return cls(path, debug)
+        return cls(path, debug, info)
 
     async def request_seen(self, request: Request) -> bool:
         if request.fingerprint in self.fingerprints:
@@ -40,19 +40,5 @@ class DiskRFPDupeFilter(DupeFilterBase):
         if self.file:
             self.file.close()
 
-    def log(self, request: Request, spider: Spider):
-        if self.debug:
-            logger.debug("Filtered duplicate request: %(request)s (referer: %(referer)s)" % {
-                'request': request, 'referer': referer_str(request)
-            })
-        elif self.logdupes:
-            msg = ("Filtered duplicate request: %(request)s"
-                   " - no more duplicates will be shown"
-                   " (see DUPEFILTER_DEBUG to show all duplicates)")
-            logger.debug(msg % {'request': request})
-            self.logdupes = False
-
-        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
-
 
 RFPDupeFilter = DiskRFPDupeFilter
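Both bundled filters now accept an `info` flag alongside `debug`; the disk filter reads it from the `DUPEFILTER_INFO` setting in `from_crawler()`. A minimal settings sketch (values are illustrative):

```python
# settings.py - DUPEFILTER_INFO, DUPEFILTER_DEBUG and JOBDIR are the settings
# read in DiskRFPDupeFilter.from_crawler() above.
DUPEFILTER_INFO = True    # log every filtered duplicate at INFO level
DUPEFILTER_DEBUG = False  # or set this instead to log them at DEBUG level
JOBDIR = './job_dir'      # directory holding the requests.seen fingerprint file
```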
aioscrapy/dupefilters/redis.py
CHANGED

@@ -2,8 +2,6 @@ from aioscrapy import Request
 from aioscrapy.db import db_manager
 from aioscrapy.dupefilters import DupeFilterBase
 
-from aioscrapy.utils.log import logger
-
 
 class RedisRFPDupeFilter(DupeFilterBase):
     """Request Fingerprint duplicates filter built with Set of Redis"""
@@ -13,13 +11,15 @@ class RedisRFPDupeFilter(DupeFilterBase):
             server: "redis.asyncio.Redis",
             key: str,
             debug: bool = False,
-            keep_on_close: bool = True
+            keep_on_close: bool = True,
+            info: bool = False,
     ):
         self.server = server
         self.key = key
         self.debug = debug
         self.keep_on_close = keep_on_close
         self.logdupes: bool = True
+        self.info: bool = info
 
     @classmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
@@ -28,7 +28,8 @@ class RedisRFPDupeFilter(DupeFilterBase):
         keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
         key = dupefilter_key % {'spider': crawler.spider.name}
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
-
+        info = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+        instance = cls(server, key=key, debug=debug, keep_on_close=keep_on_close, info=info)
         return instance
 
     async def request_seen(self, request: Request):
@@ -41,17 +42,6 @@ class RedisRFPDupeFilter(DupeFilterBase):
     async def clear(self):
         await self.server.delete(self.key)
 
-    def log(self, request, spider):
-        if self.debug:
-            logger.debug("Filtered duplicate request: %(request)s" % {'request': request})
-        elif self.logdupes:
-            msg = ("Filtered duplicate request %(request)s"
-                   " - no more duplicates will be shown"
-                   " (see DUPEFILTER_DEBUG to show all duplicates)")
-            logger.debug(msg % {'request': request})
-            self.logdupes = False
-        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
-
 
 class HashMap(object):
     def __init__(self, m, seed):
aioscrapy/queue/__init__.py
CHANGED

@@ -37,10 +37,10 @@ class AbsQueue(metaclass=ABCMeta):
         obj = request.to_dict(spider=self.spider)
         return self.serializer.dumps(obj)
 
-    def _decode_request(self, encoded_request: Any) -> aioscrapy.Request:
+    async def _decode_request(self, encoded_request: Any) -> aioscrapy.Request:
         """Decode an request previously encoded"""
         obj = self.serializer.loads(encoded_request)
-        return request_from_dict(obj, spider=self.spider)
+        return await request_from_dict(obj, spider=self.spider)
 
     def __len__(self) -> None:
         """Return the length of the queue"""
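`AbsQueue._decode_request()` is now a coroutine, which is why every queue implementation below yields `await self._decode_request(...)`. A minimal sketch of what a custom queue's `pop()` has to do (the `self.container.get()` backend call is hypothetical):

```python
async def pop(self, count: int = 1):
    for _ in range(count):
        data = await self.container.get()  # hypothetical storage backend call
        if not data:
            break
        # _decode_request() must now be awaited before the Request is yielded
        yield await self._decode_request(data)
```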
aioscrapy/queue/memory.py
CHANGED

@@ -58,7 +58,7 @@ class MemoryQueueBase(AbsQueue):
                 data = self.container.get_nowait()
             except QueueEmpty:
                 break
-            yield self._decode_request(data)
+            yield await self._decode_request(data)
 
     async def clear(self, timeout: int = 0) -> None:
         self.container = self.get_queue(self.max_size)
@@ -93,7 +93,7 @@ class MemoryPriorityQueue(MemoryFifoQueue):
                 score, data = self.container.get_nowait()
             except QueueEmpty:
                 break
-            yield self._decode_request(data)
+            yield await self._decode_request(data)
 
 
 SpiderQueue = MemoryFifoQueue
aioscrapy/queue/rabbitmq.py
CHANGED

@@ -57,7 +57,7 @@ class RabbitMqPriorityQueue(AbsQueue):
     async def pop(self, count: int = 1) -> Optional[aioscrapy.Request]:
         result = await self.container.get_message(self.key)
         if result:
-            yield self._decode_request(result)
+            yield await self._decode_request(result)
 
     async def clear(self) -> None:
         await self.container.clean_message_queue(self.key)
aioscrapy/queue/redis.py
CHANGED

@@ -67,7 +67,7 @@ class RedisFifoQueue(RedisQueueBase):
         results = await pipe.execute()
         for result in results:
             if result:
-                yield self._decode_request(result)
+                yield await self._decode_request(result)
 
 
 class RedisPriorityQueue(RedisQueueBase):
@@ -97,7 +97,7 @@ class RedisPriorityQueue(RedisQueueBase):
             .execute()
         )
         for result in results:
-            yield self._decode_request(result)
+            yield await self._decode_request(result)
 
 
 class RedisLifoQueue(RedisQueueBase):
@@ -124,7 +124,7 @@ class RedisLifoQueue(RedisQueueBase):
         results = await pipe.execute()
         for result in results:
             if result:
-                yield self._decode_request(result)
+                yield await self._decode_request(result)
 
 
 SpiderQueue = RedisFifoQueue
aioscrapy/spiders/__init__.py
CHANGED
aioscrapy/utils/log.py
CHANGED

@@ -7,7 +7,9 @@ from loguru import logger as _logger
 
 from aioscrapy.settings import Settings
 
-_logger.
+for _handler in _logger._core.handlers.values():
+    if _handler._name == '<stderr>':
+        _logger.remove(_handler._id)
 
 
 def configure_logging(spider: Type["Spider"], settings: Settings):
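`aioscrapy.utils.log` now removes only loguru's default `<stderr>` handler on import, so sinks registered by the application are left in place. A minimal sketch of adding project-level sinks afterwards (file name, rotation and levels are arbitrary):

```python
import sys

from loguru import logger

logger.add(sys.stderr, level="INFO")                      # console output back on
logger.add("crawl.log", rotation="50 MB", level="DEBUG")  # arbitrary file sink
```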
aioscrapy/utils/reqser.py
CHANGED

@@ -11,5 +11,5 @@ def request_to_dict(request: "aioscrapy.Request", spider: Optional["aioscrapy.Sp
     return request.to_dict(spider=spider)
 
 
-def request_from_dict(d: dict, spider: Optional["aioscrapy.Spider"] = None) -> "aioscrapy.Request":
-    return _from_dict(d, spider=spider)
+async def request_from_dict(d: dict, spider: Optional["aioscrapy.Spider"] = None) -> "aioscrapy.Request":
+    return await _from_dict(d, spider=spider)
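`request_from_dict()` is now a coroutine here and in `aioscrapy.utils.request`, so code that rebuilds requests from their serialized form must await it. A minimal round-trip sketch:

```python
from aioscrapy.utils.reqser import request_from_dict, request_to_dict


async def roundtrip(request, spider):
    data = request_to_dict(request, spider=spider)        # serialization stays synchronous
    return await request_from_dict(data, spider=spider)   # deserialization is now awaited
```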
aioscrapy/utils/request.py
CHANGED

@@ -39,13 +39,13 @@ def referer_str(request: Request) -> Optional[str]:
     return to_unicode(referrer, errors='replace')
 
 
-def request_from_dict(d: dict, *, spider: Optional[Spider] = None) -> Request:
+async def request_from_dict(d: dict, *, spider: Optional[Spider] = None) -> Request:
     """Create a :class:`~scrapy.Request` object from a dict.
 
     If a spider is given, it will try to resolve the callbacks looking at the
     spider for methods with the same name.
     """
-    d = spider.request_from_dict(d) or d
+    d = await spider.request_from_dict(d) or d
     if isinstance(d, Request):
         return d
 
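Because this helper now awaits `spider.request_from_dict(d)`, a spider that overrides that hook needs to define it as a coroutine. A minimal hypothetical override (the meta tweak is arbitrary):

```python
from aioscrapy import Spider


class ExampleSpider(Spider):
    name = 'example'  # hypothetical spider

    async def request_from_dict(self, d: dict):
        # Called while queued requests are restored; may return a modified
        # dict (or a ready Request), which the utils helper awaits.
        d.setdefault('meta', {})['restored'] = True
        return d
```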
{aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/LICENSE
File without changes

{aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/entry_points.txt
File without changes

{aio_scrapy-2.0.7.dist-info → aio_scrapy-2.0.9.dist-info}/top_level.txt
File without changes