aio-scrapy 2.0.8__py3-none-any.whl → 2.0.10__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: aio-scrapy
-Version: 2.0.8
+Version: 2.0.10
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
RECORD CHANGED
@@ -1,4 +1,4 @@
-aioscrapy/VERSION,sha256=FHhKxnUnBTC0gfINdE8plREGVAupdclqCULpdz5FL3U,5
+aioscrapy/VERSION,sha256=bkksF7-FeZMTR8EfltCUKJZNQaHaQkySSXYbwvc2qdw,6
 aioscrapy/__init__.py,sha256=esJeH66Mz9WV7XbotvZEjNn49jc589YZ_L2DKoD0JvA,858
 aioscrapy/__main__.py,sha256=rvTdJ0cQwbi29aucPj3jJRpccx5SBzvRcV7qvxvX2NQ,80
 aioscrapy/cmdline.py,sha256=1qhNg2Edl-Obmf2re2K4V8pJG7ubGfZZCzcHdKtdE_s,5159
@@ -21,10 +21,10 @@ aioscrapy/commands/settings.py,sha256=sc0rwwfBQNySKX8uV3iJqv3i7SelFwNcrlHYxDupKO
 aioscrapy/commands/startproject.py,sha256=Rcc7JkN75Jp2t2aZIxBzPsWbLXChNAUSByDhcW_6Ig8,4001
 aioscrapy/commands/version.py,sha256=yqqTMlZkkiQhtbU9w_IqUWLMOAjqYlv24friEkPRQYM,485
 aioscrapy/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-aioscrapy/core/engine.py,sha256=YJC597TZwhX81qjPHBH2mkRi4oVGNImzSz58_sFiXPA,10925
+aioscrapy/core/engine.py,sha256=zW3GPigPqyrWJ_Jk7SUxD0ueV1HTuxUvweFUU4WFG-0,10926
 aioscrapy/core/scheduler.py,sha256=YCRw9j79ZOL8bijDa3IdRaw0YlMTrwXuJGzaApkN7lc,5737
 aioscrapy/core/scraper.py,sha256=M_bcizLUzWuECe7sIIZ_HJLNrPzL7dX2o-tN5nvFnCs,10304
-aioscrapy/core/downloader/__init__.py,sha256=XRpiDJg9Yc47SoFy-48HxFyuV3sZkjE30sK3DA80YJQ,9465
+aioscrapy/core/downloader/__init__.py,sha256=22TC0z49BX3YvDUPl6DKMrOonECpY5tjaWJGGEV7RbU,9574
 aioscrapy/core/downloader/handlers/__init__.py,sha256=CriaX2Cp4jUqzDDGZDB7HiIEgUWt2pnYVho6HMV6sJ0,3198
 aioscrapy/core/downloader/handlers/aiohttp.py,sha256=dFVVeGgJ1WZcE1zI4fQOZIzmrkC6l1WZcYstHmB3qYg,3942
 aioscrapy/core/downloader/handlers/httpx.py,sha256=-DfjYgfrjxMhaMpTgEOFlQRONasCXV0g6UgH3WmWcfs,3041
@@ -40,9 +40,9 @@ aioscrapy/db/aiomysql.py,sha256=-xCLfeH7RzvghY1jqREAb_Qnz9q_dVjxoHGfz7sCqbU,3799
 aioscrapy/db/aiopg.py,sha256=WG4s_2X0b8LQHbZpoIrwZeuGHNolKj-SvmvAZQlCk00,3213
 aioscrapy/db/aiorabbitmq.py,sha256=tNKl4Kx7KM7H_lOj8xfeA0uD8PuBTVzySApTEn5TyAE,5583
 aioscrapy/db/aioredis.py,sha256=UOoTRTQUvghnq29bVL8v1HvksMXYOzHaS8Btgbpn0bY,2966
-aioscrapy/dupefilters/__init__.py,sha256=bxOMWqqXuOZS8r5OXqK9JSB8Sf-LNDqfKtllTCi-Gls,725
-aioscrapy/dupefilters/disk.py,sha256=z4PcayzR1zMxXkCR1pJAJvzcvzsfpHM64ibYQltnOMo,2178
-aioscrapy/dupefilters/redis.py,sha256=rYmKnjoSZlOmAABkdrFa9HUSy3F6VBre_7z6u8YpMCs,5160
+aioscrapy/dupefilters/__init__.py,sha256=17s6Hyr_lWDFPto6wLEvRfT2TbGU2RIssTDuChzrDNA,1498
+aioscrapy/dupefilters/disk.py,sha256=EMgxeC2a6aYCGKgp4QOs5xwHp33LUsOZ8pliKBTFx1c,1551
+aioscrapy/dupefilters/redis.py,sha256=cUuM68dEM1_ki2eOzZ6pAvmLZlAP_tC4lx73Ufmg_Bs,4812
 aioscrapy/http/__init__.py,sha256=yeQTT5W1iwr6dKznTS5d9vnx2hsB47i9roPM57wQp_0,597
 aioscrapy/http/headers.py,sha256=H-RJ6KqOsFFFAXORfvoyz3V-ud0I8TAj5Jt5fAACcLc,1573
 aioscrapy/http/request/__init__.py,sha256=PFoFU3ncTN-gj6Rx01rjVa_744Qfv3EH29mooW6JX9U,7121
@@ -58,7 +58,7 @@ aioscrapy/libs/downloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 aioscrapy/libs/downloader/defaultheaders.py,sha256=tg_ULA0Y-41bZKG607mowFJQGVfnZ45LdR044DsjA_A,563
 aioscrapy/libs/downloader/downloadtimeout.py,sha256=hNh3OEj7rC0ceQrv_yrhR5lb5AvfxJ6cspj3qsQWj4o,704
 aioscrapy/libs/downloader/ja3fingerprint.py,sha256=DgTw74GXC_Bp94eD_bwoG6A_DphUHTt7bH4glBNXyV8,1058
-aioscrapy/libs/downloader/retry.py,sha256=onItmt4s_diH3v1iMJiTitMn12DflwerSK2fIgJyBsY,5082
+aioscrapy/libs/downloader/retry.py,sha256=eaMig7JpSyr6QQBD6FNYpcttuGK811Dm4tJGTUIi3q8,5191
 aioscrapy/libs/downloader/stats.py,sha256=FlkS8Zm4j3SBjHb6caXwq08HvvZ37VKORGCAjlA2U38,1376
 aioscrapy/libs/downloader/useragent.py,sha256=E5x5dk9AxsSCGDDICJlTXwWXRkqAibWgesqG0VhAG8M,743
 aioscrapy/libs/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -110,7 +110,7 @@ aioscrapy/utils/curl.py,sha256=I8eZWFNgvyUiJ2YS9-s3HltGNVG8XMMU0HPhlMxuxdA,3295
 aioscrapy/utils/decorators.py,sha256=gMQArNxF9QQc1bENA0IqDchAjqmfWvHGKOyUdjXdg6A,794
 aioscrapy/utils/deprecate.py,sha256=STy55Q8kZI8q8CQUfxK4QQVu1Rs4En3rlhzWc7p7T00,5467
 aioscrapy/utils/httpobj.py,sha256=ytec7IZzsQY_GwR___051hdbOWs1ZM6S57HwcNiu2es,708
-aioscrapy/utils/log.py,sha256=U4BqJQ3p-s_lYbqgln_9FHeUegKF7T1MTrN1KTtn92o,1592
+aioscrapy/utils/log.py,sha256=NRDivw8w21J77qEUeqqLdC4sgdIKaj2UAP6lDvWGotM,1697
 aioscrapy/utils/misc.py,sha256=9NOssEl7CP_c6R9skxyXwmz4bd-nZ_gkw6F0EybeLTQ,3509
 aioscrapy/utils/ossignal.py,sha256=jAsCIKu17KV45-9dZwEkFJHF31Y13KP_zxY0x49j1jo,896
 aioscrapy/utils/project.py,sha256=cT98HaR5JaNmm-Y1UzSuzXj6B5S7GlmMshUfMhjpjJY,2905
@@ -124,9 +124,9 @@ aioscrapy/utils/template.py,sha256=HR97X4lpv2WuqhuPfzTgaBN66fYnzHVpP6zQ5IoTwcI,8
 aioscrapy/utils/tools.py,sha256=WJowViZB8XEs2CFqjVvbqXK3H5Uvf4BgWgBD_RcHMaM,2319
 aioscrapy/utils/trackref.py,sha256=0nIpelT1d5WYxALl8SGA8vHNYsh-jS0Z2lwVEAhwx8E,2019
 aioscrapy/utils/url.py,sha256=8W8tAhU7lgfPOfzKp3ejJGEcLj1i_PnA_53Jv5LpxiY,5464
-aio_scrapy-2.0.8.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
-aio_scrapy-2.0.8.dist-info/METADATA,sha256=AzuL_eshj6vpMA1gAPpc18nTm96ptt5Las7oJ1s3U38,6384
-aio_scrapy-2.0.8.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
-aio_scrapy-2.0.8.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
-aio_scrapy-2.0.8.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
-aio_scrapy-2.0.8.dist-info/RECORD,,
+aio_scrapy-2.0.10.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
+aio_scrapy-2.0.10.dist-info/METADATA,sha256=qMfSjJmZpj8xAaoGdjEC-oNULa4wYcWFwgJJm8wBQ3U,6385
+aio_scrapy-2.0.10.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
+aio_scrapy-2.0.10.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
+aio_scrapy-2.0.10.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
+aio_scrapy-2.0.10.dist-info/RECORD,,
aioscrapy/VERSION CHANGED
@@ -1 +1 @@
-2.0.8
+2.0.10
aioscrapy/core/downloader/__init__.py CHANGED
@@ -126,6 +126,7 @@ class Downloader(BaseDownloader):
         self.dupefilter = dupefilter

         self.total_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS')
+        self.get_requests_count: int = self.settings.getint('GET_REQUESTS_COUNT') or self.total_concurrency
         self.domain_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS_PER_DOMAIN')
         self.ip_concurrency: int = self.settings.getint('CONCURRENT_REQUESTS_PER_IP')
         self.randomize_delay: bool = self.settings.getbool('RANDOMIZE_DOWNLOAD_DELAY')
aioscrapy/core/engine.py CHANGED
@@ -138,7 +138,7 @@ class ExecutionEngine(object):
         while self.unlock and not self._needs_backout() and self.unlock:
             self.unlock = False
             try:
-                async for request in self.scheduler.next_request(self.downloader.total_concurrency):
+                async for request in self.scheduler.next_request(self.downloader.get_requests_count):
                     if request:
                         self.slot.add_request(request)
                         await self.downloader.fetch(request)
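Taken together, the two hunks above introduce a GET_REQUESTS_COUNT setting: the engine now asks the scheduler for at most that many requests per pass, and falls back to CONCURRENT_REQUESTS when the setting is unset or zero. A minimal sketch of how a project could use it (the values are illustrative, not defaults shipped by the package):

# settings.py of an aioscrapy project (illustrative values)
CONCURRENT_REQUESTS = 16   # overall downloader concurrency, unchanged
GET_REQUESTS_COUNT = 4     # new in 2.0.10: pull at most 4 requests from the
                           # scheduler per engine iteration; unset or falsy
                           # values fall back to CONCURRENT_REQUESTS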
aioscrapy/dupefilters/__init__.py CHANGED
@@ -1,6 +1,7 @@
 from abc import ABCMeta, abstractmethod

 from aioscrapy import Request, Spider
+from aioscrapy.utils.log import logger


 class DupeFilterBase(metaclass=ABCMeta):
@@ -19,6 +20,20 @@ class DupeFilterBase(metaclass=ABCMeta):
     async def close(self, reason: str = '') -> None:
         """ Delete data on close """

-    @abstractmethod
-    def log(self, request: Request, spider: Spider) -> None:
-        """ Logs given request """
+    def log(self, request: Request, spider: Spider):
+        if self.info:
+            logger.info("Filtered duplicate request: %(request)s" % {
+                'request': request.meta.get('dupefilter_msg') or request
+            })
+        elif self.debug:
+            logger.debug("Filtered duplicate request: %(request)s" % {
+                'request': request.meta.get('dupefilter_msg') or request
+            })
+        elif self.logdupes:
+            msg = ("Filtered duplicate request: %(request)s"
+                   " - no more duplicates will be shown"
+                   " (see DUPEFILTER_DEBUG to show all duplicates)")
+            logger.debug(msg % {'request': request.meta.get('dupefilter_msg') or request})
+            self.logdupes = False
+
+        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
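The abstract log() hook becomes a concrete implementation on DupeFilterBase: duplicates are logged at INFO when self.info is set, at DEBUG when self.debug is set, and otherwise once at DEBUG before logdupes is cleared. In every branch the message prefers request.meta['dupefilter_msg'] over the request itself. A hedged sketch of how a spider might drive this; the spider name and URL are illustrative, and custom_settings follows the Scrapy convention that aioscrapy mirrors:

from aioscrapy import Request, Spider

class ExampleSpider(Spider):                        # hypothetical spider
    name = 'example'
    custom_settings = {'DUPEFILTER_INFO': True}     # log duplicates at INFO

    async def parse(self, response):
        # the custom meta message replaces the request repr in the log line,
        # e.g. "Filtered duplicate request: item 1 already crawled"
        yield Request('https://example.com/item/1',
                      meta={'dupefilter_msg': 'item 1 already crawled'})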
aioscrapy/dupefilters/disk.py CHANGED
@@ -1,20 +1,19 @@
 import os
 from typing import Optional, Set

-from aioscrapy import Request, Spider
+from aioscrapy import Request
 from aioscrapy.dupefilters import DupeFilterBase
-from aioscrapy.utils.log import logger
-from aioscrapy.utils.request import referer_str


 class DiskRFPDupeFilter(DupeFilterBase):
     """Request Fingerprint duplicates filter built with Disk storage"""

-    def __init__(self, path: Optional[str] = None, debug: bool = False):
+    def __init__(self, path: Optional[str] = None, debug: bool = False, info: bool = False):
         self.file: Optional["File object"] = None
         self.debug = debug
         self.fingerprints: Set = set()
         self.logdupes: bool = True
+        self.info: bool = info
         if path:
             self.file = open(os.path.join(path, 'requests.seen'), 'a+')
             self.file.seek(0)
@@ -23,10 +22,11 @@ class DiskRFPDupeFilter(DupeFilterBase):
     @classmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
+        info = crawler.settings.getbool('DUPEFILTER_INFO')
         path = crawler.settings.get('JOBDIR', './job_dir')
         if path and not os.path.exists(path):
             os.makedirs(path)
-        return cls(path, debug)
+        return cls(path, debug, info)

     async def request_seen(self, request: Request) -> bool:
         if request.fingerprint in self.fingerprints:
@@ -40,19 +40,5 @@ class DiskRFPDupeFilter(DupeFilterBase):
         if self.file:
             self.file.close()

-    def log(self, request: Request, spider: Spider):
-        if self.debug:
-            logger.debug("Filtered duplicate request: %(request)s (referer: %(referer)s)" % {
-                'request': request, 'referer': referer_str(request)
-            })
-        elif self.logdupes:
-            msg = ("Filtered duplicate request: %(request)s"
-                   " - no more duplicates will be shown"
-                   " (see DUPEFILTER_DEBUG to show all duplicates)")
-            logger.debug(msg % {'request': request})
-            self.logdupes = False
-
-        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
-

 RFPDupeFilter = DiskRFPDupeFilter
aioscrapy/dupefilters/redis.py CHANGED
@@ -2,8 +2,6 @@ from aioscrapy import Request
 from aioscrapy.db import db_manager
 from aioscrapy.dupefilters import DupeFilterBase

-from aioscrapy.utils.log import logger
-

 class RedisRFPDupeFilter(DupeFilterBase):
     """Request Fingerprint duplicates filter built with Set of Redis"""
@@ -13,13 +11,15 @@ class RedisRFPDupeFilter(DupeFilterBase):
             server: "redis.asyncio.Redis",
             key: str,
             debug: bool = False,
-            keep_on_close: bool = True
+            keep_on_close: bool = True,
+            info: bool = False,
     ):
         self.server = server
         self.key = key
         self.debug = debug
         self.keep_on_close = keep_on_close
         self.logdupes: bool = True
+        self.info: bool = info

     @classmethod
     def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
@@ -28,7 +28,8 @@ class RedisRFPDupeFilter(DupeFilterBase):
         keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
         key = dupefilter_key % {'spider': crawler.spider.name}
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
-        instance = cls(server, key=key, debug=debug, keep_on_close=keep_on_close)
+        info = crawler.settings.getbool('DUPEFILTER_INFO', False)
+        instance = cls(server, key=key, debug=debug, keep_on_close=keep_on_close, info=info)
         return instance

     async def request_seen(self, request: Request):
@@ -41,17 +42,6 @@ class RedisRFPDupeFilter(DupeFilterBase):
     async def clear(self):
         await self.server.delete(self.key)

-    def log(self, request, spider):
-        if self.debug:
-            logger.debug("Filtered duplicate request: %(request)s" % {'request': request})
-        elif self.logdupes:
-            msg = ("Filtered duplicate request %(request)s"
-                   " - no more duplicates will be shown"
-                   " (see DUPEFILTER_DEBUG to show all duplicates)")
-            logger.debug(msg % {'request': request})
-            self.logdupes = False
-        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
-

 class HashMap(object):
     def __init__(self, m, seed):
@@ -112,8 +102,8 @@ class BloomFilter(object):
 class RedisBloomDupeFilter(RedisRFPDupeFilter):
     """Bloom filter built with the bitis bitmap of redis"""

-    def __init__(self, server, key, debug, bit, hash_number, keep_on_close):
-        super().__init__(server, key, debug, keep_on_close)
+    def __init__(self, server, key, debug, bit, hash_number, keep_on_close, info):
+        super().__init__(server, key, debug, keep_on_close, info)
         self.bit = bit
         self.hash_number = hash_number
         self.bf = BloomFilter(server, self.key, bit, hash_number)
@@ -125,9 +115,10 @@ class RedisBloomDupeFilter(RedisRFPDupeFilter):
         keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
         key = dupefilter_key % {'spider': crawler.spider.name}
         debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+        info = crawler.settings.getbool('DUPEFILTER_INFO', False)
         bit = crawler.settings.getint('BLOOMFILTER_BIT', 30)
         hash_number = crawler.settings.getint('BLOOMFILTER_HASH_NUMBER', 6)
-        return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number, keep_on_close=keep_on_close)
+        return cls(server, key=key, debug=debug, bit=bit, hash_number=hash_number, keep_on_close=keep_on_close, info=info)

     async def request_seen(self, request: Request) -> bool:
         fp = await self.bf.exists(request.fingerprint)
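Both Redis filters now thread the new info flag from settings through from_crawler into __init__, alongside the existing debug and keep_on_close options. A settings sketch for the bloom-filter variant, assuming the Scrapy-style DUPEFILTER_CLASS setting name carries over to aioscrapy (all values illustrative):

# settings.py (illustrative values)
DUPEFILTER_CLASS = 'aioscrapy.dupefilters.redis.RedisBloomDupeFilter'
DUPEFILTER_INFO = True                 # new in 2.0.10: log duplicates at INFO
DUPEFILTER_DEBUG = False               # unchanged: log every duplicate at DEBUG
KEEP_DUPEFILTER_DATA_ON_CLOSE = True   # keep the Redis key when the spider closes
BLOOMFILTER_BIT = 30                   # size of the Redis bitmap
BLOOMFILTER_HASH_NUMBER = 6            # number of hash functions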
aioscrapy/libs/downloader/retry.py CHANGED
@@ -26,6 +26,13 @@ try:
 except ImportError:
     pass

+try:
+    from anyio import EndOfStream
+
+    NEED_RETRY_ERROR += (EndOfStream,)
+except ImportError:
+    pass
+
 try:
     from httpx import HTTPError as HttpxError

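This hunk makes anyio.EndOfStream retryable: httpx runs on anyio via httpcore, and a connection dropped mid-stream can surface as EndOfStream rather than an httpx error. The guarded import keeps anyio a soft dependency, the same idiom retry.py already uses for the other transport libraries. A self-contained sketch of the pattern; the base tuple here is illustrative, not the one retry.py defines:

# optional-dependency pattern used in retry.py: the tuple of retryable
# exceptions grows only when the optional package is importable
NEED_RETRY_ERROR = (ConnectionError, TimeoutError)   # illustrative base tuple

try:
    from anyio import EndOfStream
    NEED_RETRY_ERROR += (EndOfStream,)   # tuple concatenation, not mutation
except ImportError:
    pass                                 # anyio absent: retry set unchanged

# later, one except clause matches any of them at once:
#     except NEED_RETRY_ERROR as exc: ...retry the request...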
aioscrapy/utils/log.py CHANGED
@@ -7,7 +7,9 @@ from loguru import logger as _logger

 from aioscrapy.settings import Settings

-_logger.remove(0)
+for _handler in _logger._core.handlers.values():
+    if _handler._name == '<stderr>':
+        _logger.remove(_handler._id)


 def configure_logging(spider: Type["Spider"], settings: Settings):
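The old code called _logger.remove(0), which raises ValueError when loguru's default handler (id 0) has already been removed, for instance by a host application that configured loguru first. The new loop instead removes any handler attached to stderr, whatever its id, at the cost of touching loguru's private internals (_core.handlers, _handler._name, _handler._id). A defensive equivalent using only loguru's public API would look like this (a sketch, not what the package ships):

from loguru import logger

try:
    logger.remove(0)      # drop the default stderr handler if still present
except ValueError:
    pass                  # already removed elsewhere; nothing to do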