aio-scrapy 2.0.8__tar.gz → 2.0.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio-scrapy-2.0.8/aio_scrapy.egg-info → aio-scrapy-2.0.9}/PKG-INFO +1 -1
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9/aio_scrapy.egg-info}/PKG-INFO +1 -1
- aio-scrapy-2.0.9/aioscrapy/VERSION +1 -0
- aio-scrapy-2.0.9/aioscrapy/dupefilters/__init__.py +39 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/dupefilters/disk.py +5 -19
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/dupefilters/redis.py +5 -15
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/downloader/retry.py +7 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/log.py +3 -1
- aio-scrapy-2.0.8/aioscrapy/VERSION +0 -1
- aio-scrapy-2.0.8/aioscrapy/dupefilters/__init__.py +0 -24
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/LICENSE +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/MANIFEST.in +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/README.md +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aio_scrapy.egg-info/SOURCES.txt +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aio_scrapy.egg-info/dependency_links.txt +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aio_scrapy.egg-info/entry_points.txt +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aio_scrapy.egg-info/not-zip-safe +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aio_scrapy.egg-info/requires.txt +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aio_scrapy.egg-info/top_level.txt +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/__main__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/cmdline.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/commands/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/commands/crawl.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/commands/genspider.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/commands/list.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/commands/runspider.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/commands/settings.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/commands/startproject.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/commands/version.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/aiohttp.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/httpx.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/pyhttpx.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/requests.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/engine.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/scheduler.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/scraper.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/crawler.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/db/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/db/absmanager.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/db/aiomongo.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/db/aiomysql.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/db/aiopg.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/db/aiorabbitmq.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/db/aioredis.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/exceptions.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/headers.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/request/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/request/form.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/request/json_request.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/response/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/response/html.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/response/playwright.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/response/text.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/http/response/xml.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/downloader/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/downloader/defaultheaders.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/downloader/downloadtimeout.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/downloader/ja3fingerprint.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/downloader/stats.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/downloader/useragent.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/extensions/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/extensions/closespider.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/extensions/corestats.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/extensions/logstats.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/extensions/metric.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/extensions/throttle.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/pipelines/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/pipelines/csv.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/pipelines/execl.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/pipelines/mongo.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/pipelines/mysql.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/pipelines/pg.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/spider/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/spider/depth.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/spider/httperror.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/spider/offsite.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/spider/referer.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/libs/spider/urllength.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/link.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/logformatter.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/middleware/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/middleware/absmanager.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/middleware/downloader.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/middleware/extension.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/middleware/itempipeline.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/middleware/spider.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/process.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/proxy/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/proxy/redis.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/queue/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/queue/memory.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/queue/rabbitmq.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/queue/redis.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/scrapyd/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/scrapyd/runner.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/serializer.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/settings/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/settings/default_settings.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/signalmanager.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/signals.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/spiderloader.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/spiders/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/statscollectors.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/project/aioscrapy.cfg +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/project/module/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/project/module/middlewares.py.tmpl +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/project/module/pipelines.py.tmpl +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/project/module/settings.py.tmpl +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/project/module/spiders/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/spiders/basic.tmpl +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/spiders/single.tmpl +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/__init__.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/conf.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/curl.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/decorators.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/deprecate.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/httpobj.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/misc.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/ossignal.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/project.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/python.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/reqser.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/request.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/response.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/signal.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/spider.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/template.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/tools.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/trackref.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/utils/url.py +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/setup.cfg +0 -0
- {aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/setup.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
2.0.9
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from abc import ABCMeta, abstractmethod
|
|
2
|
+
|
|
3
|
+
from aioscrapy import Request, Spider
|
|
4
|
+
from aioscrapy.utils.log import logger
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DupeFilterBase(metaclass=ABCMeta):
|
|
8
|
+
"""Request Fingerprint duplicates filter"""
|
|
9
|
+
|
|
10
|
+
@classmethod
|
|
11
|
+
@abstractmethod
|
|
12
|
+
def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
|
|
13
|
+
""" Get Instance of RFPDupeFilter from crawler """
|
|
14
|
+
|
|
15
|
+
@abstractmethod
|
|
16
|
+
async def request_seen(self, request: Request) -> bool:
|
|
17
|
+
""" Check whether fingerprint of request exists """
|
|
18
|
+
|
|
19
|
+
@abstractmethod
|
|
20
|
+
async def close(self, reason: str = '') -> None:
|
|
21
|
+
""" Delete data on close """
|
|
22
|
+
|
|
23
|
+
def log(self, request: Request, spider: Spider):
|
|
24
|
+
if self.info:
|
|
25
|
+
logger.info("Filtered duplicate request: %(request)s" % {
|
|
26
|
+
'request': request.meta.get('dupefilter_msg') or request
|
|
27
|
+
})
|
|
28
|
+
elif self.debug:
|
|
29
|
+
logger.debug("Filtered duplicate request: %(request)s" % {
|
|
30
|
+
'request': request.meta.get('dupefilter_msg') or request
|
|
31
|
+
})
|
|
32
|
+
elif self.logdupes:
|
|
33
|
+
msg = ("Filtered duplicate request: %(request)s"
|
|
34
|
+
" - no more duplicates will be shown"
|
|
35
|
+
" (see DUPEFILTER_DEBUG to show all duplicates)")
|
|
36
|
+
logger.debug(msg % {'request': request.meta.get('dupefilter_msg') or request})
|
|
37
|
+
self.logdupes = False
|
|
38
|
+
|
|
39
|
+
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
|
|
@@ -1,20 +1,19 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from typing import Optional, Set
|
|
3
3
|
|
|
4
|
-
from aioscrapy import Request
|
|
4
|
+
from aioscrapy import Request
|
|
5
5
|
from aioscrapy.dupefilters import DupeFilterBase
|
|
6
|
-
from aioscrapy.utils.log import logger
|
|
7
|
-
from aioscrapy.utils.request import referer_str
|
|
8
6
|
|
|
9
7
|
|
|
10
8
|
class DiskRFPDupeFilter(DupeFilterBase):
|
|
11
9
|
"""Request Fingerprint duplicates filter built with Disk storage"""
|
|
12
10
|
|
|
13
|
-
def __init__(self, path: Optional[str] = None, debug: bool = False):
|
|
11
|
+
def __init__(self, path: Optional[str] = None, debug: bool = False, info: bool = False):
|
|
14
12
|
self.file: Optional["File object"] = None
|
|
15
13
|
self.debug = debug
|
|
16
14
|
self.fingerprints: Set = set()
|
|
17
15
|
self.logdupes: bool = True
|
|
16
|
+
self.info: bool = info
|
|
18
17
|
if path:
|
|
19
18
|
self.file = open(os.path.join(path, 'requests.seen'), 'a+')
|
|
20
19
|
self.file.seek(0)
|
|
@@ -23,10 +22,11 @@ class DiskRFPDupeFilter(DupeFilterBase):
|
|
|
23
22
|
@classmethod
|
|
24
23
|
def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
|
|
25
24
|
debug = crawler.settings.getbool('DUPEFILTER_DEBUG')
|
|
25
|
+
info = crawler.settings.getbool('DUPEFILTER_INFO')
|
|
26
26
|
path = crawler.settings.get('JOBDIR', './job_dir')
|
|
27
27
|
if path and not os.path.exists(path):
|
|
28
28
|
os.makedirs(path)
|
|
29
|
-
return cls(path, debug)
|
|
29
|
+
return cls(path, debug, info)
|
|
30
30
|
|
|
31
31
|
async def request_seen(self, request: Request) -> bool:
|
|
32
32
|
if request.fingerprint in self.fingerprints:
|
|
@@ -40,19 +40,5 @@ class DiskRFPDupeFilter(DupeFilterBase):
|
|
|
40
40
|
if self.file:
|
|
41
41
|
self.file.close()
|
|
42
42
|
|
|
43
|
-
def log(self, request: Request, spider: Spider):
|
|
44
|
-
if self.debug:
|
|
45
|
-
logger.debug("Filtered duplicate request: %(request)s (referer: %(referer)s)" % {
|
|
46
|
-
'request': request, 'referer': referer_str(request)
|
|
47
|
-
})
|
|
48
|
-
elif self.logdupes:
|
|
49
|
-
msg = ("Filtered duplicate request: %(request)s"
|
|
50
|
-
" - no more duplicates will be shown"
|
|
51
|
-
" (see DUPEFILTER_DEBUG to show all duplicates)")
|
|
52
|
-
logger.debug(msg % {'request': request})
|
|
53
|
-
self.logdupes = False
|
|
54
|
-
|
|
55
|
-
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
|
|
56
|
-
|
|
57
43
|
|
|
58
44
|
RFPDupeFilter = DiskRFPDupeFilter
|
|
@@ -2,8 +2,6 @@ from aioscrapy import Request
|
|
|
2
2
|
from aioscrapy.db import db_manager
|
|
3
3
|
from aioscrapy.dupefilters import DupeFilterBase
|
|
4
4
|
|
|
5
|
-
from aioscrapy.utils.log import logger
|
|
6
|
-
|
|
7
5
|
|
|
8
6
|
class RedisRFPDupeFilter(DupeFilterBase):
|
|
9
7
|
"""Request Fingerprint duplicates filter built with Set of Redis"""
|
|
@@ -13,13 +11,15 @@ class RedisRFPDupeFilter(DupeFilterBase):
|
|
|
13
11
|
server: "redis.asyncio.Redis",
|
|
14
12
|
key: str,
|
|
15
13
|
debug: bool = False,
|
|
16
|
-
keep_on_close: bool = True
|
|
14
|
+
keep_on_close: bool = True,
|
|
15
|
+
info: bool = False,
|
|
17
16
|
):
|
|
18
17
|
self.server = server
|
|
19
18
|
self.key = key
|
|
20
19
|
self.debug = debug
|
|
21
20
|
self.keep_on_close = keep_on_close
|
|
22
21
|
self.logdupes: bool = True
|
|
22
|
+
self.info: bool = info
|
|
23
23
|
|
|
24
24
|
@classmethod
|
|
25
25
|
def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
|
|
@@ -28,7 +28,8 @@ class RedisRFPDupeFilter(DupeFilterBase):
|
|
|
28
28
|
keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
|
|
29
29
|
key = dupefilter_key % {'spider': crawler.spider.name}
|
|
30
30
|
debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
|
|
31
|
-
|
|
31
|
+
info = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
|
|
32
|
+
instance = cls(server, key=key, debug=debug, keep_on_close=keep_on_close, info=info)
|
|
32
33
|
return instance
|
|
33
34
|
|
|
34
35
|
async def request_seen(self, request: Request):
|
|
@@ -41,17 +42,6 @@ class RedisRFPDupeFilter(DupeFilterBase):
|
|
|
41
42
|
async def clear(self):
|
|
42
43
|
await self.server.delete(self.key)
|
|
43
44
|
|
|
44
|
-
def log(self, request, spider):
|
|
45
|
-
if self.debug:
|
|
46
|
-
logger.debug("Filtered duplicate request: %(request)s" % {'request': request})
|
|
47
|
-
elif self.logdupes:
|
|
48
|
-
msg = ("Filtered duplicate request %(request)s"
|
|
49
|
-
" - no more duplicates will be shown"
|
|
50
|
-
" (see DUPEFILTER_DEBUG to show all duplicates)")
|
|
51
|
-
logger.debug(msg % {'request': request})
|
|
52
|
-
self.logdupes = False
|
|
53
|
-
spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
|
|
54
|
-
|
|
55
45
|
|
|
56
46
|
class HashMap(object):
|
|
57
47
|
def __init__(self, m, seed):
|
|
@@ -7,7 +7,9 @@ from loguru import logger as _logger
|
|
|
7
7
|
|
|
8
8
|
from aioscrapy.settings import Settings
|
|
9
9
|
|
|
10
|
-
_logger.
|
|
10
|
+
for _handler in _logger._core.handlers.values():
|
|
11
|
+
if _handler._name == '<stderr>':
|
|
12
|
+
_logger.remove(_handler._id)
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
def configure_logging(spider: Type["Spider"], settings: Settings):
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
2.0.8
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from abc import ABCMeta, abstractmethod
|
|
2
|
-
|
|
3
|
-
from aioscrapy import Request, Spider
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class DupeFilterBase(metaclass=ABCMeta):
|
|
7
|
-
"""Request Fingerprint duplicates filter"""
|
|
8
|
-
|
|
9
|
-
@classmethod
|
|
10
|
-
@abstractmethod
|
|
11
|
-
def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
|
|
12
|
-
""" Get Instance of RFPDupeFilter from crawler """
|
|
13
|
-
|
|
14
|
-
@abstractmethod
|
|
15
|
-
async def request_seen(self, request: Request) -> bool:
|
|
16
|
-
""" Check whether fingerprint of request exists """
|
|
17
|
-
|
|
18
|
-
@abstractmethod
|
|
19
|
-
async def close(self, reason: str = '') -> None:
|
|
20
|
-
""" Delete data on close """
|
|
21
|
-
|
|
22
|
-
@abstractmethod
|
|
23
|
-
def log(self, request: Request, spider: Spider) -> None:
|
|
24
|
-
""" Logs given request """
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/playwright/__init__.py
RENAMED
|
File without changes
|
{aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/playwright/driverpool.py
RENAMED
|
File without changes
|
{aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/core/downloader/handlers/playwright/webdriver.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/project/module/middlewares.py.tmpl
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{aio-scrapy-2.0.8 → aio-scrapy-2.0.9}/aioscrapy/templates/project/module/spiders/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|