aio-scrapy 2.1.2__tar.gz → 2.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio-scrapy-2.1.2/aio_scrapy.egg-info → aio-scrapy-2.1.3}/PKG-INFO +1 -1
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3/aio_scrapy.egg-info}/PKG-INFO +1 -1
- aio-scrapy-2.1.3/aioscrapy/VERSION +1 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/__init__.py +10 -3
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/aiohttp.py +1 -1
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/curl_cffi.py +1 -1
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/httpx.py +1 -1
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/__init__.py +1 -1
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/pyhttpx.py +1 -1
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/requests.py +1 -2
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/scraper.py +10 -5
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/__init__.py +6 -2
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/redis.py +24 -6
- aio-scrapy-2.1.2/aioscrapy/VERSION +0 -1
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/LICENSE +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/MANIFEST.in +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/README.md +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/SOURCES.txt +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/dependency_links.txt +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/entry_points.txt +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/not-zip-safe +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/requires.txt +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/top_level.txt +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/__main__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/cmdline.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/crawl.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/genspider.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/list.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/runspider.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/settings.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/startproject.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/version.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/engine.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/scheduler.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/crawler.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/absmanager.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aiomongo.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aiomysql.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aiopg.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aiorabbitmq.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aioredis.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/disk.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/exceptions.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/headers.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/request/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/request/form.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/request/json_request.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/html.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/playwright.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/text.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/xml.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/defaultheaders.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/downloadtimeout.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/ja3fingerprint.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/retry.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/stats.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/useragent.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/closespider.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/corestats.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/logstats.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/metric.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/throttle.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/csv.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/execl.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/mongo.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/mysql.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/pg.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/depth.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/httperror.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/offsite.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/referer.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/urllength.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/link.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/logformatter.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/absmanager.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/downloader.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/extension.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/itempipeline.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/spider.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/process.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/proxy/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/proxy/redis.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/queue/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/queue/memory.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/queue/rabbitmq.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/queue/redis.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/scrapyd/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/scrapyd/runner.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/serializer.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/settings/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/settings/default_settings.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/signalmanager.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/signals.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/spiderloader.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/spiders/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/statscollectors.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/aioscrapy.cfg +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/middlewares.py.tmpl +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/pipelines.py.tmpl +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/settings.py.tmpl +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/spiders/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/spiders/basic.tmpl +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/spiders/single.tmpl +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/__init__.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/conf.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/curl.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/decorators.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/deprecate.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/httpobj.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/log.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/misc.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/ossignal.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/project.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/python.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/reqser.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/request.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/response.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/signal.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/spider.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/template.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/tools.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/trackref.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/url.py +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/setup.cfg +0 -0
- {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/setup.py +0 -0
aio-scrapy-2.1.3/aioscrapy/VERSION
@@ -0,0 +1 @@
+2.1.3
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/__init__.py
@@ -138,13 +138,15 @@ class Downloader(BaseDownloader):
 
     @classmethod
     async def from_crawler(cls, crawler) -> "Downloader":
-        df = crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(crawler.settings['DUPEFILTER_CLASS'],
+        df = crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(crawler.settings['DUPEFILTER_CLASS'],
+                                                                              crawler=crawler)
         crawler.spider.dupefilter = df  # bind the fingerprint filter to the Spider; on a successful parse the DUPEFILTER_CLASS success method is called
         return cls(
             crawler,
             await call_helper(DownloadHandlerManager.for_crawler, crawler),
             await call_helper(DownloaderMiddlewareManager.from_crawler, crawler),
-            proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"],
+            proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"],
+                                                                                crawler=crawler),
             dupefilter=df
         )
 
@@ -204,12 +206,17 @@ class Downloader(BaseDownloader):
         slot.transferring.remove(request)
         slot.active.remove(request)
         self.active.remove(request)
-
+
         if isinstance(result, Response):
             await self.signals.send_catch_log(signal=signals.response_downloaded,
                                               response=result,
                                               request=request,
                                               spider=self.spider)
+        # controls whether the fingerprint is removed
+        self.dupefilter and \
+            not request.dont_filter and \
+            await self.dupefilter.done(request, done_type="request_ok" if isinstance(result, Response) else "request_err")
+
         await self._call_engine(result, request)
         await self._process_queue(slot)
 
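Both hunks above tie into the new dupefilter lifecycle: from_crawler now passes crawler=crawler when instantiating the optional DUPEFILTER_CLASS and PROXY_HANDLER components, and every finished download reports back to the dupefilter via done(). As a rough sketch of the two settings involved (the dotted paths below are illustrative assumptions, not taken from this diff):

    # settings.py (sketch) -- both keys are optional; when a key is unset, the
    # `and` short-circuit in from_crawler leaves the component as None
    DUPEFILTER_CLASS = "aioscrapy.dupefilters.redis.RFPDupeFilter"  # alias defined in dupefilters/redis.py
    PROXY_HANDLER = "aioscrapy.proxy.redis.RedisProxy"              # hypothetical class name, for illustration only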
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/aiohttp.py
@@ -37,7 +37,7 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except ClientError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
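This one-line change recurs in every handler below (curl_cffi, httpx, playwright, pyhttpx, requests). The point is message preservation: `raise DownloadError from e` instantiates the exception with no arguments, so its own message is empty even though __cause__ is set, while `raise DownloadError(e) from e` carries the original error text into logs. A standalone illustration using a stand-in exception class rather than aioscrapy's own DownloadError:

    class DownloadError(Exception):
        """Stand-in for aioscrapy's DownloadError (illustration only)."""

    def old_style():
        try:
            raise ValueError("connection reset")
        except ValueError as e:
            raise DownloadError from e      # message is lost: str(exc) == ""

    def new_style():
        try:
            raise ValueError("connection reset")
        except ValueError as e:
            raise DownloadError(e) from e   # str(exc) == "connection reset"

    for fn in (old_style, new_style):
        try:
            fn()
        except DownloadError as exc:
            print(fn.__name__, repr(exc), "caused by", repr(exc.__cause__))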
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/curl_cffi.py
@@ -24,7 +24,7 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except CurlError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/httpx.py
@@ -32,7 +32,7 @@ class HttpxDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except HttpxError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/__init__.py
@@ -30,7 +30,7 @@ class PlaywrightHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request, spider)
         except Error as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request, spider) -> PlaywrightResponse:
         cookies = dict(request.cookies)
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/pyhttpx.py
@@ -27,7 +27,7 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except PyHttpxError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/requests.py
@@ -16,7 +16,6 @@ class RequestsDownloadHandler(BaseDownloadHandler):
     def __init__(self, settings):
         self.settings: Settings = settings
         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
-        self.loop = asyncio.get_running_loop()
 
     @classmethod
     def from_settings(cls, settings: Settings):
@@ -26,7 +25,7 @@ class RequestsDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except RequestsError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/scraper.py
@@ -113,6 +113,11 @@ class Scraper:
         except BaseException as e:
             await self.handle_spider_error(e, request, result)
         finally:
+            # controls whether the fingerprint is removed
+            self.spider.dupefilter and \
+                not request.dont_filter and \
+                await self.spider.dupefilter.done(request, done_type="parse_ok" if getattr(request, "parse_ok", False) else "parse_err")
+
             if isinstance(result, PlaywrightResponse):
                 await result.release()
 
@@ -161,22 +166,22 @@ class Scraper:
         """Iter each Request/Item (given in the output parameter) returned from the given spider"""
         if not result:
             return
-
+
+        parse_ok = True
         while True:
             try:
                 output = await result.__anext__()
             except StopAsyncIteration:
                 break
             except Exception as e:
-
+                parse_ok = False
                 await self.handle_spider_error(e, request, response)
             else:
                 await self._process_spidermw_output(output, request, response)
 
         self.spider.dupefilter and \
-
-
-            await self.spider.dupefilter.done(request, done_type="parse_done")
+            not request.dont_filter and \
+            setattr(request, "parse_ok", parse_ok)
 
     async def _process_spidermw_output(self, output: Any, request: Request, response: Response) -> None:
         """Process each Request/Item (given in the output parameter) returned from the given spider"""
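The new bookkeeping in Scraper leans on `and` short-circuiting: the trailing await only runs when a dupefilter is attached and the request is filterable, and the parse_ok flag set on the request here is what the finally block above turns into "parse_ok"/"parse_err". A tiny runnable sketch of the same idiom, with placeholder objects standing in for the spider's dupefilter and the request:

    import asyncio

    class FakeDupeFilter:
        async def done(self, request, done_type):
            print("done:", request["url"], done_type)

    async def main():
        dupefilter = FakeDupeFilter()   # set to None and the chain never awaits
        request = {"url": "https://example.com", "dont_filter": False}
        parse_ok = True
        # mirrors the chained expression added above: nothing is awaited unless
        # a dupefilter exists and the request is not marked dont_filter
        dupefilter and \
            not request["dont_filter"] and \
            await dupefilter.done(request, done_type="parse_ok" if parse_ok else "parse_err")

    asyncio.run(main())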
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/__init__.py
@@ -39,5 +39,9 @@ class DupeFilterBase(metaclass=ABCMeta):
 
         spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
 
-    async def done(
-
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ) -> None:
+        """ Controls whether the fingerprint is removed, according to the done_type state """
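With done() now declared on the base class, any dupefilter can react to the four outcomes. A minimal sketch mirroring the new signature; it deliberately does not subclass aioscrapy's DupeFilterBase (whose other abstract methods fall outside this diff), so the class name and in-memory storage below are assumptions for illustration:

    from typing import Literal

    DoneType = Literal["request_ok", "request_err", "parse_ok", "parse_err"]

    class InMemoryRetryDupeFilter:
        """Drops a fingerprint again whenever the request or the parse failed,
        so the same URL can be re-crawled later (illustration only)."""

        def __init__(self):
            self.fingerprints: set[str] = set()

        async def done(self, request, done_type: DoneType) -> None:
            if done_type in ("request_err", "parse_err"):
                self.fingerprints.discard(request.fingerprint)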
{aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/redis.py
@@ -130,7 +130,7 @@ class RedisBloomDupeFilter(RedisRFPDupeFilter):
         return False
 
 
-class
+class ExRedisBloomDupeFilter(RedisBloomDupeFilter):
 
     def __init__(self, server, key, key_set, ttl, debug, bit, hash_number, keep_on_close, info):
         super().__init__(server, key, debug, bit, hash_number, keep_on_close, info)
@@ -161,11 +161,14 @@ class RedisBloomSetDupeFilter(RedisBloomDupeFilter):
         ret, _ = await pipe.execute()
         return ret == 0
 
-    async def done(
-
-
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ):
+        if done_type == "request_ok" or done_type == "request_err":
             await self.server.srem(self.key_set, request.fingerprint)
-        elif done_type == "
+        elif done_type == "parse_ok":
             await self.bf.insert(request.fingerprint)
 
     async def close(self, reason=''):
@@ -174,6 +177,21 @@ class RedisBloomSetDupeFilter(RedisBloomDupeFilter):
         await self.server.delete(self.key_set)
 
 
+class ExRedisRFPDupeFilter(RedisRFPDupeFilter):
+
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ):
+        # when the request fails or the parse fails, remove the fingerprint from the Redis set
+        if done_type == "request_err" or done_type == "parse_err":
+            await self.server.srem(self.key, request.fingerprint)
+
+
 RFPDupeFilter = RedisRFPDupeFilter
+ExRFPDupeFilter = ExRedisRFPDupeFilter
 BloomDupeFilter = RedisBloomDupeFilter
-
+ExBloomDupeFilter = ExRedisBloomDupeFilter
+BloomSetDupeFilter = ExRedisBloomDupeFilter
+
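The practical effect of the new ExRedisRFPDupeFilter is that a fingerprint only stays permanent when both the request and the parse succeed; on request_err or parse_err it is removed (srem) so the URL can be scheduled again. A small simulation of that lifecycle with a plain Python set standing in for the Redis set (the class below is a stand-in for illustration, not the library class):

    import asyncio

    class SetBackedFilter:
        def __init__(self):
            self.seen = set()          # stands in for the Redis set

        async def request_seen(self, fp):
            if fp in self.seen:
                return True
            self.seen.add(fp)
            return False

        async def done(self, fp, done_type):
            if done_type in ("request_err", "parse_err"):
                self.seen.discard(fp)  # mirrors self.server.srem(self.key, ...)

    async def main():
        f = SetBackedFilter()
        print(await f.request_seen("fp1"))   # False: first sighting
        await f.done("fp1", "request_err")   # failed, fingerprint dropped
        print(await f.request_seen("fp1"))   # False again: eligible for retry
        await f.done("fp1", "parse_ok")      # succeeded, fingerprint kept
        print(await f.request_seen("fp1"))   # True: now filtered

    asyncio.run(main())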
aio-scrapy-2.1.2/aioscrapy/VERSION
@@ -1 +0,0 @@
-2.1.2