aio-scrapy 2.1.2__tar.gz → 2.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. {aio-scrapy-2.1.2/aio_scrapy.egg-info → aio-scrapy-2.1.3}/PKG-INFO +1 -1
  2. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3/aio_scrapy.egg-info}/PKG-INFO +1 -1
  3. aio-scrapy-2.1.3/aioscrapy/VERSION +1 -0
  4. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/__init__.py +10 -3
  5. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/aiohttp.py +1 -1
  6. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/curl_cffi.py +1 -1
  7. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/httpx.py +1 -1
  8. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/__init__.py +1 -1
  9. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/pyhttpx.py +1 -1
  10. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/requests.py +1 -2
  11. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/scraper.py +10 -5
  12. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/__init__.py +6 -2
  13. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/redis.py +24 -6
  14. aio-scrapy-2.1.2/aioscrapy/VERSION +0 -1
  15. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/LICENSE +0 -0
  16. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/MANIFEST.in +0 -0
  17. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/README.md +0 -0
  18. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/SOURCES.txt +0 -0
  19. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/dependency_links.txt +0 -0
  20. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/entry_points.txt +0 -0
  21. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/not-zip-safe +0 -0
  22. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/requires.txt +0 -0
  23. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/top_level.txt +0 -0
  24. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/__init__.py +0 -0
  25. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/__main__.py +0 -0
  26. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/cmdline.py +0 -0
  27. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/__init__.py +0 -0
  28. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/crawl.py +0 -0
  29. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/genspider.py +0 -0
  30. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/list.py +0 -0
  31. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/runspider.py +0 -0
  32. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/settings.py +0 -0
  33. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/startproject.py +0 -0
  34. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/commands/version.py +0 -0
  35. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/__init__.py +0 -0
  36. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/__init__.py +0 -0
  37. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -0
  38. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -0
  39. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/engine.py +0 -0
  40. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/core/scheduler.py +0 -0
  41. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/crawler.py +0 -0
  42. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/__init__.py +0 -0
  43. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/absmanager.py +0 -0
  44. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aiomongo.py +0 -0
  45. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aiomysql.py +0 -0
  46. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aiopg.py +0 -0
  47. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aiorabbitmq.py +0 -0
  48. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/db/aioredis.py +0 -0
  49. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/disk.py +0 -0
  50. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/exceptions.py +0 -0
  51. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/__init__.py +0 -0
  52. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/headers.py +0 -0
  53. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/request/__init__.py +0 -0
  54. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/request/form.py +0 -0
  55. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/request/json_request.py +0 -0
  56. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/__init__.py +0 -0
  57. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/html.py +0 -0
  58. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/playwright.py +0 -0
  59. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/text.py +0 -0
  60. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/http/response/xml.py +0 -0
  61. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/__init__.py +0 -0
  62. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/__init__.py +0 -0
  63. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/defaultheaders.py +0 -0
  64. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/downloadtimeout.py +0 -0
  65. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/ja3fingerprint.py +0 -0
  66. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/retry.py +0 -0
  67. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/stats.py +0 -0
  68. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/useragent.py +0 -0
  69. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/__init__.py +0 -0
  70. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/closespider.py +0 -0
  71. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/corestats.py +0 -0
  72. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/logstats.py +0 -0
  73. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/metric.py +0 -0
  74. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/throttle.py +0 -0
  75. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/__init__.py +0 -0
  76. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/csv.py +0 -0
  77. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/execl.py +0 -0
  78. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/mongo.py +0 -0
  79. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/mysql.py +0 -0
  80. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/pg.py +0 -0
  81. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/__init__.py +0 -0
  82. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/depth.py +0 -0
  83. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/httperror.py +0 -0
  84. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/offsite.py +0 -0
  85. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/referer.py +0 -0
  86. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/urllength.py +0 -0
  87. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/link.py +0 -0
  88. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/logformatter.py +0 -0
  89. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/__init__.py +0 -0
  90. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/absmanager.py +0 -0
  91. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/downloader.py +0 -0
  92. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/extension.py +0 -0
  93. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/itempipeline.py +0 -0
  94. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/middleware/spider.py +0 -0
  95. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/process.py +0 -0
  96. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/proxy/__init__.py +0 -0
  97. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/proxy/redis.py +0 -0
  98. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/queue/__init__.py +0 -0
  99. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/queue/memory.py +0 -0
  100. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/queue/rabbitmq.py +0 -0
  101. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/queue/redis.py +0 -0
  102. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/scrapyd/__init__.py +0 -0
  103. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/scrapyd/runner.py +0 -0
  104. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/serializer.py +0 -0
  105. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/settings/__init__.py +0 -0
  106. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/settings/default_settings.py +0 -0
  107. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/signalmanager.py +0 -0
  108. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/signals.py +0 -0
  109. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/spiderloader.py +0 -0
  110. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/spiders/__init__.py +0 -0
  111. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/statscollectors.py +0 -0
  112. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/aioscrapy.cfg +0 -0
  113. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/__init__.py +0 -0
  114. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/middlewares.py.tmpl +0 -0
  115. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/pipelines.py.tmpl +0 -0
  116. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/settings.py.tmpl +0 -0
  117. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/spiders/__init__.py +0 -0
  118. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/spiders/basic.tmpl +0 -0
  119. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/templates/spiders/single.tmpl +0 -0
  120. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/__init__.py +0 -0
  121. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/conf.py +0 -0
  122. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/curl.py +0 -0
  123. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/decorators.py +0 -0
  124. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/deprecate.py +0 -0
  125. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/httpobj.py +0 -0
  126. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/log.py +0 -0
  127. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/misc.py +0 -0
  128. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/ossignal.py +0 -0
  129. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/project.py +0 -0
  130. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/python.py +0 -0
  131. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/reqser.py +0 -0
  132. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/request.py +0 -0
  133. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/response.py +0 -0
  134. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/signal.py +0 -0
  135. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/spider.py +0 -0
  136. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/template.py +0 -0
  137. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/tools.py +0 -0
  138. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/trackref.py +0 -0
  139. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/aioscrapy/utils/url.py +0 -0
  140. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/setup.cfg +0 -0
  141. {aio-scrapy-2.1.2 → aio-scrapy-2.1.3}/setup.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: aio-scrapy
-Version: 2.1.2
+Version: 2.1.3
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin

aio_scrapy.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: aio-scrapy
-Version: 2.1.2
+Version: 2.1.3
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin

aioscrapy/VERSION (new file)
@@ -0,0 +1 @@
+2.1.3

aioscrapy/core/downloader/__init__.py
@@ -138,13 +138,15 @@ class Downloader(BaseDownloader):
 
     @classmethod
     async def from_crawler(cls, crawler) -> "Downloader":
-        df = crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(crawler.settings['DUPEFILTER_CLASS'], crawler=crawler)
+        df = crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(crawler.settings['DUPEFILTER_CLASS'],
+                                                                              crawler=crawler)
         crawler.spider.dupefilter = df  # Bind the fingerprint filter to the spider; on a successful parse the DUPEFILTER_CLASS's success method is called
         return cls(
             crawler,
             await call_helper(DownloadHandlerManager.for_crawler, crawler),
             await call_helper(DownloaderMiddlewareManager.from_crawler, crawler),
-            proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"], crawler=crawler),
+            proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"],
+                                                                                crawler=crawler),
             dupefilter=df
         )
 
@@ -204,12 +206,17 @@ class Downloader(BaseDownloader):
         slot.transferring.remove(request)
         slot.active.remove(request)
         self.active.remove(request)
-        self.dupefilter and not request.dont_filter and await self.dupefilter.done(request, done_type="request_done")
+
         if isinstance(result, Response):
             await self.signals.send_catch_log(signal=signals.response_downloaded,
                                               response=result,
                                               request=request,
                                               spider=self.spider)
+        # Control whether the fingerprint is removed
+        self.dupefilter and \
+            not request.dont_filter and \
+            await self.dupefilter.done(request, done_type="request_ok" if isinstance(result, Response) else "request_err")
+
         await self._call_engine(result, request)
         await self._process_queue(slot)
 
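The dupefilter guard above uses Python's short-circuit `and` chain as a conditional call: each operand is evaluated only while the previous one was truthy, so the `await` fires only when a dupefilter is configured and the request is filterable. A minimal runnable sketch of the idiom (the Dupefilter class and the dict-based request here are stand-ins, not aioscrapy types):

import asyncio

class Dupefilter:
    async def done(self, request, done_type):
        print(request["url"], done_type)

async def main():
    dupefilter = Dupefilter()   # would be None when no DUPEFILTER_CLASS is configured
    request = {"url": "https://example.com", "dont_filter": False}
    ok = True                   # stands in for isinstance(result, Response)

    # The short-circuit chain from the hunk above: the await runs only when
    # a dupefilter exists and the request is filterable.
    dupefilter and \
        not request["dont_filter"] and \
        await dupefilter.done(request, done_type="request_ok" if ok else "request_err")

asyncio.run(main())
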
aioscrapy/core/downloader/handlers/aiohttp.py
@@ -37,7 +37,7 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except ClientError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
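The same one-line `raise DownloadError(e) from e` change repeats in the curl_cffi, httpx, playwright, pyhttpx, and requests handlers below, and it is more than cosmetic: `raise DownloadError from e` raises the bare class, so the wrapping exception carries no message of its own, while `DownloadError(e)` stores the original error in its args. A self-contained illustration (this DownloadError is a local stand-in; any Exception subclass behaves the same):

class DownloadError(Exception):
    pass

cause = ValueError("connection reset")

try:
    raise DownloadError from cause          # old style: bare class, no message
except DownloadError as err:
    print(repr(str(err)))                   # -> ''
    print(repr(err.__cause__))              # -> ValueError('connection reset')

try:
    raise DownloadError(cause) from cause   # new style: message carries the cause
except DownloadError as err:
    print(repr(str(err)))                   # -> 'connection reset'
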
aioscrapy/core/downloader/handlers/curl_cffi.py
@@ -24,7 +24,7 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except CurlError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {

aioscrapy/core/downloader/handlers/httpx.py
@@ -32,7 +32,7 @@ class HttpxDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except HttpxError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {

aioscrapy/core/downloader/handlers/playwright/__init__.py
@@ -30,7 +30,7 @@ class PlaywrightHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request, spider)
         except Error as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request, spider) -> PlaywrightResponse:
         cookies = dict(request.cookies)

aioscrapy/core/downloader/handlers/pyhttpx.py
@@ -27,7 +27,7 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except PyHttpxError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {

aioscrapy/core/downloader/handlers/requests.py
@@ -16,7 +16,6 @@ class RequestsDownloadHandler(BaseDownloadHandler):
     def __init__(self, settings):
         self.settings: Settings = settings
         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
-        self.loop = asyncio.get_running_loop()
 
     @classmethod
     def from_settings(cls, settings: Settings):
@@ -26,7 +25,7 @@ class RequestsDownloadHandler(BaseDownloadHandler):
         try:
             return await self._download_request(request)
         except RequestsError as e:
-            raise DownloadError from e
+            raise DownloadError(e) from e
 
     async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {

aioscrapy/core/scraper.py
@@ -113,6 +113,11 @@ class Scraper:
         except BaseException as e:
             await self.handle_spider_error(e, request, result)
         finally:
+            # Control whether the fingerprint is removed
+            self.spider.dupefilter and \
+                not request.dont_filter and \
+                await self.spider.dupefilter.done(request, done_type="parse_ok" if getattr(request, "parse_ok", False) else "parse_err")
+
             if isinstance(result, PlaywrightResponse):
                 await result.release()
 
@@ -161,22 +166,22 @@ class Scraper:
         """Iter each Request/Item (given in the output parameter) returned from the given spider"""
         if not result:
             return
-        parser_successful = True
+
+        parse_ok = True
         while True:
             try:
                 output = await result.__anext__()
             except StopAsyncIteration:
                 break
             except Exception as e:
-                parser_successful = False
+                parse_ok = False
                 await self.handle_spider_error(e, request, response)
             else:
                 await self._process_spidermw_output(output, request, response)
 
         self.spider.dupefilter and \
-            not request.dont_filter and \
-            parser_successful and \
-            await self.spider.dupefilter.done(request, done_type="parse_done")
+            not request.dont_filter and \
+            setattr(request, "parse_ok", parse_ok)
 
     async def _process_spidermw_output(self, output: Any, request: Request, response: Response) -> None:
         """Process each Request/Item (given in the output parameter) returned from the given spider"""
aioscrapy/dupefilters/__init__.py
@@ -39,5 +39,9 @@ class DupeFilterBase(metaclass=ABCMeta):
 
         spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
 
-    async def done(self, request: Request, done_type: Literal["request_done", "parse_done"]) -> None:
-        """ deal fingerprint on task successful """
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ) -> None:
+        """ Control fingerprint removal according to the done_type state """
aioscrapy/dupefilters/redis.py
@@ -130,7 +130,7 @@ class RedisBloomDupeFilter(RedisRFPDupeFilter):
         return False
 
 
-class RedisBloomSetDupeFilter(RedisBloomDupeFilter):
+class ExRedisBloomDupeFilter(RedisBloomDupeFilter):
 
     def __init__(self, server, key, key_set, ttl, debug, bit, hash_number, keep_on_close, info):
         super().__init__(server, key, debug, bit, hash_number, keep_on_close, info)
@@ -161,11 +161,14 @@ class RedisBloomSetDupeFilter(RedisBloomDupeFilter):
         ret, _ = await pipe.execute()
         return ret == 0
 
-    async def done(self, request: Request, done_type: Literal["request_done", "parse_done"]):
-        print(done_type)
-        if done_type == "request_done":
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ):
+        if done_type == "request_ok" or done_type == "request_err":
             await self.server.srem(self.key_set, request.fingerprint)
-        elif done_type == "parse_done":
+        elif done_type == "parse_ok":
             await self.bf.insert(request.fingerprint)
 
     async def close(self, reason=''):
@@ -174,6 +177,21 @@ class RedisBloomSetDupeFilter(RedisBloomDupeFilter):
         await self.server.delete(self.key_set)
 
 
+class ExRedisRFPDupeFilter(RedisRFPDupeFilter):
+
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ):
+        # When the request or the parse fails, remove the fingerprint from the Redis set
+        if done_type == "request_err" or done_type == "parse_err":
+            await self.server.srem(self.key, request.fingerprint)
+
+
 RFPDupeFilter = RedisRFPDupeFilter
+ExRFPDupeFilter = ExRedisRFPDupeFilter
 BloomDupeFilter = RedisBloomDupeFilter
-BloomSetDupeFilter = RedisBloomSetDupeFilter
+ExBloomDupeFilter = ExRedisBloomDupeFilter
+BloomSetDupeFilter = ExRedisBloomDupeFilter
+
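With the new aliases exported, switching a project onto the retry-friendly filters is a one-line settings change, since the Downloader.from_crawler hunk above loads whatever DUPEFILTER_CLASS names. A sketch (the dotted paths assume the aliases above are importable from aioscrapy.dupefilters.redis, where this diff defines them):

# settings.py (sketch)
DUPEFILTER_CLASS = 'aioscrapy.dupefilters.redis.ExRFPDupeFilter'
# Bloom-backed variant: in-flight fingerprints live in a Redis set and are
# promoted into the Bloom filter once parsing succeeds:
# DUPEFILTER_CLASS = 'aioscrapy.dupefilters.redis.ExBloomDupeFilter'
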
aio-scrapy-2.1.2/aioscrapy/VERSION (removed)
@@ -1 +0,0 @@
-2.1.2