aio-scrapy 2.0.10__tar.gz → 2.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio-scrapy-2.0.10/aio_scrapy.egg-info → aio-scrapy-2.1.2}/PKG-INFO +7 -4
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/README.md +1 -1
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2/aio_scrapy.egg-info}/PKG-INFO +7 -4
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aio_scrapy.egg-info/SOURCES.txt +1 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aio_scrapy.egg-info/requires.txt +6 -2
- aio-scrapy-2.1.2/aioscrapy/VERSION +1 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/__init__.py +5 -4
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/aiohttp.py +8 -0
- aio-scrapy-2.1.2/aioscrapy/core/downloader/handlers/curl_cffi.py +67 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/httpx.py +9 -1
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/playwright/__init__.py +10 -2
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/pyhttpx.py +8 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/requests.py +9 -1
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/engine.py +2 -27
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/scheduler.py +45 -5
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/scraper.py +9 -3
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/crawler.py +5 -2
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/db/aiomongo.py +10 -2
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/dupefilters/__init__.py +4 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/dupefilters/redis.py +47 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/exceptions.py +5 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/downloader/retry.py +6 -47
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/pipelines/__init__.py +3 -6
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/pipelines/mongo.py +7 -2
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/settings/default_settings.py +4 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/spiders/__init__.py +4 -3
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/templates/spiders/single.tmpl +6 -5
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/python.py +1 -6
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/setup.py +4 -3
- aio-scrapy-2.0.10/aioscrapy/VERSION +0 -1
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/LICENSE +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/MANIFEST.in +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aio_scrapy.egg-info/dependency_links.txt +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aio_scrapy.egg-info/entry_points.txt +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aio_scrapy.egg-info/not-zip-safe +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aio_scrapy.egg-info/top_level.txt +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/__main__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/cmdline.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/commands/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/commands/crawl.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/commands/genspider.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/commands/list.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/commands/runspider.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/commands/settings.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/commands/startproject.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/commands/version.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/db/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/db/absmanager.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/db/aiomysql.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/db/aiopg.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/db/aiorabbitmq.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/db/aioredis.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/dupefilters/disk.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/headers.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/request/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/request/form.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/request/json_request.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/response/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/response/html.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/response/playwright.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/response/text.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/http/response/xml.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/downloader/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/downloader/defaultheaders.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/downloader/downloadtimeout.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/downloader/ja3fingerprint.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/downloader/stats.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/downloader/useragent.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/extensions/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/extensions/closespider.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/extensions/corestats.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/extensions/logstats.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/extensions/metric.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/extensions/throttle.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/pipelines/csv.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/pipelines/execl.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/pipelines/mysql.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/pipelines/pg.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/spider/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/spider/depth.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/spider/httperror.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/spider/offsite.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/spider/referer.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/libs/spider/urllength.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/link.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/logformatter.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/middleware/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/middleware/absmanager.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/middleware/downloader.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/middleware/extension.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/middleware/itempipeline.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/middleware/spider.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/process.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/proxy/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/proxy/redis.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/queue/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/queue/memory.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/queue/rabbitmq.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/queue/redis.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/scrapyd/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/scrapyd/runner.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/serializer.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/settings/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/signalmanager.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/signals.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/spiderloader.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/statscollectors.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/templates/project/aioscrapy.cfg +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/templates/project/module/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/templates/project/module/middlewares.py.tmpl +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/templates/project/module/pipelines.py.tmpl +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/templates/project/module/settings.py.tmpl +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/templates/project/module/spiders/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/templates/spiders/basic.tmpl +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/__init__.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/conf.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/curl.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/decorators.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/deprecate.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/httpobj.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/log.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/misc.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/ossignal.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/project.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/reqser.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/request.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/response.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/signal.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/spider.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/template.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/tools.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/trackref.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/utils/url.py +0 -0
- {aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/setup.cfg +0 -0

{aio-scrapy-2.0.10/aio_scrapy.egg-info → aio-scrapy-2.1.2}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: aio-scrapy
-Version: 2.0.10
+Version: 2.1.2
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
@@ -33,12 +33,13 @@ Requires-Dist: aiomysql>=0.1.1; extra == "all"
 Requires-Dist: httpx[http2]>=0.23.0; extra == "all"
 Requires-Dist: aio-pika>=8.1.1; extra == "all"
 Requires-Dist: cryptography; extra == "all"
-Requires-Dist: motor>=
+Requires-Dist: motor>=2.1.0; extra == "all"
 Requires-Dist: pyhttpx>=2.10.1; extra == "all"
 Requires-Dist: asyncpg>=0.27.0; extra == "all"
 Requires-Dist: XlsxWriter>=3.1.2; extra == "all"
 Requires-Dist: pillow>=9.4.0; extra == "all"
 Requires-Dist: requests>=2.28.2; extra == "all"
+Requires-Dist: curl_cffi; extra == "all"
 Provides-Extra: aiomysql
 Requires-Dist: aiomysql>=0.1.1; extra == "aiomysql"
 Requires-Dist: cryptography; extra == "aiomysql"
@@ -47,11 +48,13 @@ Requires-Dist: httpx[http2]>=0.23.0; extra == "httpx"
 Provides-Extra: aio-pika
 Requires-Dist: aio-pika>=8.1.1; extra == "aio-pika"
 Provides-Extra: mongo
-Requires-Dist: motor>=
+Requires-Dist: motor>=2.1.0; extra == "mongo"
 Provides-Extra: playwright
 Requires-Dist: playwright>=1.31.1; extra == "playwright"
 Provides-Extra: pyhttpx
 Requires-Dist: pyhttpx>=2.10.4; extra == "pyhttpx"
+Provides-Extra: curl-cffi
+Requires-Dist: curl_cffi>=0.6.1; extra == "curl-cffi"
 Provides-Extra: requests
 Requires-Dist: requests>=2.28.2; extra == "requests"
 Provides-Extra: pg
@@ -86,7 +89,7 @@ The quick way:
 
 ```shell
 # Install the latest aio-scrapy
-pip install git+https://github.com/
+pip install git+https://github.com/ConlinH/aio-scrapy
 
 # default
 pip install aio-scrapy

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2/aio_scrapy.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: aio-scrapy
-Version: 2.0.10
+Version: 2.1.2
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
@@ -33,12 +33,13 @@ Requires-Dist: aiomysql>=0.1.1; extra == "all"
 Requires-Dist: httpx[http2]>=0.23.0; extra == "all"
 Requires-Dist: aio-pika>=8.1.1; extra == "all"
 Requires-Dist: cryptography; extra == "all"
-Requires-Dist: motor>=
+Requires-Dist: motor>=2.1.0; extra == "all"
 Requires-Dist: pyhttpx>=2.10.1; extra == "all"
 Requires-Dist: asyncpg>=0.27.0; extra == "all"
 Requires-Dist: XlsxWriter>=3.1.2; extra == "all"
 Requires-Dist: pillow>=9.4.0; extra == "all"
 Requires-Dist: requests>=2.28.2; extra == "all"
+Requires-Dist: curl_cffi; extra == "all"
 Provides-Extra: aiomysql
 Requires-Dist: aiomysql>=0.1.1; extra == "aiomysql"
 Requires-Dist: cryptography; extra == "aiomysql"
@@ -47,11 +48,13 @@ Requires-Dist: httpx[http2]>=0.23.0; extra == "httpx"
 Provides-Extra: aio-pika
 Requires-Dist: aio-pika>=8.1.1; extra == "aio-pika"
 Provides-Extra: mongo
-Requires-Dist: motor>=
+Requires-Dist: motor>=2.1.0; extra == "mongo"
 Provides-Extra: playwright
 Requires-Dist: playwright>=1.31.1; extra == "playwright"
 Provides-Extra: pyhttpx
 Requires-Dist: pyhttpx>=2.10.4; extra == "pyhttpx"
+Provides-Extra: curl-cffi
+Requires-Dist: curl_cffi>=0.6.1; extra == "curl-cffi"
 Provides-Extra: requests
 Requires-Dist: requests>=2.28.2; extra == "requests"
 Provides-Extra: pg
@@ -86,7 +89,7 @@ The quick way:
 
 ```shell
 # Install the latest aio-scrapy
-pip install git+https://github.com/
+pip install git+https://github.com/ConlinH/aio-scrapy
 
 # default
 pip install aio-scrapy

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aio_scrapy.egg-info/SOURCES.txt
RENAMED
@@ -38,6 +38,7 @@ aioscrapy/core/scraper.py
 aioscrapy/core/downloader/__init__.py
 aioscrapy/core/downloader/handlers/__init__.py
 aioscrapy/core/downloader/handlers/aiohttp.py
+aioscrapy/core/downloader/handlers/curl_cffi.py
 aioscrapy/core/downloader/handlers/httpx.py
 aioscrapy/core/downloader/handlers/pyhttpx.py
 aioscrapy/core/downloader/handlers/requests.py

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aio_scrapy.egg-info/requires.txt
RENAMED
@@ -20,12 +20,16 @@ aiomysql>=0.1.1
 httpx[http2]>=0.23.0
 aio-pika>=8.1.1
 cryptography
-motor>=
+motor>=2.1.0
 pyhttpx>=2.10.1
 asyncpg>=0.27.0
 XlsxWriter>=3.1.2
 pillow>=9.4.0
 requests>=2.28.2
+curl_cffi
+
+[curl_cffi]
+curl_cffi>=0.6.1
 
 [execl]
 XlsxWriter>=3.1.2
@@ -35,7 +39,7 @@ pillow>=9.4.0
 httpx[http2]>=0.23.0
 
 [mongo]
-motor>=
+motor>=2.1.0
 
 [pg]
 asyncpg>=0.27.0

aio-scrapy-2.1.2/aioscrapy/VERSION
@@ -0,0 +1 @@
+2.1.2

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/__init__.py
RENAMED
@@ -138,14 +138,14 @@ class Downloader(BaseDownloader):
 
     @classmethod
     async def from_crawler(cls, crawler) -> "Downloader":
+        df = crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(crawler.settings['DUPEFILTER_CLASS'], crawler=crawler)
+        crawler.spider.dupefilter = df  # bind the dupefilter to the Spider; its success hook is called from DUPEFILTER_CLASS after a successful parse
         return cls(
             crawler,
             await call_helper(DownloadHandlerManager.for_crawler, crawler),
             await call_helper(DownloaderMiddlewareManager.from_crawler, crawler),
-            proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"],
-
-            dupefilter=crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(
-                crawler.settings['DUPEFILTER_CLASS'], crawler=crawler)
+            proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"], crawler=crawler),
+            dupefilter=df
         )
 
     async def fetch(self, request: Request) -> None:
@@ -204,6 +204,7 @@ class Downloader(BaseDownloader):
         slot.transferring.remove(request)
         slot.active.remove(request)
         self.active.remove(request)
+        self.dupefilter and not request.dont_filter and await self.dupefilter.done(request, done_type="request_done")
         if isinstance(result, Response):
             await self.signals.send_catch_log(signal=signals.response_downloaded,
                                               response=result,

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/aiohttp.py
RENAMED
@@ -4,9 +4,11 @@ import ssl
 from typing import Optional
 
 import aiohttp
+from aiohttp.client_exceptions import ClientError
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -32,6 +34,12 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
         return self.session
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except ClientError as e:
+            raise DownloadError from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'verify_ssl': request.meta.get('verify_ssl', self.verify_ssl),
             'timeout': request.meta.get('download_timeout', 180),
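
Each handler now converts its transport-specific exception (aiohttp's `ClientError` here, and the httpx, pyhttpx, requests, playwright and curl_cffi equivalents below) into `aioscrapy.exceptions.DownloadError`, so download failures can be observed in one place. A hedged sketch of what that enables, assuming aio-scrapy keeps Scrapy-style downloader-middleware hooks; the middleware class and hook spelling here are illustrative, not part of this release:

```python
# Illustrative only: one place to log every wrapped transport failure.
# Assumes a Scrapy-style process_exception hook; adjust to the project's
# actual middleware interface if it differs.
from aioscrapy.exceptions import DownloadError
from aioscrapy.utils.log import logger


class DownloadErrorLogMiddleware:
    def process_exception(self, request, exception, spider):
        if isinstance(exception, DownloadError):
            # __cause__ carries the original aiohttp/httpx/... exception
            logger.warning(f"download failed {request.url}: {exception.__cause__!r}")
        return None  # let retry and other middlewares keep handling it
```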

aio-scrapy-2.1.2/aioscrapy/core/downloader/handlers/curl_cffi.py
@@ -0,0 +1,67 @@
+from curl_cffi.curl import CurlError
+from curl_cffi.requests import AsyncSession
+
+from aioscrapy import Request
+from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
+from aioscrapy.http import HtmlResponse
+from aioscrapy.settings import Settings
+from aioscrapy.utils.log import logger
+
+
+class CurlCffiDownloadHandler(BaseDownloadHandler):
+
+    def __init__(self, settings):
+        self.settings: Settings = settings
+        self.httpx_client_session_args: dict = self.settings.get('CURL_CFFI_CLIENT_SESSION_ARGS', {})
+        self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
+
+    @classmethod
+    def from_settings(cls, settings: Settings):
+        return cls(settings)
+
+    async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except CurlError as e:
+            raise DownloadError from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
+        kwargs = {
+            'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
+            'cookies': dict(request.cookies),
+            'verify': request.meta.get('verify_ssl', self.verify_ssl),
+            'allow_redirects': self.settings.getbool('REDIRECT_ENABLED', True) if request.meta.get(
+                'dont_redirect') is None else request.meta.get('dont_redirect'),
+            'impersonate': request.meta.get('impersonate'),
+        }
+        post_data = request.body or None
+        if isinstance(post_data, dict):
+            kwargs['json'] = post_data
+        else:
+            kwargs['data'] = post_data
+
+        headers = request.headers or self.settings.get('DEFAULT_REQUEST_HEADERS')
+        kwargs['headers'] = headers
+
+        proxy = request.meta.get("proxy")
+        if proxy:
+            kwargs["proxies"] = {'http': proxy, 'https': proxy}
+            logger.debug(f"use proxy {proxy}: {request.url}")
+
+        session_args = self.httpx_client_session_args.copy()
+
+        async with AsyncSession(**session_args) as session:
+            response = await session.request(request.method, request.url, **kwargs)
+
+        return HtmlResponse(
+            str(response.url),
+            status=response.status_code,
+            headers=response.headers,
+            body=response.content,
+            cookies={j.name: j.value or '' for j in response.cookies.jar},
+            encoding=response.encoding
+        )
+
+    async def close(self):
+        pass
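
The new handler is installable through the `curl-cffi` extra declared in the metadata above, and it reads `DOWNLOAD_TIMEOUT`, `VERIFY_SSL`, `CURL_CFFI_CLIENT_SESSION_ARGS` and the per-request `meta['impersonate']` key. A minimal sketch of wiring it up; the settings key used to select the handler class is an assumption here (the real key lives in `default_settings.py`, whose diff is not shown):

```python
# Sketch only: settings and a request routed through the curl_cffi handler.
# "DOWNLOAD_HANDLERS" is an assumed selector key; the other keys are the
# ones the handler itself reads in the diff above.
custom_settings = {
    "DOWNLOAD_HANDLERS": {  # assumed key for choosing the handler class
        "http": "aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler",
        "https": "aioscrapy.core.downloader.handlers.curl_cffi.CurlCffiDownloadHandler",
    },
    "CURL_CFFI_CLIENT_SESSION_ARGS": {},  # passed straight to curl_cffi.requests.AsyncSession(**kwargs)
    "VERIFY_SSL": True,
}

# Per-request browser TLS fingerprint, read via request.meta.get('impersonate'):
# yield aioscrapy.Request(url, meta={"impersonate": "chrome110"})
```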

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/httpx.py
RENAMED
@@ -1,9 +1,11 @@
 import ssl
 
 import httpx
+from httpx import HTTPError as HttpxError
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -27,6 +29,12 @@ class HttpxDownloadHandler(BaseDownloadHandler):
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except HttpxError as e:
+            raise DownloadError from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
@@ -68,7 +76,7 @@ class HttpxDownloadHandler(BaseDownloadHandler):
             status=response.status_code,
             headers=response.headers,
             body=content,
-            cookies=
+            cookies={j.name: j.value or '' for j in response.cookies.jar},
             encoding=response.encoding
         )
 
{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/playwright/__init__.py
RENAMED
@@ -1,11 +1,13 @@
 from functools import wraps
 
+from playwright._impl._api_types import Error
 from playwright.async_api._generated import Response as EventResponse
 
-from aioscrapy import Request
+from aioscrapy import Request, Spider
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
 from aioscrapy.core.downloader.handlers.playwright.driverpool import WebDriverPool
 from aioscrapy.core.downloader.handlers.playwright.webdriver import PlaywrightDriver
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import PlaywrightResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.tools import call_helper
@@ -24,7 +26,13 @@ class PlaywrightHandler(BaseDownloadHandler):
     def from_settings(cls, settings: Settings):
         return cls(settings)
 
-    async def download_request(self, request: Request, spider) -> PlaywrightResponse:
+    async def download_request(self, request: Request, spider: Spider) -> PlaywrightResponse:
+        try:
+            return await self._download_request(request, spider)
+        except Error as e:
+            raise DownloadError from e
+
+    async def _download_request(self, request: Request, spider) -> PlaywrightResponse:
         cookies = dict(request.cookies)
         timeout = request.meta.get('download_timeout', 30) * 1000
         user_agent = request.headers.get("User-Agent")

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/pyhttpx.py
RENAMED
@@ -1,9 +1,11 @@
 import asyncio
 
 import pyhttpx
+from pyhttpx.exception import BaseExpetion as PyHttpxError
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -22,6 +24,12 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except PyHttpxError as e:
+            raise DownloadError from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/downloader/handlers/requests.py
RENAMED
@@ -1,9 +1,11 @@
 import asyncio
 
 import requests
+from requests.exceptions import RequestException as RequestsError
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -21,6 +23,12 @@ class RequestsDownloadHandler(BaseDownloadHandler):
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except RequestsError as e:
+            raise DownloadError from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
@@ -48,7 +56,7 @@ class RequestsDownloadHandler(BaseDownloadHandler):
             status=response.status_code,
             headers=response.headers,
             body=response.content,
-            cookies=
+            cookies={k: v or '' for k, v in response.cookies.items()},
             encoding=response.encoding
         )
 

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/engine.py
RENAMED
@@ -42,8 +42,6 @@ class ExecutionEngine(object):
         self.signals = crawler.signals
         self.logformatter = crawler.logformatter
 
-        self.enqueue_cache_num = self.settings.getint("ENQUEUE_CACHE_NUM")
-        self.enqueue_cache: Queue = Queue(self.enqueue_cache_num)
         self.slot: Optional[Slot] = None
         self.spider: Optional[Spider] = None
         self.downloader: Optional[DownloaderTV] = None
@@ -53,7 +51,6 @@ class ExecutionEngine(object):
         self.running: bool = False
         self.unlock: bool = True
         self.finish: bool = False
-        self.enqueue_unlock: bool = True
 
     async def start(
             self,
@@ -70,7 +67,6 @@ class ExecutionEngine(object):
         while not self.finish:
             self.running and await self._next_request()
             await asyncio.sleep(1)
-            self.enqueue_cache_num != 1 and create_task(self._crawl())
             self.running and await self._spider_idle(self.spider)
 
     async def stop(self, reason: str = 'shutdown') -> None:
@@ -81,7 +77,6 @@ class ExecutionEngine(object):
 
         while not self.is_idle():
            await asyncio.sleep(0.2)
-            self.enqueue_cache_num != 1 and create_task(self._crawl())
         await self.close_spider(self.spider, reason=reason)
         await self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
         self.finish = True
@@ -212,27 +207,8 @@ class ExecutionEngine(object):
         return True
 
     async def crawl(self, request: Request) -> None:
-
-
-            create_task(self._next_request())
-        else:
-            await self.enqueue_cache.put(request)
-
-    async def _crawl(self) -> None:
-        if not self.enqueue_unlock:
-            return
-        self.enqueue_unlock = False
-        requests = []
-        for _ in range(self.enqueue_cache.qsize()):
-            try:
-                request = self.enqueue_cache.get_nowait()
-                requests.append(request)
-            except QueueEmpty:
-                break
-        if requests:
-            await call_helper(self.scheduler.enqueue_request_batch, requests)
-            create_task(self._next_request())
-        self.enqueue_unlock = True
+        await self.scheduler.enqueue_request(request)
+        # create_task(self._next_request())
 
     async def close_spider(self, spider: Spider, reason: str = 'cancelled') -> None:
         """Close (cancel) spider and clear all its outstanding requests"""
@@ -276,7 +252,6 @@ class ExecutionEngine(object):
         # method of 'has_pending_requests' has IO, so method of 'is_idle' execute twice
         if self.is_idle() \
                 and self.slot.start_requests is None \
-                and self.enqueue_unlock and self.enqueue_cache.empty() \
                 and not await self.scheduler.has_pending_requests() \
                 and self.is_idle():
             await self.stop(reason='finished')

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/scheduler.py
RENAMED
@@ -31,7 +31,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
     @classmethod
     async def from_crawler(cls, crawler: "aioscrapy.Crawler") -> "BaseScheduler":
         """
-        Factory method which receives the current :class:`~
+        Factory method which receives the current :class:`~aioscrapy.crawler.Crawler` object as argument.
         """
         return cls()
 
@@ -103,20 +103,27 @@ class Scheduler(BaseScheduler):
             queue: AbsQueue,
             spider: aioscrapy.Spider,
             stats=Optional[StatsCollector],
-            persist: bool = True
+            persist: bool = True,
+            cache_queue: Optional[AbsQueue] = None
     ):
+
         self.queue = queue
+        self.cache_queue = cache_queue
         self.spider = spider
         self.stats = stats
         self.persist = persist
 
     @classmethod
     async def from_crawler(cls: Type[SchedulerTV], crawler: "aioscrapy.Crawler") -> SchedulerTV:
+        cache_queue = None
+        if crawler.settings.getbool('USE_SCHEDULER_QUEUE_CACHE', False):
+            cache_queue = await load_instance('aioscrapy.queue.memory.SpiderPriorityQueue', spider=crawler.spider)
         instance = cls(
             await load_instance(crawler.settings['SCHEDULER_QUEUE_CLASS'], spider=crawler.spider),
             crawler.spider,
             stats=crawler.stats,
-            persist=crawler.settings.getbool('SCHEDULER_PERSIST', True)
+            persist=crawler.settings.getbool('SCHEDULER_PERSIST', True),
+            cache_queue=cache_queue
         )
 
         if crawler.settings.getbool('SCHEDULER_FLUSH_ON_START', False):
@@ -128,8 +135,20 @@ class Scheduler(BaseScheduler):
         return instance
 
     async def close(self, reason: str) -> None:
+
         if not self.persist:
             await self.flush()
+            return
+
+        # If persistence is enabled, push the cached requests back to the distributed queue (e.g. Redis)
+        if self.cache_queue is not None:
+            while True:
+                temp = []
+                async for request in self.cache_queue.pop(2000):
+                    temp.append(request)
+                temp and await self.queue.push_batch(temp)
+                if len(temp) < 2000:
+                    break
 
     async def flush(self) -> None:
         await call_helper(self.queue.clear)
@@ -141,16 +160,37 @@ class Scheduler(BaseScheduler):
         return True
 
     async def enqueue_request(self, request: aioscrapy.Request) -> bool:
-
+        """
+        If the cache queue is enabled (USE_SCHEDULER_QUEUE_CACHE), requests are pushed to the cache queue first.
+        """
+        if self.cache_queue is not None:
+            await call_helper(self.cache_queue.push, request)
+        else:
+            await call_helper(self.queue.push, request)
         if self.stats:
             self.stats.inc_value(self.queue.inc_key, spider=self.spider)
         return True
 
     async def next_request(self, count: int = 1) -> Optional[aioscrapy.Request]:
+        """
+        If the cache queue is enabled (USE_SCHEDULER_QUEUE_CACHE), requests are taken from the cache queue first, then from the distributed queue (e.g. Redis).
+        """
+        flag = False
+        if self.cache_queue is not None:
+            async for request in self.cache_queue.pop(count):
+                if request and self.stats:
+                    self.stats.inc_value(self.queue.inc_key, spider=self.spider)
+                yield request
+                flag = True
+
+        if flag:
+            return
+
         async for request in self.queue.pop(count):
             if request and self.stats:
                 self.stats.inc_value(self.queue.inc_key, spider=self.spider)
             yield request
 
     async def has_pending_requests(self) -> bool:
-        return await call_helper(self.queue.len)
+        return await call_helper(self.queue.len) if self.cache_queue is None \
+            else (await call_helper(self.queue.len) + await call_helper(self.cache_queue.len)) > 0
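
Together with the engine change above (the engine-level `enqueue_cache` is gone and `crawl()` now hands requests straight to the scheduler), the scheduler can keep an in-memory `SpiderPriorityQueue` in front of the distributed queue. A short settings sketch; `USE_SCHEDULER_QUEUE_CACHE` is the new flag, the other keys appear in `from_crawler()` above, and the Redis queue class path is an assumption:

```python
# Sketch only: enabling the scheduler-level cache queue.
custom_settings = {
    # New in 2.1.2: buffer requests in aioscrapy.queue.memory.SpiderPriorityQueue
    # and drain it before falling back to the distributed queue.
    "USE_SCHEDULER_QUEUE_CACHE": True,
    # Existing knobs referenced in from_crawler() above:
    "SCHEDULER_QUEUE_CLASS": "aioscrapy.queue.redis.SpiderPriorityQueue",  # assumed class path
    "SCHEDULER_PERSIST": True,  # on close(), cached requests are pushed back via queue.push_batch
}
```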

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/core/scraper.py
RENAMED
@@ -110,8 +110,8 @@ class Scraper:
                 await self.handle_spider_error(e, request, result)
             else:
                 await self.handle_spider_output(output, request, result)
-        except BaseException:
-
+        except BaseException as e:
+            await self.handle_spider_error(e, request, result)
         finally:
             if isinstance(result, PlaywrightResponse):
                 await result.release()
@@ -161,17 +161,23 @@ class Scraper:
         """Iter each Request/Item (given in the output parameter) returned from the given spider"""
         if not result:
             return
-
+        parser_successful = True
         while True:
             try:
                 output = await result.__anext__()
             except StopAsyncIteration:
                 break
             except Exception as e:
+                parser_successful = False
                 await self.handle_spider_error(e, request, response)
             else:
                 await self._process_spidermw_output(output, request, response)
 
+        self.spider.dupefilter and \
+            not request.dont_filter and \
+            parser_successful and \
+            await self.spider.dupefilter.done(request, done_type="parse_done")
+
     async def _process_spidermw_output(self, output: Any, request: Request, response: Response) -> None:
         """Process each Request/Item (given in the output parameter) returned from the given spider"""
 

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/crawler.py
RENAMED
@@ -234,9 +234,12 @@ class CrawlerProcess(CrawlerRunner):
         finally:
             await self.recycle_db_connect()
 
-    def start(self) -> None:
+    def start(self, use_windows_selector_eventLoop: bool = False) -> None:
         if sys.platform.startswith('win'):
-
+            if use_windows_selector_eventLoop:
+                asyncio.set_event_loop_policy(asyncio.windows_events.WindowsSelectorEventLoopPolicy())
+            else:
+                asyncio.set_event_loop(asyncio.windows_events.ProactorEventLoop())
         else:
             try:
                 import uvloop
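
A hedged usage sketch of the new keyword on Windows; how spiders get registered on the process follows the project's usual API and is not shown in this diff:

```python
# Sketch only: opting into the selector event loop on Windows.
from aioscrapy.crawler import CrawlerProcess

process = CrawlerProcess()
# ... register spiders on `process` as usual ...
process.start(use_windows_selector_eventLoop=True)  # default (False) keeps the ProactorEventLoop
```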

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/db/aiomongo.py
RENAMED
@@ -1,7 +1,9 @@
 from motor.motor_asyncio import AsyncIOMotorClient
+from pymongo.errors import NetworkTimeout
 
 import aioscrapy
 from aioscrapy.db.absmanager import AbsDBPoolManager
+from loguru import logger
 
 
 class MongoExecutor:
@@ -9,10 +11,16 @@ class MongoExecutor:
         self.alias = alias
         self.pool_manager = pool_manager
 
-    async def insert(self, table_name, values, db_name=None):
+    async def insert(self, table_name, values, db_name=None, ordered=False, retry_times=3):
         client, db_name_default = self.pool_manager.get_pool(self.alias)
         db_name = db_name or db_name_default
-
+        for _ in range(retry_times):
+            try:
+                return await client[f'{db_name}'][f'{table_name}'].insert_many(values, ordered=ordered)
+            except NetworkTimeout:
+                logger.warning("mongo insert error by NetworkTimeout, retrying...")
+
+        raise NetworkTimeout
 
     def __getattr__(self, table_name: str):
         client, db_name_default = self.pool_manager.get_pool(self.alias)
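
`MongoExecutor.insert` now performs unordered bulk writes by default and retries `NetworkTimeout` up to `retry_times` before re-raising. A small sketch; how the executor instance is obtained (via the db pool manager) is outside this diff, so the `mongo` name below is hypothetical:

```python
# Sketch only: the new ordered/retry_times parameters (inside an async function).
docs = [{"_id": 1, "title": "a"}, {"_id": 2, "title": "b"}]
# ordered=False keeps inserting past individual document errors;
# NetworkTimeout is retried up to retry_times times, then re-raised.
await mongo.insert("articles", docs, db_name="news", ordered=False, retry_times=5)
```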

{aio-scrapy-2.0.10 → aio-scrapy-2.1.2}/aioscrapy/dupefilters/__init__.py
RENAMED
@@ -1,3 +1,4 @@
+from typing import Literal
 from abc import ABCMeta, abstractmethod
 
 from aioscrapy import Request, Spider
@@ -37,3 +38,6 @@ class DupeFilterBase(metaclass=ABCMeta):
             self.logdupes = False
 
         spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
+
+    async def done(self, request: Request, done_type: Literal["request_done", "parse_done"]) -> None:
+        """ deal fingerprint on task successful """
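
The new `done()` hook closes the loop started in the downloader and scraper diffs above: it is awaited with `done_type="request_done"` after a successful download and with `done_type="parse_done"` after the spider callback finishes without raising, and only for requests with `dont_filter` unset. A sketch of a custom filter using it; the body is illustrative and is not the implementation that ships in `aioscrapy/dupefilters/redis.py`:

```python
# Sketch only: a custom dupefilter reacting to the new done() hook.
from typing import Literal

from aioscrapy import Request
from aioscrapy.dupefilters import DupeFilterBase


class TwoPhaseDupeFilter(DupeFilterBase):  # other required methods omitted in this sketch
    async def done(self, request: Request, done_type: Literal["request_done", "parse_done"]) -> None:
        # "request_done": the downloader finished the request (downloader/__init__.py above);
        # "parse_done": the callback consumed the response without raising (scraper.py above).
        if done_type == "parse_done":
            ...  # e.g. promote the request fingerprint from a "pending" set to a "seen" set
```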