aio-scrapy 2.1.0__tar.gz → 2.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio-scrapy-2.1.0/aio_scrapy.egg-info → aio-scrapy-2.1.3}/PKG-INFO +4 -4
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/README.md +1 -1
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3/aio_scrapy.egg-info}/PKG-INFO +4 -4
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/requires.txt +2 -2
- aio-scrapy-2.1.3/aioscrapy/VERSION +1 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/__init__.py +10 -2
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/aiohttp.py +8 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/curl_cffi.py +10 -4
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/httpx.py +9 -1
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/__init__.py +10 -2
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/pyhttpx.py +8 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/requests.py +9 -2
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/scheduler.py +0 -1
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/scraper.py +13 -2
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/db/aiomongo.py +10 -2
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/__init__.py +8 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/redis.py +65 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/exceptions.py +5 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/retry.py +6 -55
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/mongo.py +7 -2
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/spiders/__init__.py +2 -1
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/spiders/single.tmpl +6 -5
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/python.py +1 -6
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/setup.py +2 -2
- aio-scrapy-2.1.0/aioscrapy/VERSION +0 -1
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/LICENSE +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/MANIFEST.in +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/SOURCES.txt +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/dependency_links.txt +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/entry_points.txt +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/not-zip-safe +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/top_level.txt +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/__main__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/cmdline.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/commands/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/commands/crawl.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/commands/genspider.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/commands/list.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/commands/runspider.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/commands/settings.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/commands/startproject.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/commands/version.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/engine.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/crawler.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/db/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/db/absmanager.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/db/aiomysql.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/db/aiopg.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/db/aiorabbitmq.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/db/aioredis.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/disk.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/headers.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/request/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/request/form.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/request/json_request.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/response/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/response/html.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/response/playwright.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/response/text.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/http/response/xml.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/defaultheaders.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/downloadtimeout.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/ja3fingerprint.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/stats.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/useragent.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/closespider.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/corestats.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/logstats.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/metric.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/extensions/throttle.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/csv.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/execl.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/mysql.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/pg.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/depth.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/httperror.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/offsite.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/referer.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/spider/urllength.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/link.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/logformatter.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/middleware/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/middleware/absmanager.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/middleware/downloader.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/middleware/extension.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/middleware/itempipeline.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/middleware/spider.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/process.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/proxy/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/proxy/redis.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/queue/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/queue/memory.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/queue/rabbitmq.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/queue/redis.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/scrapyd/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/scrapyd/runner.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/serializer.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/settings/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/settings/default_settings.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/signalmanager.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/signals.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/spiderloader.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/statscollectors.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/project/aioscrapy.cfg +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/middlewares.py.tmpl +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/pipelines.py.tmpl +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/settings.py.tmpl +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/project/module/spiders/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/spiders/basic.tmpl +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/__init__.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/conf.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/curl.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/decorators.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/deprecate.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/httpobj.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/log.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/misc.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/ossignal.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/project.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/reqser.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/request.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/response.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/signal.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/spider.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/template.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/tools.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/trackref.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/url.py +0 -0
- {aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/setup.cfg +0 -0
{aio-scrapy-2.1.0/aio_scrapy.egg-info → aio-scrapy-2.1.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: aio-scrapy
-Version: 2.1.0
+Version: 2.1.3
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
@@ -33,7 +33,7 @@ Requires-Dist: aiomysql>=0.1.1; extra == "all"
 Requires-Dist: httpx[http2]>=0.23.0; extra == "all"
 Requires-Dist: aio-pika>=8.1.1; extra == "all"
 Requires-Dist: cryptography; extra == "all"
-Requires-Dist: motor>=
+Requires-Dist: motor>=2.1.0; extra == "all"
 Requires-Dist: pyhttpx>=2.10.1; extra == "all"
 Requires-Dist: asyncpg>=0.27.0; extra == "all"
 Requires-Dist: XlsxWriter>=3.1.2; extra == "all"
@@ -48,7 +48,7 @@ Requires-Dist: httpx[http2]>=0.23.0; extra == "httpx"
 Provides-Extra: aio-pika
 Requires-Dist: aio-pika>=8.1.1; extra == "aio-pika"
 Provides-Extra: mongo
-Requires-Dist: motor>=
+Requires-Dist: motor>=2.1.0; extra == "mongo"
 Provides-Extra: playwright
 Requires-Dist: playwright>=1.31.1; extra == "playwright"
 Provides-Extra: pyhttpx
@@ -89,7 +89,7 @@ The quick way:
 
 ```shell
 # Install the latest aio-scrapy
-pip install git+https://github.com/
+pip install git+https://github.com/ConlinH/aio-scrapy
 
 # default
 pip install aio-scrapy

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3/aio_scrapy.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: aio-scrapy
-Version: 2.1.0
+Version: 2.1.3
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
@@ -33,7 +33,7 @@ Requires-Dist: aiomysql>=0.1.1; extra == "all"
 Requires-Dist: httpx[http2]>=0.23.0; extra == "all"
 Requires-Dist: aio-pika>=8.1.1; extra == "all"
 Requires-Dist: cryptography; extra == "all"
-Requires-Dist: motor>=
+Requires-Dist: motor>=2.1.0; extra == "all"
 Requires-Dist: pyhttpx>=2.10.1; extra == "all"
 Requires-Dist: asyncpg>=0.27.0; extra == "all"
 Requires-Dist: XlsxWriter>=3.1.2; extra == "all"
@@ -48,7 +48,7 @@ Requires-Dist: httpx[http2]>=0.23.0; extra == "httpx"
 Provides-Extra: aio-pika
 Requires-Dist: aio-pika>=8.1.1; extra == "aio-pika"
 Provides-Extra: mongo
-Requires-Dist: motor>=
+Requires-Dist: motor>=2.1.0; extra == "mongo"
 Provides-Extra: playwright
 Requires-Dist: playwright>=1.31.1; extra == "playwright"
 Provides-Extra: pyhttpx
@@ -89,7 +89,7 @@ The quick way:
 
 ```shell
 # Install the latest aio-scrapy
-pip install git+https://github.com/
+pip install git+https://github.com/ConlinH/aio-scrapy
 
 # default
 pip install aio-scrapy

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aio_scrapy.egg-info/requires.txt
@@ -20,7 +20,7 @@ aiomysql>=0.1.1
 httpx[http2]>=0.23.0
 aio-pika>=8.1.1
 cryptography
-motor>=
+motor>=2.1.0
 pyhttpx>=2.10.1
 asyncpg>=0.27.0
 XlsxWriter>=3.1.2
@@ -39,7 +39,7 @@ pillow>=9.4.0
 httpx[http2]>=0.23.0
 
 [mongo]
-motor>=
+motor>=2.1.0
 
 [pg]
 asyncpg>=0.27.0

aio-scrapy-2.1.3/aioscrapy/VERSION (added)
@@ -0,0 +1 @@
+2.1.3

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/__init__.py
@@ -138,14 +138,16 @@ class Downloader(BaseDownloader):
 
     @classmethod
     async def from_crawler(cls, crawler) -> "Downloader":
+        df = crawler.settings.get('DUPEFILTER_CLASS') and await load_instance(crawler.settings['DUPEFILTER_CLASS'],
+                                                                              crawler=crawler)
+        crawler.spider.dupefilter = df  # bind the dupefilter to the Spider so the DUPEFILTER_CLASS hook can be called when parsing succeeds
         return cls(
             crawler,
             await call_helper(DownloadHandlerManager.for_crawler, crawler),
             await call_helper(DownloaderMiddlewareManager.from_crawler, crawler),
             proxy=crawler.settings.get("PROXY_HANDLER") and await load_instance(crawler.settings["PROXY_HANDLER"],
                                                                                 crawler=crawler),
-            dupefilter=
-                crawler.settings['DUPEFILTER_CLASS'], crawler=crawler)
+            dupefilter=df
         )
 
     async def fetch(self, request: Request) -> None:
@@ -204,11 +206,17 @@ class Downloader(BaseDownloader):
             slot.transferring.remove(request)
             slot.active.remove(request)
             self.active.remove(request)
+
             if isinstance(result, Response):
                 await self.signals.send_catch_log(signal=signals.response_downloaded,
                                                   response=result,
                                                   request=request,
                                                   spider=self.spider)
+            # decide whether the fingerprint should be removed
+            self.dupefilter and \
+                not request.dont_filter and \
+                await self.dupefilter.done(request, done_type="request_ok" if isinstance(result, Response) else "request_err")
+
             await self._call_engine(result, request)
             await self._process_queue(slot)

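The download path now reports the outcome of every non-exempt request to the dupefilter. Written as a plain conditional instead of the chained boolean above, the hook amounts to the following sketch (a restatement of the diff, not additional aioscrapy API):

```python
async def notify_dupefilter_after_download(dupefilter, request, ok: bool) -> None:
    """Skip when no dupefilter is configured or the request set dont_filter,
    otherwise report whether the download produced a response or an error."""
    if dupefilter is None or request.dont_filter:
        return
    await dupefilter.done(request, done_type="request_ok" if ok else "request_err")
```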
{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/aiohttp.py
@@ -4,9 +4,11 @@ import ssl
 from typing import Optional
 
 import aiohttp
+from aiohttp.client_exceptions import ClientError
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -32,6 +34,12 @@ class AioHttpDownloadHandler(BaseDownloadHandler):
         return self.session
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except ClientError as e:
+            raise DownloadError(e) from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'verify_ssl': request.meta.get('verify_ssl', self.verify_ssl),
             'timeout': request.meta.get('download_timeout', 180),

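All of the handler changes in this release apply the same pattern: the real request logic moves into a private `_download_request`, while the public `download_request` catches the backend's own exception family and re-raises it as aioscrapy's `DownloadError`, so the retry layer only has to know one exception type. A self-contained sketch of the pattern with illustrative names (not the package's actual classes):

```python
import asyncio


class UnifiedDownloadError(Exception):
    """Stand-in for aioscrapy.exceptions.DownloadError in this sketch."""


async def fetch_with_unified_errors(fetch_coro, *, backend_errors):
    """Await a backend-specific download coroutine and normalize its failures.

    backend_errors is the backend's exception family, e.g. aiohttp's ClientError
    or httpx's HTTPError, mirroring the handlers in this diff.
    """
    try:
        return await fetch_coro
    except backend_errors as e:
        # "from e" keeps the original traceback chained for debugging
        raise UnifiedDownloadError(e) from e


async def _demo():
    async def failing_fetch():
        raise ConnectionError("backend went away")

    try:
        await fetch_with_unified_errors(failing_fetch(), backend_errors=(ConnectionError,))
    except UnifiedDownloadError as e:
        print("caught unified error:", e)


if __name__ == "__main__":
    asyncio.run(_demo())
```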
{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/curl_cffi.py
@@ -1,9 +1,9 @@
-import
-
+from curl_cffi.curl import CurlError
 from curl_cffi.requests import AsyncSession
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -21,6 +21,12 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except CurlError as e:
+            raise DownloadError(e) from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
@@ -52,8 +58,8 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
             str(response.url),
             status=response.status_code,
             headers=response.headers,
-            body=response.
-            cookies=
+            body=response.content,
+            cookies={j.name: j.value or '' for j in response.cookies.jar},
             encoding=response.encoding
         )

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/httpx.py
@@ -1,9 +1,11 @@
 import ssl
 
 import httpx
+from httpx import HTTPError as HttpxError
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -27,6 +29,12 @@ class HttpxDownloadHandler(BaseDownloadHandler):
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except HttpxError as e:
+            raise DownloadError(e) from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
@@ -68,7 +76,7 @@ class HttpxDownloadHandler(BaseDownloadHandler):
             status=response.status_code,
             headers=response.headers,
             body=content,
-            cookies=
+            cookies={j.name: j.value or '' for j in response.cookies.jar},
             encoding=response.encoding
         )

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/playwright/__init__.py
@@ -1,11 +1,13 @@
 from functools import wraps
 
+from playwright._impl._api_types import Error
 from playwright.async_api._generated import Response as EventResponse
 
-from aioscrapy import Request
+from aioscrapy import Request, Spider
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
 from aioscrapy.core.downloader.handlers.playwright.driverpool import WebDriverPool
 from aioscrapy.core.downloader.handlers.playwright.webdriver import PlaywrightDriver
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import PlaywrightResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.tools import call_helper
@@ -24,7 +26,13 @@ class PlaywrightHandler(BaseDownloadHandler):
     def from_settings(cls, settings: Settings):
         return cls(settings)
 
-    async def download_request(self, request: Request, spider) -> PlaywrightResponse:
+    async def download_request(self, request: Request, spider: Spider) -> PlaywrightResponse:
+        try:
+            return await self._download_request(request, spider)
+        except Error as e:
+            raise DownloadError(e) from e
+
+    async def _download_request(self, request: Request, spider) -> PlaywrightResponse:
         cookies = dict(request.cookies)
         timeout = request.meta.get('download_timeout', 30) * 1000
         user_agent = request.headers.get("User-Agent")

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/pyhttpx.py
@@ -1,9 +1,11 @@
 import asyncio
 
 import pyhttpx
+from pyhttpx.exception import BaseExpetion as PyHttpxError
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -22,6 +24,12 @@ class PyhttpxDownloadHandler(BaseDownloadHandler):
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except PyHttpxError as e:
+            raise DownloadError(e) from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/downloader/handlers/requests.py
@@ -1,9 +1,11 @@
 import asyncio
 
 import requests
+from requests.exceptions import RequestException as RequestsError
 
 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
+from aioscrapy.exceptions import DownloadError
 from aioscrapy.http import HtmlResponse
 from aioscrapy.settings import Settings
 from aioscrapy.utils.log import logger
@@ -14,13 +16,18 @@ class RequestsDownloadHandler(BaseDownloadHandler):
     def __init__(self, settings):
         self.settings: Settings = settings
         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)
-        self.loop = asyncio.get_running_loop()
 
     @classmethod
     def from_settings(cls, settings: Settings):
         return cls(settings)
 
     async def download_request(self, request: Request, _) -> HtmlResponse:
+        try:
+            return await self._download_request(request)
+        except RequestsError as e:
+            raise DownloadError(e) from e
+
+    async def _download_request(self, request: Request) -> HtmlResponse:
         kwargs = {
             'timeout': self.settings.get('DOWNLOAD_TIMEOUT'),
             'cookies': dict(request.cookies),
@@ -48,7 +55,7 @@ class RequestsDownloadHandler(BaseDownloadHandler):
             status=response.status_code,
             headers=response.headers,
             body=response.content,
-            cookies=
+            cookies={k: v or '' for k, v in response.cookies.items()},
             encoding=response.encoding
         )

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/scheduler.py
@@ -191,7 +191,6 @@ class Scheduler(BaseScheduler):
             self.stats.inc_value(self.queue.inc_key, spider=self.spider)
             yield request
 
-
     async def has_pending_requests(self) -> bool:
         return await call_helper(self.queue.len) if self.cache_queue is None \
             else (await call_helper(self.queue.len) + await call_helper(self.cache_queue.len)) > 0

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/core/scraper.py
@@ -110,9 +110,14 @@ class Scraper:
                 await self.handle_spider_error(e, request, result)
             else:
                 await self.handle_spider_output(output, request, result)
-        except BaseException:
-
+        except BaseException as e:
+            await self.handle_spider_error(e, request, result)
         finally:
+            # decide whether the fingerprint should be removed
+            self.spider.dupefilter and \
+                not request.dont_filter and \
+                await self.spider.dupefilter.done(request, done_type="parse_ok" if getattr(request, "parse_ok", False) else "parse_err")
+
             if isinstance(result, PlaywrightResponse):
                 await result.release()
 
@@ -162,16 +167,22 @@ class Scraper:
         if not result:
             return
 
+        parse_ok = True
         while True:
             try:
                 output = await result.__anext__()
             except StopAsyncIteration:
                 break
             except Exception as e:
+                parse_ok = False
                 await self.handle_spider_error(e, request, response)
             else:
                 await self._process_spidermw_output(output, request, response)
 
+        self.spider.dupefilter and \
+            not request.dont_filter and \
+            setattr(request, "parse_ok", parse_ok)
+
     async def _process_spidermw_output(self, output: Any, request: Request, response: Response) -> None:
         """Process each Request/Item (given in the output parameter) returned from the given spider"""

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/db/aiomongo.py
@@ -1,7 +1,9 @@
 from motor.motor_asyncio import AsyncIOMotorClient
+from pymongo.errors import NetworkTimeout
 
 import aioscrapy
 from aioscrapy.db.absmanager import AbsDBPoolManager
+from loguru import logger
 
 
 class MongoExecutor:
@@ -9,10 +11,16 @@ class MongoExecutor:
         self.alias = alias
         self.pool_manager = pool_manager
 
-    async def insert(self, table_name, values, db_name=None):
+    async def insert(self, table_name, values, db_name=None, ordered=False, retry_times=3):
         client, db_name_default = self.pool_manager.get_pool(self.alias)
         db_name = db_name or db_name_default
-
+        for _ in range(retry_times):
+            try:
+                return await client[f'{db_name}'][f'{table_name}'].insert_many(values, ordered=ordered)
+            except NetworkTimeout:
+                logger.warning("mongo insert error by NetworkTimeout, retrying...")
+
+        raise NetworkTimeout
 
     def __getattr__(self, table_name: str):
         client, db_name_default = self.pool_manager.get_pool(self.alias)

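The executor's new keyword arguments can be used directly as well. A hedged usage sketch (the `db_manager.mongo.executor(alias)` lookup mirrors the pipeline code later in this diff; the collection name is hypothetical):

```python
from aioscrapy.db import db_manager


async def save_rows(rows):
    executor = db_manager.mongo.executor("default")
    # ordered=False lets MongoDB keep inserting past individual document errors;
    # retry_times bounds the NetworkTimeout retry loop added in 2.1.3
    return await executor.insert(
        "quotes",          # hypothetical collection name
        rows,
        db_name=None,      # None falls back to the pool's default database
        ordered=False,
        retry_times=3,
    )
```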
{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/__init__.py
@@ -1,3 +1,4 @@
+from typing import Literal
 from abc import ABCMeta, abstractmethod
 
 from aioscrapy import Request, Spider
@@ -37,3 +38,10 @@ class DupeFilterBase(metaclass=ABCMeta):
             self.logdupes = False
 
         spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
+
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ) -> None:
+        """Control fingerprint removal according to done_type."""

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/dupefilters/redis.py
@@ -1,3 +1,5 @@
+from typing import Literal
+
 from aioscrapy import Request
 from aioscrapy.db import db_manager
 from aioscrapy.dupefilters import DupeFilterBase
@@ -128,5 +130,68 @@ class RedisBloomDupeFilter(RedisRFPDupeFilter):
         return False
 
 
+class ExRedisBloomDupeFilter(RedisBloomDupeFilter):
+
+    def __init__(self, server, key, key_set, ttl, debug, bit, hash_number, keep_on_close, info):
+        super().__init__(server, key, debug, bit, hash_number, keep_on_close, info)
+        self.key_set = key_set
+        self.ttl = ttl
+
+    @classmethod
+    async def from_crawler(cls, crawler: "aioscrapy.crawler.Crawler"):
+        server = db_manager.redis.queue
+        dupefilter_key = crawler.settings.get("SCHEDULER_DUPEFILTER_KEY", '%(spider)s:bloomfilter')
+        keep_on_close = crawler.settings.getbool("KEEP_DUPEFILTER_DATA_ON_CLOSE", True)
+        key = dupefilter_key % {'spider': crawler.spider.name}
+        debug = crawler.settings.getbool('DUPEFILTER_DEBUG', False)
+        info = crawler.settings.getbool('DUPEFILTER_INFO', False)
+        bit = crawler.settings.getint('BLOOMFILTER_BIT', 30)
+        hash_number = crawler.settings.getint('BLOOMFILTER_HASH_NUMBER', 6)
+        ttl = crawler.settings.getint('DUPEFILTER_SET_KEY_TTL', 180)
+        return cls(server, key=key, key_set=key + "_set", ttl=ttl, debug=debug, bit=bit, hash_number=hash_number,
+                   keep_on_close=keep_on_close, info=info)
+
+    async def request_seen(self, request: Request) -> bool:
+        fp = await self.bf.exists(request.fingerprint)
+        if fp:
+            return True
+        async with self.server.pipeline() as pipe:
+            pipe.sadd(self.key_set, request.fingerprint)
+            pipe.expire(self.key_set, self.ttl)
+            ret, _ = await pipe.execute()
+        return ret == 0
+
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ):
+        if done_type == "request_ok" or done_type == "request_err":
+            await self.server.srem(self.key_set, request.fingerprint)
+        elif done_type == "parse_ok":
+            await self.bf.insert(request.fingerprint)
+
+    async def close(self, reason=''):
+        if not self.keep_on_close:
+            await self.clear()
+            await self.server.delete(self.key_set)
+
+
+class ExRedisRFPDupeFilter(RedisRFPDupeFilter):
+
+    async def done(
+            self,
+            request: Request,
+            done_type: Literal["request_ok", "request_err", "parse_ok", "parse_err"]
+    ):
+        # remove the fingerprint from the Redis set when the request or the parse fails
+        if done_type == "request_err" or done_type == "parse_err":
+            await self.server.srem(self.key, request.fingerprint)
+
+
 RFPDupeFilter = RedisRFPDupeFilter
+ExRFPDupeFilter = ExRedisRFPDupeFilter
 BloomDupeFilter = RedisBloomDupeFilter
+ExBloomDupeFilter = ExRedisBloomDupeFilter
+BloomSetDupeFilter = ExRedisBloomDupeFilter

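The new set-backed filters keep an "in flight" Redis set next to the bloom filter: a fingerprint is only committed to the bloom filter once parsing succeeds, and it is dropped again when the request or the parse fails, so failed URLs can be re-enqueued. A hedged configuration sketch built from the settings `from_crawler` reads above (values shown are the defaults from that code; whether they belong in the project settings module or a spider's `custom_settings` depends on your setup):

```python
# assumed to be resolved through DUPEFILTER_CLASS, as the downloader change
# earlier in this diff shows
DUPEFILTER_SETTINGS = {
    "DUPEFILTER_CLASS": "aioscrapy.dupefilters.redis.BloomSetDupeFilter",
    "SCHEDULER_DUPEFILTER_KEY": "%(spider)s:bloomfilter",
    "KEEP_DUPEFILTER_DATA_ON_CLOSE": True,
    "DUPEFILTER_DEBUG": False,
    "DUPEFILTER_INFO": False,
    "BLOOMFILTER_BIT": 30,
    "BLOOMFILTER_HASH_NUMBER": 6,
    "DUPEFILTER_SET_KEY_TTL": 180,  # seconds an in-flight fingerprint stays in the Redis set
}
```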
{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/downloader/retry.py
@@ -10,70 +10,21 @@ Failed pages are collected on the scraping process and rescheduled at the end,
 once the spider has finished crawling all regular (non failed) pages.
 """
 from typing import Optional, Union
-
+
+from anyio import EndOfStream
 
 try:
     from asyncio.exceptions import TimeoutError
 except:
     from concurrent.futures._base import TimeoutError
 
-
-
-try:
-    from aiohttp.client_exceptions import ClientError
-
-    NEED_RETRY_ERROR += (ClientError,)
-except ImportError:
-    pass
-
-try:
-    from anyio import EndOfStream
-
-    NEED_RETRY_ERROR += (EndOfStream,)
-except ImportError:
-    pass
-
-try:
-    from httpx import HTTPError as HttpxError
-
-    NEED_RETRY_ERROR += (HttpxError,)
-except ImportError:
-    pass
-
-try:
-    from pyhttpx.exception import BaseExpetion as PyHttpxError
-
-    NEED_RETRY_ERROR += (PyHttpxError,)
-except ImportError:
-    pass
-
-try:
-    from requests.exceptions import RequestException as RequestsError
-
-    NEED_RETRY_ERROR += (RequestsError,)
-except ImportError:
-    pass
-
-try:
-    from playwright._impl._api_types import Error as PlaywrightError
-
-    NEED_RETRY_ERROR += (PlaywrightError,)
-except ImportError:
-    pass
-
-
-try:
-    from curl_cffi.curl import CurlError
-
-    NEED_RETRY_ERROR += (CurlError,)
-except ImportError:
-    pass
-
-from aioscrapy.exceptions import NotConfigured
+from aioscrapy.exceptions import ProxyException, DownloadError, NotConfigured
 from aioscrapy.http.request import Request
 from aioscrapy.spiders import Spider
-from aioscrapy.utils.python import global_object_name
 from aioscrapy.utils.log import logger as retry_logger
+from aioscrapy.utils.python import global_object_name
+
+NEED_RETRY_ERROR = (TimeoutError, ConnectionRefusedError, IOError, ProxyException, DownloadError, EndOfStream)
 
 
 def get_retry_request(

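With every handler now raising `DownloadError` (and the proxy layer `ProxyException`), the per-backend optional imports are gone and `NEED_RETRY_ERROR` is one fixed tuple. A minimal sketch of the kind of check retry code can make against it (the `dont_retry` meta key follows Scrapy convention and is an assumption here):

```python
from aioscrapy.libs.downloader.retry import NEED_RETRY_ERROR


def should_retry(exception, request) -> bool:
    # one isinstance check now covers aiohttp, httpx, requests, pyhttpx,
    # playwright and curl_cffi failures, since they all arrive as DownloadError
    return isinstance(exception, NEED_RETRY_ERROR) and not request.meta.get("dont_retry", False)
```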
{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/libs/pipelines/mongo.py
@@ -9,6 +9,8 @@ class MongoPipeline(DBPipelineBase):
     def __init__(self, settings, db_type: str):
         super().__init__(settings, db_type)
         self.db_cache = {}
+        self.ordered_cache = {}
+        self.retry_times = settings.getint("MONGO_TIMEOUT_RETRY_TIMES", 3)
 
     @classmethod
     def from_settings(cls, settings):
@@ -17,17 +19,19 @@ class MongoPipeline(DBPipelineBase):
     def parse_item_to_cache(self, item: dict, save_info: dict):
         db_name = save_info.get('db_name')
         table_name = save_info.get('table_name')
+        ordered = save_info.get('ordered', False)
         assert table_name is not None, 'please set table_name'
         db_alias = save_info.get('db_alias', ['default'])
         if isinstance(db_alias, str):
             db_alias = [db_alias]
 
-        cache_key = ''.join(db_alias) + (db_name or '') + table_name
+        cache_key = ''.join(db_alias) + (db_name or '') + table_name + str(ordered)
 
         if self.table_cache.get(cache_key) is None:
             self.db_alias_cache[cache_key] = db_alias
             self.table_cache[cache_key] = table_name
             self.db_cache[cache_key] = db_name
+            self.ordered_cache[cache_key] = ordered
             self.item_cache[cache_key] = []
 
         self.item_cache[cache_key].append(item)
@@ -40,7 +44,8 @@ class MongoPipeline(DBPipelineBase):
         try:
             executor = db_manager.mongo.executor(alias)
             result = await executor.insert(
-                table_name, self.item_cache[cache_key], db_name=self.db_cache[cache_key]
+                table_name, self.item_cache[cache_key], db_name=self.db_cache[cache_key],
+                ordered=self.ordered_cache[cache_key], retry_times=self.retry_times
             )
             logger.info(
                 f'table:{alias}->{table_name} sum:{len(self.item_cache[cache_key])} ok:{len(result.inserted_ids)}'

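On the pipeline side, the retry count comes from the new `MONGO_TIMEOUT_RETRY_TIMES` setting and the `ordered` flag is read from each item's `save_info`. How `save_info` reaches the pipeline is unchanged and not part of this diff; the sketch below only lists the keys the pipeline reads, with hypothetical values:

```python
# setting read in MongoPipeline.__init__ (default 3)
MONGO_TIMEOUT_RETRY_TIMES = 3

# keys consumed by parse_item_to_cache(item, save_info)
save_info = {
    "db_alias": "default",   # a single alias or a list of aliases
    "db_name": None,         # None falls back to the connection's default database
    "table_name": "quotes",  # required; hypothetical collection name
    "ordered": False,        # new in 2.1.3, forwarded to insert_many(ordered=...)
}
```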
{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/spiders/__init__.py
@@ -22,6 +22,7 @@ class Spider(object):
 
     name: Optional[str] = None
     proxy: Optional["aioscrapy.proxy.AbsProxy"] = None
+    dupefilter: Optional["aioscrapy.dupefilters.DupeFilterBase"] = None
     custom_settings: Optional[dict] = None
     stats: Optional[StatsCollector] = None
 
@@ -77,7 +78,7 @@ class Spider(object):
             yield Request(url)
 
     async def request_from_dict(self, d: dict):
-        """
+        """Override this method in a subclass to build a Request object from the JSON stored in the queue as needed."""
         pass
 
     async def _parse(self, response: Response, **kwargs):

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/templates/spiders/single.tmpl
@@ -24,11 +24,12 @@ class $classname(Spider):
         pass
 
     async def parse(self, response):
-
-
-
-
-
+        for quote in response.css('div.quote'):
+            item = {
+                'author': quote.xpath('span/small/text()').get(),
+                'text': quote.css('span.text::text').get(),
+            }
+            yield item
 
     async def process_item(self, item):
         logger.info(item)

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/aioscrapy/utils/python.py
@@ -1,16 +1,12 @@
 """
 This module contains essential stuff that should've come with Python itself ;)
 """
-import errno
 import gc
-import inspect
 import re
 import sys
-import warnings
 import weakref
-from functools import
+from functools import wraps
 
-from aioscrapy.exceptions import AioScrapyDeprecationWarning
 from aioscrapy.utils.decorators import deprecated
 
 
@@ -150,4 +146,3 @@ if hasattr(sys, "pypy_version_info"):
 else:
     def garbage_collect():
         gc.collect()
-

{aio-scrapy-2.1.0 → aio-scrapy-2.1.3}/setup.py
@@ -18,13 +18,13 @@ install_requires = [
 extras_require = {
     "all": [
         "aiomysql>=0.1.1", "httpx[http2]>=0.23.0", "aio-pika>=8.1.1",
-        "cryptography", "motor>=
+        "cryptography", "motor>=2.1.0", "pyhttpx>=2.10.1", "asyncpg>=0.27.0",
         "XlsxWriter>=3.1.2", "pillow>=9.4.0", "requests>=2.28.2", "curl_cffi"
     ],
     "aiomysql": ["aiomysql>=0.1.1", "cryptography"],
     "httpx": ["httpx[http2]>=0.23.0"],
     "aio-pika": ["aio-pika>=8.1.1"],
-    "mongo": ["motor>=
+    "mongo": ["motor>=2.1.0"],
     "playwright": ["playwright>=1.31.1"],
     "pyhttpx": ["pyhttpx>=2.10.4"],
     "curl_cffi": ["curl_cffi>=0.6.1"],

aio-scrapy-2.1.0/aioscrapy/VERSION (removed)
@@ -1 +0,0 @@
-2.1.0