aio-scrapy 2.1.7__tar.gz → 2.1.9__tar.gz
This diff compares the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- {aio_scrapy-2.1.7/aio_scrapy.egg-info → aio_scrapy-2.1.9}/PKG-INFO +54 -4
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/README.md +45 -2
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9/aio_scrapy.egg-info}/PKG-INFO +54 -4
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/SOURCES.txt +2 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/requires.txt +8 -0
- aio_scrapy-2.1.9/aioscrapy/VERSION +1 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/curl_cffi.py +13 -3
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/__init__.py +1 -0
- aio_scrapy-2.1.9/aioscrapy/core/downloader/handlers/webdriver/sbcdp.py +404 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/request/__init__.py +89 -5
- aio_scrapy-2.1.9/aioscrapy/libs/pipelines/redis.py +122 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/settings/default_settings.py +7 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/setup.py +4 -1
- aio_scrapy-2.1.7/aioscrapy/VERSION +0 -1
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/LICENSE +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/MANIFEST.in +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/dependency_links.txt +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/entry_points.txt +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/not-zip-safe +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/top_level.txt +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/__main__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/cmdline.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/commands/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/commands/crawl.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/commands/genspider.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/commands/list.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/commands/runspider.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/commands/settings.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/commands/startproject.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/commands/version.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/aiohttp.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/httpx.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/pyhttpx.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/requests.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/driverpool.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/webdriver/playwright.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/engine.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/scheduler.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/scraper.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/crawler.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/db/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/db/absmanager.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/db/aiomongo.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/db/aiomysql.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/db/aiopg.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/db/aiorabbitmq.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/db/aioredis.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/dupefilters/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/dupefilters/disk.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/dupefilters/redis.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/exceptions.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/headers.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/request/form.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/request/json_request.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/response/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/response/html.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/response/text.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/response/web_driver.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/http/response/xml.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/defaultheaders.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/downloadtimeout.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/ja3fingerprint.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/retry.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/stats.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/downloader/useragent.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/closespider.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/corestats.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/logstats.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/metric.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/extensions/throttle.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/csv.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/excel.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/mongo.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/mysql.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/pipelines/pg.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/depth.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/httperror.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/offsite.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/referer.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/libs/spider/urllength.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/link.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/logformatter.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/middleware/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/middleware/absmanager.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/middleware/downloader.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/middleware/extension.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/middleware/itempipeline.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/middleware/spider.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/process.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/proxy/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/proxy/redis.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/queue/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/queue/memory.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/queue/rabbitmq.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/queue/redis.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/scrapyd/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/scrapyd/runner.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/serializer.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/settings/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/signalmanager.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/signals.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/spiderloader.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/spiders/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/statscollectors.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/templates/project/aioscrapy.cfg +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/middlewares.py.tmpl +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/pipelines.py.tmpl +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/settings.py.tmpl +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/templates/project/module/spiders/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/templates/spiders/basic.tmpl +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/templates/spiders/single.tmpl +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/__init__.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/conf.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/curl.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/decorators.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/deprecate.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/httpobj.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/log.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/misc.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/ossignal.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/project.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/python.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/reqser.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/request.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/response.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/signal.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/spider.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/template.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/tools.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/trackref.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/utils/url.py +0 -0
- {aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/setup.cfg +0 -0

{aio_scrapy-2.1.7/aio_scrapy.egg-info → aio_scrapy-2.1.9}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: aio-scrapy
-Version: 2.1.7
+Version: 2.1.9
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
@@ -41,6 +41,8 @@ Requires-Dist: XlsxWriter>=3.1.2; extra == "all"
 Requires-Dist: pillow>=9.4.0; extra == "all"
 Requires-Dist: requests>=2.28.2; extra == "all"
 Requires-Dist: curl_cffi; extra == "all"
+Requires-Dist: sbcdp; extra == "all"
+Requires-Dist: DrissionPage; extra == "all"
 Provides-Extra: aiomysql
 Requires-Dist: aiomysql>=0.1.1; extra == "aiomysql"
 Requires-Dist: cryptography; extra == "aiomysql"
@@ -52,6 +54,10 @@ Provides-Extra: mongo
 Requires-Dist: motor>=2.1.0; extra == "mongo"
 Provides-Extra: playwright
 Requires-Dist: playwright>=1.31.1; extra == "playwright"
+Provides-Extra: sbcdp
+Requires-Dist: sbcdp; extra == "sbcdp"
+Provides-Extra: dp
+Requires-Dist: DrissionPage; extra == "dp"
 Provides-Extra: pyhttpx
 Requires-Dist: pyhttpx>=2.10.4; extra == "pyhttpx"
 Provides-Extra: curl-cffi
@@ -71,6 +77,7 @@ Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: keywords
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
@@ -84,7 +91,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
 ## 特性 | Features

 - **完全异步**:基于Python的asyncio库,实现高效的并发爬取
-- **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage和
+- **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage、playwright和sbcdp
 - **灵活的中间件系统**:轻松添加自定义功能和处理逻辑
 - **强大的数据处理管道**:支持多种数据库存储选项
 - **内置信号系统**:方便的事件处理机制
@@ -94,7 +101,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as


 - **Fully Asynchronous**: Built on Python's asyncio for efficient concurrent crawling
-- **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage and
+- **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage, playwright and sbcdp
 - **Flexible Middleware System**: Easily add custom functionality and processing logic
 - **Powerful Data Processing Pipelines**: Support for various database storage options
 - **Built-in Signal System**: Convenient event handling mechanism
@@ -117,6 +124,49 @@ pip install aio-scrapy
 # pip install git+https://github.com/ConlinH/aio-scrapy
 ```

+### 开始 | Start
+```python
+from aioscrapy import Spider, logger
+
+
+class MyspiderSpider(Spider):
+    name = 'myspider'
+    custom_settings = {
+        "CLOSE_SPIDER_ON_IDLE": True
+    }
+    start_urls = ["https://quotes.toscrape.com"]
+
+    @staticmethod
+    async def process_request(request, spider):
+        """ request middleware """
+        pass
+
+    @staticmethod
+    async def process_response(request, response, spider):
+        """ response middleware """
+        return response
+
+    @staticmethod
+    async def process_exception(request, exception, spider):
+        """ exception middleware """
+        pass
+
+    async def parse(self, response):
+        for quote in response.css('div.quote'):
+            item = {
+                'author': quote.xpath('span/small/text()').get(),
+                'text': quote.css('span.text::text').get(),
+            }
+            yield item
+
+    async def process_item(self, item):
+        logger.info(item)
+
+
+if __name__ == '__main__':
+    MyspiderSpider.start()
+```
+
 ## 文档 | Documentation

 ## 文档目录 | Documentation Contents

{aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/README.md

@@ -6,7 +6,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
 ## 特性 | Features

 - **完全异步**:基于Python的asyncio库,实现高效的并发爬取
-- **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage和
+- **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage、playwright和sbcdp
 - **灵活的中间件系统**:轻松添加自定义功能和处理逻辑
 - **强大的数据处理管道**:支持多种数据库存储选项
 - **内置信号系统**:方便的事件处理机制
@@ -16,7 +16,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as


 - **Fully Asynchronous**: Built on Python's asyncio for efficient concurrent crawling
-- **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage and
+- **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage, playwright and sbcdp
 - **Flexible Middleware System**: Easily add custom functionality and processing logic
 - **Powerful Data Processing Pipelines**: Support for various database storage options
 - **Built-in Signal System**: Convenient event handling mechanism
@@ -39,6 +39,49 @@ pip install aio-scrapy
 # pip install git+https://github.com/ConlinH/aio-scrapy
 ```

+### 开始 | Start
+```python
+from aioscrapy import Spider, logger
+
+
+class MyspiderSpider(Spider):
+    name = 'myspider'
+    custom_settings = {
+        "CLOSE_SPIDER_ON_IDLE": True
+    }
+    start_urls = ["https://quotes.toscrape.com"]
+
+    @staticmethod
+    async def process_request(request, spider):
+        """ request middleware """
+        pass
+
+    @staticmethod
+    async def process_response(request, response, spider):
+        """ response middleware """
+        return response
+
+    @staticmethod
+    async def process_exception(request, exception, spider):
+        """ exception middleware """
+        pass
+
+    async def parse(self, response):
+        for quote in response.css('div.quote'):
+            item = {
+                'author': quote.xpath('span/small/text()').get(),
+                'text': quote.css('span.text::text').get(),
+            }
+            yield item
+
+    async def process_item(self, item):
+        logger.info(item)
+
+
+if __name__ == '__main__':
+    MyspiderSpider.start()
+```
+
 ## 文档 | Documentation

 ## 文档目录 | Documentation Contents
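
One detail worth noting in the quick-start added above: `"CLOSE_SPIDER_ON_IDLE": True` presumably tells the engine to shut the spider down once the request queue drains; leaving it at its default keeps an idle spider alive, which suits aio-scrapy's distributed (e.g. redis-queue) deployments.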

{aio_scrapy-2.1.7 → aio_scrapy-2.1.9/aio_scrapy.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: aio-scrapy
-Version: 2.1.7
+Version: 2.1.9
 Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
 Home-page: https://github.com/conlin-huang/aio-scrapy.git
 Author: conlin
@@ -41,6 +41,8 @@ Requires-Dist: XlsxWriter>=3.1.2; extra == "all"
 Requires-Dist: pillow>=9.4.0; extra == "all"
 Requires-Dist: requests>=2.28.2; extra == "all"
 Requires-Dist: curl_cffi; extra == "all"
+Requires-Dist: sbcdp; extra == "all"
+Requires-Dist: DrissionPage; extra == "all"
 Provides-Extra: aiomysql
 Requires-Dist: aiomysql>=0.1.1; extra == "aiomysql"
 Requires-Dist: cryptography; extra == "aiomysql"
@@ -52,6 +54,10 @@ Provides-Extra: mongo
 Requires-Dist: motor>=2.1.0; extra == "mongo"
 Provides-Extra: playwright
 Requires-Dist: playwright>=1.31.1; extra == "playwright"
+Provides-Extra: sbcdp
+Requires-Dist: sbcdp; extra == "sbcdp"
+Provides-Extra: dp
+Requires-Dist: DrissionPage; extra == "dp"
 Provides-Extra: pyhttpx
 Requires-Dist: pyhttpx>=2.10.4; extra == "pyhttpx"
 Provides-Extra: curl-cffi
@@ -71,6 +77,7 @@ Dynamic: description-content-type
 Dynamic: home-page
 Dynamic: keywords
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
 Dynamic: requires-python
@@ -84,7 +91,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as
 ## 特性 | Features

 - **完全异步**:基于Python的asyncio库,实现高效的并发爬取
-- **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage和
+- **多种下载处理程序**:支持多种HTTP客户端,包括aiohttp、httpx、requests、pyhttpx、curl_cffi、DrissionPage、playwright和sbcdp
 - **灵活的中间件系统**:轻松添加自定义功能和处理逻辑
 - **强大的数据处理管道**:支持多种数据库存储选项
 - **内置信号系统**:方便的事件处理机制
@@ -94,7 +101,7 @@ AioScrapy is a powerful asynchronous web crawling framework built on Python's as


 - **Fully Asynchronous**: Built on Python's asyncio for efficient concurrent crawling
-- **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage and
+- **Multiple Download Handlers**: Support for various HTTP clients including aiohttp, httpx, requests, pyhttpx, curl_cffi, DrissionPage, playwright and sbcdp
 - **Flexible Middleware System**: Easily add custom functionality and processing logic
 - **Powerful Data Processing Pipelines**: Support for various database storage options
 - **Built-in Signal System**: Convenient event handling mechanism
@@ -117,6 +124,49 @@ pip install aio-scrapy
 # pip install git+https://github.com/ConlinH/aio-scrapy
 ```

+### 开始 | Start
+```python
+from aioscrapy import Spider, logger
+
+
+class MyspiderSpider(Spider):
+    name = 'myspider'
+    custom_settings = {
+        "CLOSE_SPIDER_ON_IDLE": True
+    }
+    start_urls = ["https://quotes.toscrape.com"]
+
+    @staticmethod
+    async def process_request(request, spider):
+        """ request middleware """
+        pass
+
+    @staticmethod
+    async def process_response(request, response, spider):
+        """ response middleware """
+        return response
+
+    @staticmethod
+    async def process_exception(request, exception, spider):
+        """ exception middleware """
+        pass
+
+    async def parse(self, response):
+        for quote in response.css('div.quote'):
+            item = {
+                'author': quote.xpath('span/small/text()').get(),
+                'text': quote.css('span.text::text').get(),
+            }
+            yield item
+
+    async def process_item(self, item):
+        logger.info(item)
+
+
+if __name__ == '__main__':
+    MyspiderSpider.start()
+```
+
 ## 文档 | Documentation

 ## 文档目录 | Documentation Contents

{aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/SOURCES.txt

@@ -46,6 +46,7 @@ aioscrapy/core/downloader/handlers/webdriver/__init__.py
 aioscrapy/core/downloader/handlers/webdriver/drissionpage.py
 aioscrapy/core/downloader/handlers/webdriver/driverpool.py
 aioscrapy/core/downloader/handlers/webdriver/playwright.py
+aioscrapy/core/downloader/handlers/webdriver/sbcdp.py
 aioscrapy/db/__init__.py
 aioscrapy/db/absmanager.py
 aioscrapy/db/aiomongo.py
@@ -86,6 +87,7 @@ aioscrapy/libs/pipelines/excel.py
 aioscrapy/libs/pipelines/mongo.py
 aioscrapy/libs/pipelines/mysql.py
 aioscrapy/libs/pipelines/pg.py
+aioscrapy/libs/pipelines/redis.py
 aioscrapy/libs/spider/__init__.py
 aioscrapy/libs/spider/depth.py
 aioscrapy/libs/spider/httperror.py

{aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aio_scrapy.egg-info/requires.txt

@@ -28,10 +28,15 @@ XlsxWriter>=3.1.2
 pillow>=9.4.0
 requests>=2.28.2
 curl_cffi
+sbcdp
+DrissionPage

 [curl_cffi]
 curl_cffi>=0.6.1

+[dp]
+DrissionPage
+
 [execl]
 XlsxWriter>=3.1.2
 pillow>=9.4.0
@@ -53,3 +58,6 @@ pyhttpx>=2.10.4

 [requests]
 requests>=2.28.2
+
+[sbcdp]
+sbcdp
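
Together with the additions to the `all` extra above, the two new extras mirror the new webdriver handlers; individually they can presumably be installed as `pip install aio-scrapy[sbcdp]` or `pip install aio-scrapy[dp]`.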

aio_scrapy-2.1.9/aioscrapy/VERSION

@@ -0,0 +1 @@
+2.1.9

{aio_scrapy-2.1.7 → aio_scrapy-2.1.9}/aioscrapy/core/downloader/handlers/curl_cffi.py

@@ -8,8 +8,9 @@ It supports features like browser impersonation, proxies, and cookies.
 它支持浏览器模拟、代理和Cookie等功能。
 """

+import asyncio
 from curl_cffi.curl import CurlError
-from curl_cffi.requests import AsyncSession
+from curl_cffi.requests import AsyncSession, Session

 from aioscrapy import Request
 from aioscrapy.core.downloader.handlers import BaseDownloadHandler
@@ -50,6 +51,9 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):
         # SSL验证设置
         self.verify_ssl: bool = self.settings.get("VERIFY_SSL", True)

+        # 是否在线程中执行
+        self.use_thread: bool = self.settings.get("USE_THREAD", False)
+
     @classmethod
     def from_settings(cls, settings: Settings):
         """
@@ -160,8 +164,14 @@ class CurlCffiDownloadHandler(BaseDownloadHandler):

         # Perform the request
         # 执行请求
-        async with AsyncSession(**session_args) as session:
-            response = await session.request(request.method, request.url, **kwargs)
+        if self.use_thread:
+            with Session(**session_args) as session:
+                # Run the synchronous curl-cffi request in a thread pool
+                # 在线程池中运行同步的curl-cffi请求
+                response = await asyncio.to_thread(session.request, request.method, request.url, **kwargs)
+        else:
+            async with AsyncSession(**session_args) as session:
+                response = await session.request(request.method, request.url, **kwargs)

         # Convert curl_cffi response to HtmlResponse
         # 将curl_cffi响应转换为HtmlResponse