aio-scrapy 2.1.4__py3-none-any.whl → 2.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/LICENSE +1 -1
- aio_scrapy-2.1.7.dist-info/METADATA +147 -0
- aio_scrapy-2.1.7.dist-info/RECORD +134 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/WHEEL +1 -1
- aioscrapy/VERSION +1 -1
- aioscrapy/cmdline.py +438 -5
- aioscrapy/core/downloader/__init__.py +522 -17
- aioscrapy/core/downloader/handlers/__init__.py +187 -5
- aioscrapy/core/downloader/handlers/aiohttp.py +190 -6
- aioscrapy/core/downloader/handlers/curl_cffi.py +126 -5
- aioscrapy/core/downloader/handlers/httpx.py +135 -5
- aioscrapy/core/downloader/handlers/pyhttpx.py +137 -5
- aioscrapy/core/downloader/handlers/requests.py +120 -2
- aioscrapy/core/downloader/handlers/webdriver/__init__.py +2 -0
- aioscrapy/core/downloader/handlers/webdriver/drissionpage.py +493 -0
- aioscrapy/core/downloader/handlers/webdriver/driverpool.py +234 -0
- aioscrapy/core/downloader/handlers/webdriver/playwright.py +498 -0
- aioscrapy/core/engine.py +381 -20
- aioscrapy/core/scheduler.py +350 -36
- aioscrapy/core/scraper.py +509 -33
- aioscrapy/crawler.py +392 -10
- aioscrapy/db/__init__.py +149 -0
- aioscrapy/db/absmanager.py +212 -6
- aioscrapy/db/aiomongo.py +292 -10
- aioscrapy/db/aiomysql.py +363 -10
- aioscrapy/db/aiopg.py +299 -2
- aioscrapy/db/aiorabbitmq.py +444 -4
- aioscrapy/db/aioredis.py +260 -11
- aioscrapy/dupefilters/__init__.py +110 -5
- aioscrapy/dupefilters/disk.py +124 -2
- aioscrapy/dupefilters/redis.py +598 -32
- aioscrapy/exceptions.py +151 -13
- aioscrapy/http/__init__.py +1 -1
- aioscrapy/http/headers.py +237 -3
- aioscrapy/http/request/__init__.py +257 -11
- aioscrapy/http/request/form.py +83 -3
- aioscrapy/http/request/json_request.py +121 -9
- aioscrapy/http/response/__init__.py +306 -33
- aioscrapy/http/response/html.py +42 -3
- aioscrapy/http/response/text.py +496 -49
- aioscrapy/http/response/web_driver.py +144 -0
- aioscrapy/http/response/xml.py +45 -3
- aioscrapy/libs/downloader/defaultheaders.py +66 -2
- aioscrapy/libs/downloader/downloadtimeout.py +91 -2
- aioscrapy/libs/downloader/ja3fingerprint.py +95 -2
- aioscrapy/libs/downloader/retry.py +192 -6
- aioscrapy/libs/downloader/stats.py +142 -0
- aioscrapy/libs/downloader/useragent.py +93 -2
- aioscrapy/libs/extensions/closespider.py +166 -4
- aioscrapy/libs/extensions/corestats.py +151 -1
- aioscrapy/libs/extensions/logstats.py +145 -1
- aioscrapy/libs/extensions/metric.py +370 -1
- aioscrapy/libs/extensions/throttle.py +235 -1
- aioscrapy/libs/pipelines/__init__.py +345 -2
- aioscrapy/libs/pipelines/csv.py +242 -0
- aioscrapy/libs/pipelines/excel.py +545 -0
- aioscrapy/libs/pipelines/mongo.py +132 -0
- aioscrapy/libs/pipelines/mysql.py +67 -0
- aioscrapy/libs/pipelines/pg.py +67 -0
- aioscrapy/libs/spider/depth.py +141 -3
- aioscrapy/libs/spider/httperror.py +144 -4
- aioscrapy/libs/spider/offsite.py +202 -2
- aioscrapy/libs/spider/referer.py +396 -21
- aioscrapy/libs/spider/urllength.py +97 -1
- aioscrapy/link.py +115 -8
- aioscrapy/logformatter.py +199 -8
- aioscrapy/middleware/absmanager.py +328 -2
- aioscrapy/middleware/downloader.py +218 -0
- aioscrapy/middleware/extension.py +50 -1
- aioscrapy/middleware/itempipeline.py +96 -0
- aioscrapy/middleware/spider.py +360 -7
- aioscrapy/process.py +200 -0
- aioscrapy/proxy/__init__.py +142 -3
- aioscrapy/proxy/redis.py +136 -2
- aioscrapy/queue/__init__.py +168 -16
- aioscrapy/scrapyd/runner.py +124 -3
- aioscrapy/serializer.py +182 -2
- aioscrapy/settings/__init__.py +610 -128
- aioscrapy/settings/default_settings.py +314 -14
- aioscrapy/signalmanager.py +151 -20
- aioscrapy/signals.py +183 -1
- aioscrapy/spiderloader.py +165 -12
- aioscrapy/spiders/__init__.py +233 -6
- aioscrapy/statscollectors.py +312 -1
- aioscrapy/utils/conf.py +345 -17
- aioscrapy/utils/curl.py +168 -16
- aioscrapy/utils/decorators.py +76 -6
- aioscrapy/utils/deprecate.py +212 -19
- aioscrapy/utils/httpobj.py +55 -3
- aioscrapy/utils/log.py +79 -0
- aioscrapy/utils/misc.py +189 -21
- aioscrapy/utils/ossignal.py +67 -5
- aioscrapy/utils/project.py +165 -3
- aioscrapy/utils/python.py +254 -44
- aioscrapy/utils/reqser.py +75 -1
- aioscrapy/utils/request.py +173 -12
- aioscrapy/utils/response.py +91 -6
- aioscrapy/utils/signal.py +196 -14
- aioscrapy/utils/spider.py +51 -4
- aioscrapy/utils/template.py +93 -6
- aioscrapy/utils/tools.py +191 -17
- aioscrapy/utils/trackref.py +198 -12
- aioscrapy/utils/url.py +341 -36
- aio_scrapy-2.1.4.dist-info/METADATA +0 -239
- aio_scrapy-2.1.4.dist-info/RECORD +0 -133
- aioscrapy/core/downloader/handlers/playwright/__init__.py +0 -115
- aioscrapy/core/downloader/handlers/playwright/driverpool.py +0 -59
- aioscrapy/core/downloader/handlers/playwright/webdriver.py +0 -96
- aioscrapy/http/response/playwright.py +0 -36
- aioscrapy/libs/pipelines/execl.py +0 -169
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/entry_points.txt +0 -0
- {aio_scrapy-2.1.4.dist-info → aio_scrapy-2.1.7.dist-info}/top_level.txt +0 -0
|
@@ -1,239 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: aio-scrapy
|
|
3
|
-
Version: 2.1.4
|
|
4
|
-
Summary: A high-level Web Crawling and Web Scraping framework based on Asyncio
|
|
5
|
-
Home-page: https://github.com/conlin-huang/aio-scrapy.git
|
|
6
|
-
Author: conlin
|
|
7
|
-
Author-email: 995018884@qq.com
|
|
8
|
-
License: MIT
|
|
9
|
-
Keywords: aio-scrapy,scrapy,aioscrapy,scrapy redis,asyncio,spider
|
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
-
Classifier: Intended Audience :: Developers
|
|
12
|
-
Classifier: Operating System :: OS Independent
|
|
13
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
-
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
18
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
-
Requires-Python: >=3.9
|
|
20
|
-
Description-Content-Type: text/markdown
|
|
21
|
-
License-File: LICENSE
|
|
22
|
-
Requires-Dist: aiohttp
|
|
23
|
-
Requires-Dist: ujson
|
|
24
|
-
Requires-Dist: w3lib >=1.17.0
|
|
25
|
-
Requires-Dist: parsel >=1.5.0
|
|
26
|
-
Requires-Dist: PyDispatcher >=2.0.5
|
|
27
|
-
Requires-Dist: zope.interface >=5.1.0
|
|
28
|
-
Requires-Dist: redis >=4.3.1
|
|
29
|
-
Requires-Dist: aiomultiprocess >=0.9.0
|
|
30
|
-
Requires-Dist: loguru >=0.7.0
|
|
31
|
-
Requires-Dist: anyio >=3.6.2
|
|
32
|
-
Provides-Extra: aio-pika
|
|
33
|
-
Requires-Dist: aio-pika >=8.1.1 ; extra == 'aio-pika'
|
|
34
|
-
Provides-Extra: aiomysql
|
|
35
|
-
Requires-Dist: aiomysql >=0.1.1 ; extra == 'aiomysql'
|
|
36
|
-
Requires-Dist: cryptography ; extra == 'aiomysql'
|
|
37
|
-
Provides-Extra: all
|
|
38
|
-
Requires-Dist: aiomysql >=0.1.1 ; extra == 'all'
|
|
39
|
-
Requires-Dist: httpx[http2] >=0.23.0 ; extra == 'all'
|
|
40
|
-
Requires-Dist: aio-pika >=8.1.1 ; extra == 'all'
|
|
41
|
-
Requires-Dist: cryptography ; extra == 'all'
|
|
42
|
-
Requires-Dist: motor >=2.1.0 ; extra == 'all'
|
|
43
|
-
Requires-Dist: pyhttpx >=2.10.1 ; extra == 'all'
|
|
44
|
-
Requires-Dist: asyncpg >=0.27.0 ; extra == 'all'
|
|
45
|
-
Requires-Dist: XlsxWriter >=3.1.2 ; extra == 'all'
|
|
46
|
-
Requires-Dist: pillow >=9.4.0 ; extra == 'all'
|
|
47
|
-
Requires-Dist: requests >=2.28.2 ; extra == 'all'
|
|
48
|
-
Requires-Dist: curl-cffi ; extra == 'all'
|
|
49
|
-
Provides-Extra: curl_cffi
|
|
50
|
-
Requires-Dist: curl-cffi >=0.6.1 ; extra == 'curl_cffi'
|
|
51
|
-
Provides-Extra: execl
|
|
52
|
-
Requires-Dist: XlsxWriter >=3.1.2 ; extra == 'execl'
|
|
53
|
-
Requires-Dist: pillow >=9.4.0 ; extra == 'execl'
|
|
54
|
-
Provides-Extra: httpx
|
|
55
|
-
Requires-Dist: httpx[http2] >=0.23.0 ; extra == 'httpx'
|
|
56
|
-
Provides-Extra: mongo
|
|
57
|
-
Requires-Dist: motor >=2.1.0 ; extra == 'mongo'
|
|
58
|
-
Provides-Extra: pg
|
|
59
|
-
Requires-Dist: asyncpg >=0.27.0 ; extra == 'pg'
|
|
60
|
-
Provides-Extra: playwright
|
|
61
|
-
Requires-Dist: playwright >=1.31.1 ; extra == 'playwright'
|
|
62
|
-
Provides-Extra: pyhttpx
|
|
63
|
-
Requires-Dist: pyhttpx >=2.10.4 ; extra == 'pyhttpx'
|
|
64
|
-
Provides-Extra: requests
|
|
65
|
-
Requires-Dist: requests >=2.28.2 ; extra == 'requests'
|
|
66
|
-
|
|
67
|
-
<!--
|
|
68
|
-

|
|
69
|
-
-->
|
|
70
|
-
### aio-scrapy
|
|
71
|
-
|
|
72
|
-
An asyncio + aiolibs crawler imitate scrapy framework
|
|
73
|
-
|
|
74
|
-
English | [中文](./doc/README_ZH.md)
|
|
75
|
-
|
|
76
|
-
### Overview
|
|
77
|
-
- aio-scrapy framework is base on opensource project Scrapy & scrapy_redis.
|
|
78
|
-
- aio-scrapy implements compatibility with scrapyd.
|
|
79
|
-
- aio-scrapy implements redis queue and rabbitmq queue.
|
|
80
|
-
- aio-scrapy is a fast high-level web crawling and web scraping framework, used to crawl websites and extract structured data from their pages.
|
|
81
|
-
- Distributed crawling/scraping.
|
|
82
|
-
### Requirements
|
|
83
|
-
|
|
84
|
-
- Python 3.9+
|
|
85
|
-
- Works on Linux, Windows, macOS, BSD
|
|
86
|
-
|
|
87
|
-
### Install
|
|
88
|
-
|
|
89
|
-
The quick way:
|
|
90
|
-
|
|
91
|
-
```shell
|
|
92
|
-
# Install the latest aio-scrapy
|
|
93
|
-
pip install git+https://github.com/ConlinH/aio-scrapy
|
|
94
|
-
|
|
95
|
-
# default
|
|
96
|
-
pip install aio-scrapy
|
|
97
|
-
|
|
98
|
-
# Install all dependencies
|
|
99
|
-
pip install aio-scrapy[all]
|
|
100
|
-
|
|
101
|
-
# When you need to use mysql/httpx/rabbitmq/mongo
|
|
102
|
-
pip install aio-scrapy[aiomysql,httpx,aio-pika,mongo]
|
|
103
|
-
```
|
|
104
|
-
|
|
105
|
-
### Usage
|
|
106
|
-
|
|
107
|
-
#### create project spider:
|
|
108
|
-
|
|
109
|
-
```shell
|
|
110
|
-
aioscrapy startproject project_quotes
|
|
111
|
-
```
|
|
112
|
-
|
|
113
|
-
```
|
|
114
|
-
cd project_quotes
|
|
115
|
-
aioscrapy genspider quotes
|
|
116
|
-
```
|
|
117
|
-
|
|
118
|
-
quotes.py
|
|
119
|
-
|
|
120
|
-
```python
|
|
121
|
-
from aioscrapy.spiders import Spider
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
class QuotesMemorySpider(Spider):
|
|
125
|
-
name = 'QuotesMemorySpider'
|
|
126
|
-
|
|
127
|
-
start_urls = ['https://quotes.toscrape.com']
|
|
128
|
-
|
|
129
|
-
async def parse(self, response):
|
|
130
|
-
for quote in response.css('div.quote'):
|
|
131
|
-
yield {
|
|
132
|
-
'author': quote.xpath('span/small/text()').get(),
|
|
133
|
-
'text': quote.css('span.text::text').get(),
|
|
134
|
-
}
|
|
135
|
-
|
|
136
|
-
next_page = response.css('li.next a::attr("href")').get()
|
|
137
|
-
if next_page is not None:
|
|
138
|
-
yield response.follow(next_page, self.parse)
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
if __name__ == '__main__':
|
|
142
|
-
QuotesMemorySpider.start()
|
|
143
|
-
|
|
144
|
-
```
|
|
145
|
-
|
|
146
|
-
run the spider:
|
|
147
|
-
|
|
148
|
-
```shell
|
|
149
|
-
aioscrapy crawl quotes
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
#### create single script spider:
|
|
153
|
-
|
|
154
|
-
```shell
|
|
155
|
-
aioscrapy genspider single_quotes -t single
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
single_quotes.py:
|
|
159
|
-
|
|
160
|
-
```python
|
|
161
|
-
from aioscrapy.spiders import Spider
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
class QuotesMemorySpider(Spider):
|
|
165
|
-
name = 'QuotesMemorySpider'
|
|
166
|
-
custom_settings = {
|
|
167
|
-
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
|
|
168
|
-
'CLOSE_SPIDER_ON_IDLE': True,
|
|
169
|
-
# 'DOWNLOAD_DELAY': 3,
|
|
170
|
-
# 'RANDOMIZE_DOWNLOAD_DELAY': True,
|
|
171
|
-
# 'CONCURRENT_REQUESTS': 1,
|
|
172
|
-
# 'LOG_LEVEL': 'INFO'
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
start_urls = ['https://quotes.toscrape.com']
|
|
176
|
-
|
|
177
|
-
@staticmethod
|
|
178
|
-
async def process_request(request, spider):
|
|
179
|
-
""" request middleware """
|
|
180
|
-
pass
|
|
181
|
-
|
|
182
|
-
@staticmethod
|
|
183
|
-
async def process_response(request, response, spider):
|
|
184
|
-
""" response middleware """
|
|
185
|
-
return response
|
|
186
|
-
|
|
187
|
-
@staticmethod
|
|
188
|
-
async def process_exception(request, exception, spider):
|
|
189
|
-
""" exception middleware """
|
|
190
|
-
pass
|
|
191
|
-
|
|
192
|
-
async def parse(self, response):
|
|
193
|
-
for quote in response.css('div.quote'):
|
|
194
|
-
yield {
|
|
195
|
-
'author': quote.xpath('span/small/text()').get(),
|
|
196
|
-
'text': quote.css('span.text::text').get(),
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
next_page = response.css('li.next a::attr("href")').get()
|
|
200
|
-
if next_page is not None:
|
|
201
|
-
yield response.follow(next_page, self.parse)
|
|
202
|
-
|
|
203
|
-
async def process_item(self, item):
|
|
204
|
-
print(item)
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
if __name__ == '__main__':
|
|
208
|
-
QuotesMemorySpider.start()
|
|
209
|
-
|
|
210
|
-
```
|
|
211
|
-
|
|
212
|
-
run the spider:
|
|
213
|
-
|
|
214
|
-
```shell
|
|
215
|
-
aioscrapy runspider quotes.py
|
|
216
|
-
```
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
### more commands:
|
|
220
|
-
|
|
221
|
-
```shell
|
|
222
|
-
aioscrapy -h
|
|
223
|
-
```
|
|
224
|
-
|
|
225
|
-
#### [more example](./example)
|
|
226
|
-
|
|
227
|
-
### Documentation
|
|
228
|
-
[doc](./doc/documentation.md)
|
|
229
|
-
|
|
230
|
-
### Ready
|
|
231
|
-
|
|
232
|
-
please submit your sugguestion to owner by issue
|
|
233
|
-
|
|
234
|
-
## Thanks
|
|
235
|
-
|
|
236
|
-
[aiohttp](https://github.com/aio-libs/aiohttp/)
|
|
237
|
-
|
|
238
|
-
[scrapy](https://github.com/scrapy/scrapy)
|
|
239
|
-
|
|
@@ -1,133 +0,0 @@
|
|
|
1
|
-
aioscrapy/VERSION,sha256=Z7BD32ByWBAJNuaSjQBe7W_NFoIm-41YzXKXt3z-bUI,5
|
|
2
|
-
aioscrapy/__init__.py,sha256=esJeH66Mz9WV7XbotvZEjNn49jc589YZ_L2DKoD0JvA,858
|
|
3
|
-
aioscrapy/__main__.py,sha256=rvTdJ0cQwbi29aucPj3jJRpccx5SBzvRcV7qvxvX2NQ,80
|
|
4
|
-
aioscrapy/cmdline.py,sha256=1qhNg2Edl-Obmf2re2K4V8pJG7ubGfZZCzcHdKtdE_s,5159
|
|
5
|
-
aioscrapy/crawler.py,sha256=6-ptivIjIGKdojOlZqXV0hV3x1Gont81tOC5u5JqIME,10330
|
|
6
|
-
aioscrapy/exceptions.py,sha256=B1UZUXF_dZNJ5b1wltDemijK8iCNpH-EF2sOooH9AsA,2628
|
|
7
|
-
aioscrapy/link.py,sha256=fXMqsHvYEzsuYi-sNDcElS7jV6Lusq0tjPkPUGOlyZw,1867
|
|
8
|
-
aioscrapy/logformatter.py,sha256=y3etd28ACbpTbcGprJ_cQ086gxQY3k_QX_yxYFoF1AU,3028
|
|
9
|
-
aioscrapy/process.py,sha256=uFkj2wzaBu0Vs3pGFKdJ4R-0Gn7hROX6EU-B5zddnyQ,1603
|
|
10
|
-
aioscrapy/serializer.py,sha256=eTMSMHQZJidJ3-LzjmfJd-pkkG29G3NnrtYUx-eKB8w,734
|
|
11
|
-
aioscrapy/signalmanager.py,sha256=S_dTEa8Y75x8SSHEQem014o-OmxwvfwaMH9w34lCkTc,2402
|
|
12
|
-
aioscrapy/signals.py,sha256=rXkPNS6c9rIv_TzKU3lqYGQc8UA3CbqSsHL9O1U6MTE,610
|
|
13
|
-
aioscrapy/spiderloader.py,sha256=XJ-lUUtf9Xy7172VzlvPzLes-2ym3GRw7lAdJ7sankc,3426
|
|
14
|
-
aioscrapy/statscollectors.py,sha256=L1ykz0zCqt7Qw85NyhWH8zfwVOtnQOWMI0065YP03CQ,2036
|
|
15
|
-
aioscrapy/commands/__init__.py,sha256=hNPzpaYpnX3mxLjQgKM211xAuMgjhxMwjpDRCrvFVwI,4769
|
|
16
|
-
aioscrapy/commands/crawl.py,sha256=fKrQEdR-YJuiXpDIX3R0QdhOgjDwDye7C6ilrj77C1A,1020
|
|
17
|
-
aioscrapy/commands/genspider.py,sha256=381W8-pjbASGXvYjrrgYWorJD9BjURa25oolDm0QI1Q,5426
|
|
18
|
-
aioscrapy/commands/list.py,sha256=j62GHgngGRdW-AGKCJBvrc_h26yj0JmEFBkXTvjRoi8,346
|
|
19
|
-
aioscrapy/commands/runspider.py,sha256=albdJM3SUbWzhiOK1zsWklp3HqGSPRISTshg33EnZ_c,2067
|
|
20
|
-
aioscrapy/commands/settings.py,sha256=sc0rwwfBQNySKX8uV3iJqv3i7SelFwNcrlHYxDupKOg,1798
|
|
21
|
-
aioscrapy/commands/startproject.py,sha256=Rcc7JkN75Jp2t2aZIxBzPsWbLXChNAUSByDhcW_6Ig8,4001
|
|
22
|
-
aioscrapy/commands/version.py,sha256=yqqTMlZkkiQhtbU9w_IqUWLMOAjqYlv24friEkPRQYM,485
|
|
23
|
-
aioscrapy/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
24
|
-
aioscrapy/core/engine.py,sha256=h02-K2lQqlCxvNIlURgPpnhHCbyiJRIWrFJt5Ys7vZY,9843
|
|
25
|
-
aioscrapy/core/scheduler.py,sha256=czCx5oHknXuHadpISTfoEMSKXXrlwJTmLTUQtHdtaTc,7407
|
|
26
|
-
aioscrapy/core/scraper.py,sha256=eS_qEX_Q9fXZnK8Ou1wDtBJhRKk9JoUSnbn4c04u1cA,10750
|
|
27
|
-
aioscrapy/core/downloader/__init__.py,sha256=OCg21payZbmQPcZ1_Wrhhgos7angRB-w9qya3CxrmSU,10040
|
|
28
|
-
aioscrapy/core/downloader/handlers/__init__.py,sha256=KwID2qt3dhFvvBIF3CJnPR4w4a4_qz4uKaXgQI5b59o,3199
|
|
29
|
-
aioscrapy/core/downloader/handlers/aiohttp.py,sha256=qt8Wys8NrbLatBqEob5lzjKmy_C2Nl9XxLyA2npdv6A,4277
|
|
30
|
-
aioscrapy/core/downloader/handlers/curl_cffi.py,sha256=hYlUf2BzS6GrWaPKLJhuqj8fxOt9AANBoeAp9vx7-KU,2590
|
|
31
|
-
aioscrapy/core/downloader/handlers/httpx.py,sha256=HqidohwQr8G7GNhrS1v23rYmD2dzNW69bObcO0X_6Qs,3398
|
|
32
|
-
aioscrapy/core/downloader/handlers/pyhttpx.py,sha256=djxaNoYVD6TJSN3UruviQBx8_oLVtCn4d__qwsoxRJA,2573
|
|
33
|
-
aioscrapy/core/downloader/handlers/requests.py,sha256=RdRi6Izj-jvWa_8T8axW9EzcUfMqfman7eFKTFjOro4,2328
|
|
34
|
-
aioscrapy/core/downloader/handlers/playwright/__init__.py,sha256=xjPNlvM0zzR8lOIzgJeDnq1p0x1VHGhGiyMQmihdkmM,4676
|
|
35
|
-
aioscrapy/core/downloader/handlers/playwright/driverpool.py,sha256=IlkYB8TlSuDq7-sTLlGvtAsFMalNvzpTJR7wEMYe2jE,1595
|
|
36
|
-
aioscrapy/core/downloader/handlers/playwright/webdriver.py,sha256=QFtAT--2Ea_Gg4x1EhMidyOwQjbqljUl4sKGB_hAA00,3530
|
|
37
|
-
aioscrapy/db/__init__.py,sha256=ISBXM_-cCf5CgTLc3i_emLxV163-ZAbgttkQiRxokD0,2456
|
|
38
|
-
aioscrapy/db/absmanager.py,sha256=6vlPcjDHOtZCHePiUYPe6ezRnM-TB4XLhmuw7APaWDk,1162
|
|
39
|
-
aioscrapy/db/aiomongo.py,sha256=t4JpRPBBisF7_rz02Kp6AejrphLvLWg5rF-yYLIe2MI,3071
|
|
40
|
-
aioscrapy/db/aiomysql.py,sha256=-xCLfeH7RzvghY1jqREAb_Qnz9q_dVjxoHGfz7sCqbU,3799
|
|
41
|
-
aioscrapy/db/aiopg.py,sha256=WG4s_2X0b8LQHbZpoIrwZeuGHNolKj-SvmvAZQlCk00,3213
|
|
42
|
-
aioscrapy/db/aiorabbitmq.py,sha256=tNKl4Kx7KM7H_lOj8xfeA0uD8PuBTVzySApTEn5TyAE,5583
|
|
43
|
-
aioscrapy/db/aioredis.py,sha256=UOoTRTQUvghnq29bVL8v1HvksMXYOzHaS8Btgbpn0bY,2966
|
|
44
|
-
aioscrapy/dupefilters/__init__.py,sha256=Dx_CN-wBYiatLj3cXbK0f5d66CTjzzTeex6565L1EsA,1765
|
|
45
|
-
aioscrapy/dupefilters/disk.py,sha256=EMgxeC2a6aYCGKgp4QOs5xwHp33LUsOZ8pliKBTFx1c,1551
|
|
46
|
-
aioscrapy/dupefilters/redis.py,sha256=YUUsnRQ326PjdM_FUWAWjgOWw92KwswAGnM-FmN8pv0,7559
|
|
47
|
-
aioscrapy/http/__init__.py,sha256=yeQTT5W1iwr6dKznTS5d9vnx2hsB47i9roPM57wQp_0,597
|
|
48
|
-
aioscrapy/http/headers.py,sha256=H-RJ6KqOsFFFAXORfvoyz3V-ud0I8TAj5Jt5fAACcLc,1573
|
|
49
|
-
aioscrapy/http/request/__init__.py,sha256=PFoFU3ncTN-gj6Rx01rjVa_744Qfv3EH29mooW6JX9U,7121
|
|
50
|
-
aioscrapy/http/request/form.py,sha256=pqexRCmGlTiE9FofKa3_OpoK6yoJlf9qk9hxNveDoaw,1382
|
|
51
|
-
aioscrapy/http/request/json_request.py,sha256=qtWdF5UhuGy0QmyLKWm9Y86veISkZS5HyOJiavtEhP4,2051
|
|
52
|
-
aioscrapy/http/response/__init__.py,sha256=6DyNQNVgpJ-Awd8Cu9eCTXF_nmQF87WoAPPj5aBrigA,6731
|
|
53
|
-
aioscrapy/http/response/html.py,sha256=PiBguPg2uE1lY_WNjs419EqbZv38pGsZ5JqTUYQWeMU,303
|
|
54
|
-
aioscrapy/http/response/playwright.py,sha256=0DX0L04hGv1zvtXvdYS7_UQFKbnbx2r7Sj-ZKirxtr0,1051
|
|
55
|
-
aioscrapy/http/response/text.py,sha256=VxpjTIGtnVuUepa3GdsyX5kskqnFVCjmZmfonSynbhM,9844
|
|
56
|
-
aioscrapy/http/response/xml.py,sha256=5iXsTuxFW1eBLrzYggjyf-FTDZwWZwY1ATZceIa5yxM,300
|
|
57
|
-
aioscrapy/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
58
|
-
aioscrapy/libs/downloader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
59
|
-
aioscrapy/libs/downloader/defaultheaders.py,sha256=tg_ULA0Y-41bZKG607mowFJQGVfnZ45LdR044DsjA_A,563
|
|
60
|
-
aioscrapy/libs/downloader/downloadtimeout.py,sha256=hNh3OEj7rC0ceQrv_yrhR5lb5AvfxJ6cspj3qsQWj4o,704
|
|
61
|
-
aioscrapy/libs/downloader/ja3fingerprint.py,sha256=DgTw74GXC_Bp94eD_bwoG6A_DphUHTt7bH4glBNXyV8,1058
|
|
62
|
-
aioscrapy/libs/downloader/retry.py,sha256=uKU8XuPya8Co6vTTTgs1-rFtMsZreSwz0Zo1ErgaA6I,4482
|
|
63
|
-
aioscrapy/libs/downloader/stats.py,sha256=FlkS8Zm4j3SBjHb6caXwq08HvvZ37VKORGCAjlA2U38,1376
|
|
64
|
-
aioscrapy/libs/downloader/useragent.py,sha256=E5x5dk9AxsSCGDDICJlTXwWXRkqAibWgesqG0VhAG8M,743
|
|
65
|
-
aioscrapy/libs/extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
|
-
aioscrapy/libs/extensions/closespider.py,sha256=LRv4G7RHwFeT9WeH6s0spO909BHvxdtlvHWLN-dlyno,2734
|
|
67
|
-
aioscrapy/libs/extensions/corestats.py,sha256=WCZ4nnk6LUP7AdGx9mnuVm96iWMxHozxdNPr41r8HmQ,1820
|
|
68
|
-
aioscrapy/libs/extensions/logstats.py,sha256=wSLbN9tmsw5I1FBxHjLfIdQo85fxJI7TmOefispaxc4,1844
|
|
69
|
-
aioscrapy/libs/extensions/metric.py,sha256=cx9UnSdj6akzrPe_uwWHh_QKTNzD82VRrEjiiHOoAuc,5479
|
|
70
|
-
aioscrapy/libs/extensions/throttle.py,sha256=yos2D3XZgH40G52kltMKv5_GeAK4MqpRwTu6FCErUh0,3512
|
|
71
|
-
aioscrapy/libs/pipelines/__init__.py,sha256=XW5Ur6bhvGLo-w-tdUeIB4jkFpZxqUU9mbajfAAztb0,5642
|
|
72
|
-
aioscrapy/libs/pipelines/csv.py,sha256=-PEZOt-3ndF0ePO7EnqjEqeCYMJR9wHv3XcpSq6QswI,2454
|
|
73
|
-
aioscrapy/libs/pipelines/execl.py,sha256=a8sfgQCHUc0MIja9cPP4TZ6ghfkxYZuAzLDIK4_nQuo,6284
|
|
74
|
-
aioscrapy/libs/pipelines/mongo.py,sha256=B3dhvspxc4lmPh2noqARYV-rFuHfivdSfZ7ZlPKnk7c,2323
|
|
75
|
-
aioscrapy/libs/pipelines/mysql.py,sha256=gN4DnyuXTQvDvy9Gu-v8F6sT8l7GZEa45AD0d-Ckv8s,1022
|
|
76
|
-
aioscrapy/libs/pipelines/pg.py,sha256=la-SflXtGFw4IQYlOn75Brw2IfmtOUcCh0gUSz_Jg-0,990
|
|
77
|
-
aioscrapy/libs/spider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
78
|
-
aioscrapy/libs/spider/depth.py,sha256=IZoLS-JAqLl79JrQQf5p8aubN4c82g44M2iUkhRhXjM,1938
|
|
79
|
-
aioscrapy/libs/spider/httperror.py,sha256=v8_zuTpzyU7nszo-97TEE5iniKPdrvKrPKx6bKDtQyI,1857
|
|
80
|
-
aioscrapy/libs/spider/offsite.py,sha256=QQx15cozRPsjIvYobeiIgTeMwgtgDk5yMgf6Y6qT9s4,2941
|
|
81
|
-
aioscrapy/libs/spider/referer.py,sha256=vdbk_uQI4o0MncCHQauVX7xcxERWSqw5ZkaJVVrYcr0,13768
|
|
82
|
-
aioscrapy/libs/spider/urllength.py,sha256=D2sEt-LeibHYfkO1tUJxENeyI1PrwIb7HrLIBNU3HdA,1151
|
|
83
|
-
aioscrapy/middleware/__init__.py,sha256=Ej7BAmxdHDqaytQT1XA3V73CpBedkZBQsRni0fduuck,396
|
|
84
|
-
aioscrapy/middleware/absmanager.py,sha256=dwm4nEyRIjP_V6BJqI-tl9NZF0HwsafDNeWyqZCwSrA,3410
|
|
85
|
-
aioscrapy/middleware/downloader.py,sha256=SSeHXtECbGE5r6o6EXLo9ZIrPX6wqkFTwEoN4F6qHgM,3199
|
|
86
|
-
aioscrapy/middleware/extension.py,sha256=LS6Q9VFYVa9oemS7DiKsyehhz8244alj4Jtlnl0f4DY,420
|
|
87
|
-
aioscrapy/middleware/itempipeline.py,sha256=_Htfrs3vzIUfajTOzQLdGaX_4xTFtSSoFzyhZJMyZw8,674
|
|
88
|
-
aioscrapy/middleware/spider.py,sha256=QvV5dchOlskBn1sXKd5dj6s9zSZmlT6LibydCjfmYjU,6361
|
|
89
|
-
aioscrapy/proxy/__init__.py,sha256=Cwua97Z-ezxtDSlud7mCOAV-iExY7RX_8O1oP5PS__k,1807
|
|
90
|
-
aioscrapy/proxy/redis.py,sha256=LFfnnkihf6Wq1-HeRzPLVEiy5e5wxJbMY7htU-C_Pd8,2711
|
|
91
|
-
aioscrapy/queue/__init__.py,sha256=MKHNOgcZRAjFHAxoKLujvsBCkB_Ne1-gz5DqNbdTYNA,2037
|
|
92
|
-
aioscrapy/queue/memory.py,sha256=_Dvkd-HXwdR8B-wsEPlHNWbAgaMH9M_UGZhF21LbnHA,3140
|
|
93
|
-
aioscrapy/queue/rabbitmq.py,sha256=rE1GCoIGxoaV4KAi_9Umt623A00FaHHIFV5ZH_nypzY,2516
|
|
94
|
-
aioscrapy/queue/redis.py,sha256=KU31ZNciLI9xxZDxsDhtOPLtmkxZQlRPOx_1z8afdwY,4788
|
|
95
|
-
aioscrapy/scrapyd/__init__.py,sha256=Ey14RVLUP7typ2XqP8RWcUum2fuFyigdhuhBBiEheIo,68
|
|
96
|
-
aioscrapy/scrapyd/runner.py,sha256=tewEkdNTMrBoredCbhmdrswSrF-GWsU3MLgC__ntnzQ,1777
|
|
97
|
-
aioscrapy/settings/__init__.py,sha256=GuiVhezV8U2J1B-WJwSvxxeH_1YWYD_Wighr9owC4HU,15781
|
|
98
|
-
aioscrapy/settings/default_settings.py,sha256=PrUOFYNnPIS8eCdqvRylMLBK-4tT-2MYuU6Nn8dQrx0,5639
|
|
99
|
-
aioscrapy/spiders/__init__.py,sha256=oM_FzqWa46P6cjzarOO1cfDTQD2AuIPgaWZrmdMcuTI,4085
|
|
100
|
-
aioscrapy/templates/project/aioscrapy.cfg,sha256=_nRHP5wtPnZaBi7wCmjWv5BgUu5NYFJZhvCTRVSipyM,112
|
|
101
|
-
aioscrapy/templates/project/module/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
102
|
-
aioscrapy/templates/project/module/middlewares.py.tmpl,sha256=0eEf2LC0vYcWPH82HNqieYSORyUuIo3Bgl5t-neRAJ4,3469
|
|
103
|
-
aioscrapy/templates/project/module/pipelines.py.tmpl,sha256=-MYA7MFAffH8FTG1VGAkhIJLQ6MOGMrWVDO7cI9jw9A,164
|
|
104
|
-
aioscrapy/templates/project/module/settings.py.tmpl,sha256=AO2jmyokUhuhFqxMvsMihPgSY4ZrldsMs-BuOEVfvQY,1421
|
|
105
|
-
aioscrapy/templates/project/module/spiders/__init__.py,sha256=Zg1uss1vaNjvld9s9Ccua50SxVZwpFTPwqpBHoCrWdU,164
|
|
106
|
-
aioscrapy/templates/spiders/basic.tmpl,sha256=oO1vh7-TZLjvpwdrYC49TGe-A6Kulc8UIG4Sa0QhDfI,375
|
|
107
|
-
aioscrapy/templates/spiders/single.tmpl,sha256=Ptmo_uFDGEffvpEMyxec7sxIyBbP05x0Grhn5u6lZbQ,1011
|
|
108
|
-
aioscrapy/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
109
|
-
aioscrapy/utils/conf.py,sha256=NkSmKjOE7xVvrAWQu4ne3jOzNGucgZdWHPhGbpz8dPU,7208
|
|
110
|
-
aioscrapy/utils/curl.py,sha256=I8eZWFNgvyUiJ2YS9-s3HltGNVG8XMMU0HPhlMxuxdA,3295
|
|
111
|
-
aioscrapy/utils/decorators.py,sha256=gMQArNxF9QQc1bENA0IqDchAjqmfWvHGKOyUdjXdg6A,794
|
|
112
|
-
aioscrapy/utils/deprecate.py,sha256=STy55Q8kZI8q8CQUfxK4QQVu1Rs4En3rlhzWc7p7T00,5467
|
|
113
|
-
aioscrapy/utils/httpobj.py,sha256=ytec7IZzsQY_GwR___051hdbOWs1ZM6S57HwcNiu2es,708
|
|
114
|
-
aioscrapy/utils/log.py,sha256=NRDivw8w21J77qEUeqqLdC4sgdIKaj2UAP6lDvWGotM,1697
|
|
115
|
-
aioscrapy/utils/misc.py,sha256=9NOssEl7CP_c6R9skxyXwmz4bd-nZ_gkw6F0EybeLTQ,3509
|
|
116
|
-
aioscrapy/utils/ossignal.py,sha256=jAsCIKu17KV45-9dZwEkFJHF31Y13KP_zxY0x49j1jo,896
|
|
117
|
-
aioscrapy/utils/project.py,sha256=cT98HaR5JaNmm-Y1UzSuzXj6B5S7GlmMshUfMhjpjJY,2905
|
|
118
|
-
aioscrapy/utils/python.py,sha256=38oD-OSjeGb3XZFJn3bt74PwGbejnBfLWC5-lkUL0g8,4462
|
|
119
|
-
aioscrapy/utils/reqser.py,sha256=qjrYut6KtvGpLLd-HDM0cncNzWCtXgpH6NyERu_5A9g,487
|
|
120
|
-
aioscrapy/utils/request.py,sha256=bkFaLDeebAOp7pF-7vta9LKOB2OR2s7V9jVKfA-XlqA,2418
|
|
121
|
-
aioscrapy/utils/response.py,sha256=UPR1wTTAYZkLGiiIs28kJLhlF7WPrgLuW31l9LZuYKM,1341
|
|
122
|
-
aioscrapy/utils/signal.py,sha256=bkqRgGMqQ82dly_D4tDe_0pHBbc9QUxBJqSsH9RSQf0,2282
|
|
123
|
-
aioscrapy/utils/spider.py,sha256=Usq3UlCaDUvXGp0ojFt39UPKFrR2rbInlJc_q0Xk7Qc,610
|
|
124
|
-
aioscrapy/utils/template.py,sha256=HR97X4lpv2WuqhuPfzTgaBN66fYnzHVpP6zQ5IoTwcI,833
|
|
125
|
-
aioscrapy/utils/tools.py,sha256=WJowViZB8XEs2CFqjVvbqXK3H5Uvf4BgWgBD_RcHMaM,2319
|
|
126
|
-
aioscrapy/utils/trackref.py,sha256=0nIpelT1d5WYxALl8SGA8vHNYsh-jS0Z2lwVEAhwx8E,2019
|
|
127
|
-
aioscrapy/utils/url.py,sha256=8W8tAhU7lgfPOfzKp3ejJGEcLj1i_PnA_53Jv5LpxiY,5464
|
|
128
|
-
aio_scrapy-2.1.4.dist-info/LICENSE,sha256=L-UoAEM3fQSjKA7FVWxQM7gwSCbeue6gZRAnpRS_UCo,1088
|
|
129
|
-
aio_scrapy-2.1.4.dist-info/METADATA,sha256=9R1Kw1XYe7yrLJ3h4SeiV69tPphz8sTaIf2Sizfh0GU,6536
|
|
130
|
-
aio_scrapy-2.1.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
|
131
|
-
aio_scrapy-2.1.4.dist-info/entry_points.txt,sha256=WWhoVHZvqhW8a5uFg97K0EP_GjG3uuCIFLkyqDICgaw,56
|
|
132
|
-
aio_scrapy-2.1.4.dist-info/top_level.txt,sha256=8l08KyMt22wfX_5BmhrGH0PgwZdzZIPq-hBUa1GNir4,10
|
|
133
|
-
aio_scrapy-2.1.4.dist-info/RECORD,,
|
|
@@ -1,115 +0,0 @@
|
|
|
1
|
-
from functools import wraps
|
|
2
|
-
|
|
3
|
-
try:
|
|
4
|
-
from playwright._impl._errors import Error
|
|
5
|
-
except ImportError:
|
|
6
|
-
from playwright._impl._api_types import Error
|
|
7
|
-
|
|
8
|
-
from playwright.async_api._generated import Response as EventResponse
|
|
9
|
-
|
|
10
|
-
from aioscrapy import Request, Spider
|
|
11
|
-
from aioscrapy.core.downloader.handlers import BaseDownloadHandler
|
|
12
|
-
from aioscrapy.core.downloader.handlers.playwright.driverpool import WebDriverPool
|
|
13
|
-
from aioscrapy.core.downloader.handlers.playwright.webdriver import PlaywrightDriver
|
|
14
|
-
from aioscrapy.exceptions import DownloadError
|
|
15
|
-
from aioscrapy.http import PlaywrightResponse
|
|
16
|
-
from aioscrapy.settings import Settings
|
|
17
|
-
from aioscrapy.utils.tools import call_helper
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
class PlaywrightHandler(BaseDownloadHandler):
|
|
21
|
-
def __init__(self, settings: Settings):
|
|
22
|
-
self.settings = settings
|
|
23
|
-
playwright_client_args = settings.getdict('PLAYWRIGHT_CLIENT_ARGS')
|
|
24
|
-
use_pool = settings.getbool('PLAYWRIGHT_USE_POOL', True)
|
|
25
|
-
self.wait_until = playwright_client_args.get('wait_until', 'domcontentloaded')
|
|
26
|
-
self.url_regexes = playwright_client_args.pop('url_regexes', [])
|
|
27
|
-
pool_size = playwright_client_args.pop('pool_size', settings.getint("CONCURRENT_REQUESTS", 1))
|
|
28
|
-
self._webdriver_pool = WebDriverPool(use_pool=use_pool, pool_size=pool_size, driver_cls=PlaywrightDriver, **playwright_client_args)
|
|
29
|
-
|
|
30
|
-
@classmethod
|
|
31
|
-
def from_settings(cls, settings: Settings):
|
|
32
|
-
return cls(settings)
|
|
33
|
-
|
|
34
|
-
async def download_request(self, request: Request, spider: Spider) -> PlaywrightResponse:
|
|
35
|
-
try:
|
|
36
|
-
return await self._download_request(request, spider)
|
|
37
|
-
except Error as e:
|
|
38
|
-
raise DownloadError(real_error=e) from e
|
|
39
|
-
|
|
40
|
-
async def _download_request(self, request: Request, spider) -> PlaywrightResponse:
|
|
41
|
-
cookies = dict(request.cookies)
|
|
42
|
-
timeout = request.meta.get('download_timeout', 30) * 1000
|
|
43
|
-
user_agent = request.headers.get("User-Agent")
|
|
44
|
-
proxy: str = request.meta.get("proxy")
|
|
45
|
-
url = request.url
|
|
46
|
-
|
|
47
|
-
cache_response = {}
|
|
48
|
-
|
|
49
|
-
# 为了获取监听事件中的响应结果
|
|
50
|
-
def on_event_wrap_handler(func):
|
|
51
|
-
@wraps(func)
|
|
52
|
-
async def inner(response):
|
|
53
|
-
ret = await func(response)
|
|
54
|
-
if ret:
|
|
55
|
-
cache_response[ret[0]] = ret[1]
|
|
56
|
-
|
|
57
|
-
return inner
|
|
58
|
-
|
|
59
|
-
kwargs = dict()
|
|
60
|
-
if proxy:
|
|
61
|
-
kwargs['proxy'] = proxy
|
|
62
|
-
if user_agent:
|
|
63
|
-
kwargs['user_agent'] = user_agent
|
|
64
|
-
|
|
65
|
-
driver: PlaywrightDriver = await self._webdriver_pool.get(**kwargs)
|
|
66
|
-
|
|
67
|
-
# 移除所有的事件监听事件后 重新添加
|
|
68
|
-
driver.page._events = dict()
|
|
69
|
-
for name in dir(spider):
|
|
70
|
-
if not name.startswith('on_event_'):
|
|
71
|
-
continue
|
|
72
|
-
driver.page.on(name.replace('on_event_', ''), on_event_wrap_handler(getattr(spider, name)))
|
|
73
|
-
|
|
74
|
-
try:
|
|
75
|
-
if cookies:
|
|
76
|
-
driver.url = url
|
|
77
|
-
await driver.set_cookies(cookies)
|
|
78
|
-
await driver.page.goto(url, wait_until=request.meta.get('wait_until', self.wait_until), timeout=timeout)
|
|
79
|
-
|
|
80
|
-
if process_action_fn := getattr(spider, 'process_action', None):
|
|
81
|
-
action_result = await call_helper(process_action_fn, driver)
|
|
82
|
-
if action_result:
|
|
83
|
-
cache_response[action_result[0]] = action_result[1]
|
|
84
|
-
|
|
85
|
-
for cache_key in list(cache_response.keys()):
|
|
86
|
-
if isinstance(cache_response[cache_key], EventResponse):
|
|
87
|
-
cache_ret = cache_response[cache_key]
|
|
88
|
-
cache_response[cache_key] = PlaywrightResponse(
|
|
89
|
-
url=cache_ret.url,
|
|
90
|
-
request=request,
|
|
91
|
-
intercept_request=dict(
|
|
92
|
-
url=cache_ret.request.url,
|
|
93
|
-
headers=cache_ret.request.headers,
|
|
94
|
-
data=cache_ret.request.post_data,
|
|
95
|
-
),
|
|
96
|
-
headers=cache_ret.headers,
|
|
97
|
-
body=await cache_ret.body(),
|
|
98
|
-
status=cache_ret.status,
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
return PlaywrightResponse(
|
|
102
|
-
url=driver.page.url,
|
|
103
|
-
status=200,
|
|
104
|
-
text=await driver.page.content(),
|
|
105
|
-
cookies=await driver.get_cookies(),
|
|
106
|
-
cache_response=cache_response,
|
|
107
|
-
driver=driver,
|
|
108
|
-
driver_pool=self._webdriver_pool
|
|
109
|
-
)
|
|
110
|
-
except Exception as e:
|
|
111
|
-
await self._webdriver_pool.remove(driver)
|
|
112
|
-
raise e
|
|
113
|
-
|
|
114
|
-
async def close(self):
|
|
115
|
-
await self._webdriver_pool.close()
|
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
|
-
from asyncio import Lock
|
|
4
|
-
from asyncio.queues import Queue
|
|
5
|
-
|
|
6
|
-
from aioscrapy.utils.tools import singleton
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
@singleton
class WebDriverPool:
    """Bounded pool of reusable webdriver instances.

    When ``use_pool`` is False each :meth:`get` builds a throwaway driver and
    :meth:`release` quits it immediately.  Otherwise drivers are created
    lazily up to ``pool_size`` and recycled through an asyncio queue.
    """

    def __init__(
            self, use_pool=True, pool_size=5, driver_cls=None, **kwargs
    ):
        self.use_pool = use_pool
        self.pool_size = pool_size
        # Factory class; instances must expose async setup() and quit().
        self.driver_cls = driver_cls
        # Default construction kwargs; get() may override them per call.
        self.kwargs = kwargs

        self.queue = Queue(maxsize=pool_size)  # idle drivers ready for reuse
        self.lock = Lock()  # serializes the create-vs-wait decision in get()
        self.driver_count = 0  # pooled drivers alive (idle + checked out)

    @property
    def is_full(self):
        """True once the pool owns ``pool_size`` drivers."""
        return self.driver_count >= self.pool_size

    async def create_driver(self, **args):
        """Instantiate and set up a fresh driver; per-call args win over defaults."""
        kwargs = self.kwargs.copy()
        kwargs.update(args)
        driver = self.driver_cls(**kwargs)
        await driver.setup()
        return driver

    async def get(self, **kwargs):
        """Obtain a driver: new while below capacity, else wait for an idle one.

        NOTE(review): waiting on the queue while holding the lock serializes
        all getters until release() puts a driver back — presumably intended
        throttling; confirm before changing.
        """
        async with self.lock:
            if not self.use_pool:
                return await self.create_driver(**kwargs)
            if not self.is_full:
                driver = await self.create_driver(**kwargs)
                self.driver_count += 1
            else:
                driver = await self.queue.get()
            return driver

    async def release(self, driver):
        """Return a driver to the pool (or quit it when pooling is disabled)."""
        if not self.use_pool:
            await driver.quit()
            return
        await self.queue.put(driver)

    async def remove(self, driver):
        """Permanently discard a (possibly broken) driver.

        Bugfix: only pooled drivers are counted in get(), so only they may
        decrement the count.  The original decremented unconditionally, which
        drove ``driver_count`` negative for non-pooled drivers and skewed
        ``is_full``.
        """
        await driver.quit()
        if self.use_pool:
            self.driver_count -= 1

    async def close(self):
        """Quit every idle driver and drop it from the count.

        NOTE(review): drivers currently checked out are not quit here; they
        are only cleaned up when released or removed.
        """
        while not self.queue.empty():
            driver = await self.queue.get()
            await driver.quit()
            self.driver_count -= 1
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
from typing import Dict, Optional, Tuple
|
|
5
|
-
|
|
6
|
-
try:
|
|
7
|
-
from typing import Literal # python >= 3.8
|
|
8
|
-
except ImportError: # python <3.8
|
|
9
|
-
from typing_extensions import Literal
|
|
10
|
-
|
|
11
|
-
from urllib.parse import urlparse, urlunparse
|
|
12
|
-
|
|
13
|
-
from playwright.async_api import Page, BrowserContext, ViewportSize, ProxySettings
|
|
14
|
-
from playwright.async_api import Playwright, Browser
|
|
15
|
-
from playwright.async_api import async_playwright
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class PlaywrightDriver:
    """Async wrapper owning one playwright browser/context/page trio.

    ``setup()`` must be awaited before use; ``quit()`` releases every
    playwright resource the driver created.
    """

    def __init__(
            self,
            *,
            driver_type: Literal["chromium", "firefox", "webkit"] = "chromium",
            proxy: Optional[str] = None,
            browser_args: Optional[Dict] = None,
            context_args: Optional[Dict] = None,
            window_size: Optional[Tuple[int, int]] = None,
            user_agent: Optional[str] = None,
            **kwargs
    ):

        self.driver_type = driver_type
        # Normalize "scheme://user:pass@host:port" into playwright ProxySettings.
        self.proxy = proxy and self.format_context_proxy(proxy)
        self.viewport = window_size and ViewportSize(width=window_size[0], height=window_size[1])
        self.browser_args = browser_args or {}
        self.context_args = context_args or {}
        self.user_agent = user_agent

        self.driver: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        # Last URL explicitly assigned; used as the cookie-scope fallback.
        self.url = None

    async def setup(self):
        """Start playwright and create the browser, context and page."""
        browser_args = self.browser_args.copy()
        context_args = self.context_args.copy()
        if browser_args.get('args') is None:
            browser_args.update({'args': ["--no-sandbox"]})

        # Ensure the directory for a persisted storage_state file exists.
        if context_args.get("storage_state") is not None:
            storage_state_path = context_args.get("storage_state")
            os.makedirs(os.path.dirname(storage_state_path), exist_ok=True)

        if self.proxy:
            browser_args.update({'proxy': self.proxy})
            context_args.update({'proxy': self.proxy})
        if self.viewport:
            context_args.update({"viewport": self.viewport})
            context_args.update({"screen": self.viewport})
        if self.user_agent:
            context_args.update({'user_agent': self.user_agent})

        self.driver = await async_playwright().start()
        self.browser: Browser = await getattr(self.driver, self.driver_type).launch(**browser_args)
        self.context = await self.browser.new_context(**context_args)
        self.page = await self.context.new_page()

    @staticmethod
    def format_context_proxy(proxy) -> ProxySettings:
        """Split credentials out of a proxy URL into ProxySettings fields."""
        parsed_url = urlparse(proxy)
        return ProxySettings(
            # Strip "user:pass@" from the netloc; playwright takes credentials separately.
            server=urlunparse(parsed_url._replace(netloc=parsed_url.netloc.split('@')[-1])),
            username=parsed_url.username,
            password=parsed_url.password,
        )

    async def quit(self):
        """Close page, context and browser, then stop playwright.

        Bugfix: the original closed the page outside any try block — a
        failure there leaked the browser and the playwright process — and
        used a bare ``except:`` that also swallowed
        ``asyncio.CancelledError``.  Each teardown step now runs even when an
        earlier one fails, and only ``Exception`` is suppressed.
        """
        try:
            await self.page.close()
        except Exception:
            pass
        try:
            await self.context.close()
        except Exception:
            pass
        finally:
            await self.browser.close()
            await self.driver.stop()

    async def get_cookies(self):
        """Return the context's cookies as a plain name -> value dict."""
        return {
            cookie["name"]: cookie["value"]
            for cookie in await self.page.context.cookies()
        }

    async def set_cookies(self, cookies: dict):
        """Install name -> value cookies, scoped to the last known URL."""
        await self.page.context.add_cookies([
            {"name": key, "value": value, "url": self.url or self.page.url} for key, value in cookies.items()
        ])
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
from typing import Optional, Any
|
|
2
|
-
|
|
3
|
-
from aioscrapy.http.response.text import TextResponse
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class PlaywrightResponse(TextResponse):
    """TextResponse variant produced by the playwright download handler.

    Carries the rendered page text, responses captured while handling page
    events, and (optionally) the live driver so it can be handed back to its
    pool once the spider is done with it.
    """

    def __init__(
            self,
            *args,
            text: str = '',
            cache_response: Optional[dict] = None,
            driver: Optional["PlaywrightDriver"] = None,
            driver_pool: Optional["WebDriverPool"] = None,
            intercept_request: Optional[dict] = None,
            **kwargs
    ):
        self.driver = driver
        self.driver_pool = driver_pool
        self._text = text
        self.cache_response = cache_response if cache_response else {}
        self.intercept_request = intercept_request
        super().__init__(*args, **kwargs)

    async def release(self):
        """Hand the driver back to its pool when both are present."""
        if self.driver_pool and self.driver:
            await self.driver_pool.release(self.driver)

    @property
    def text(self):
        # Prefer the page text captured by the driver; fall back to the body.
        if self._text:
            return self._text
        return super().text

    @text.setter
    def text(self, text):
        self._text = text

    def get_response(self, key) -> Any:
        """Look up a cached event response by key (None when absent)."""
        return self.cache_response.get(key)
|