scrapy-seleniumbase-cdp 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapy_seleniumbase_cdp-0.0.1/.github/workflows/publish.yml +21 -0
- scrapy_seleniumbase_cdp-0.0.1/.github/workflows/release.yml +41 -0
- scrapy_seleniumbase_cdp-0.0.1/.gitignore +10 -0
- scrapy_seleniumbase_cdp-0.0.1/LICENSE +22 -0
- scrapy_seleniumbase_cdp-0.0.1/PKG-INFO +161 -0
- scrapy_seleniumbase_cdp-0.0.1/README.md +146 -0
- scrapy_seleniumbase_cdp-0.0.1/pyproject.toml +24 -0
- scrapy_seleniumbase_cdp-0.0.1/scrapy_seleniumbase_cdp/__init__.py +4 -0
- scrapy_seleniumbase_cdp-0.0.1/scrapy_seleniumbase_cdp/middleware_async.py +75 -0
- scrapy_seleniumbase_cdp-0.0.1/scrapy_seleniumbase_cdp/request.py +43 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [ published ]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
permissions:
|
|
11
|
+
id-token: write
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- name: Download release assets
|
|
15
|
+
run: |
|
|
16
|
+
gh release download ${{ github.event.release.tag_name }} -D dist -R ${{ github.repository }}
|
|
17
|
+
env:
|
|
18
|
+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
19
|
+
|
|
20
|
+
- name: Publish to PyPI
|
|
21
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: Create GitHub release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- v[0-9]+.[0-9]+.[0-9]+
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
create_release:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
permissions:
|
|
12
|
+
contents: write
|
|
13
|
+
|
|
14
|
+
steps:
|
|
15
|
+
- name: Checkout repository
|
|
16
|
+
uses: actions/checkout@v4
|
|
17
|
+
with:
|
|
18
|
+
fetch-depth: 0
|
|
19
|
+
|
|
20
|
+
- name: Set up Python
|
|
21
|
+
uses: actions/setup-python@v5
|
|
22
|
+
with:
|
|
23
|
+
python-version: '3.x'
|
|
24
|
+
|
|
25
|
+
- name: Install build tools
|
|
26
|
+
run: |
|
|
27
|
+
python -m pip install --upgrade pip
|
|
28
|
+
pip install build
|
|
29
|
+
|
|
30
|
+
- name: Build package
|
|
31
|
+
run: python -m build
|
|
32
|
+
|
|
33
|
+
- name: Generate changelog
|
|
34
|
+
uses: orhun/git-cliff-action@v4
|
|
35
|
+
with:
|
|
36
|
+
args: -vv --latest --strip all
|
|
37
|
+
|
|
38
|
+
- name: Create release
|
|
39
|
+
run: gh release create ${{ github.ref_name }} -F git-cliff/CHANGELOG.md dist/*
|
|
40
|
+
env:
|
|
41
|
+
GH_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Original code Copyright (c) Quartz-Core
|
|
4
|
+
Modifications Copyright (c) nyg
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scrapy-seleniumbase-cdp
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Scrapy downloader middleware that uses SeleniumBase's pure CDP mode to make requests.
|
|
5
|
+
Project-URL: Homepage, https://github.com/nyg/scrapy-seleniumbase-cdp
|
|
6
|
+
Project-URL: Issues, https://github.com/nyg/scrapy-seleniumbase-cdp/issues
|
|
7
|
+
Author: nyg
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Requires-Python: >=3.8
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# scrapy-selenium-cdp
|
|
17
|
+
|
|
18
|
+
Scrapy downloader middleware that uses SeleniumBase's pure CDP mode to make
|
|
19
|
+
requests, allowing to bypass most anti-bot protections (e.g. CloudFlare).
|
|
20
|
+
|
|
21
|
+
Using SeleniumBase's pure CDP mode also makes the middleware more platform-independent
|
|
22
|
+
as no WebDriver is required.
|
|
23
|
+
|
|
24
|
+
🚧 Work in progress 🚧
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
pip install git+https://github.com/nyg/scrapy-seleniumbase-cdp
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Configuration
|
|
33
|
+
|
|
34
|
+
1. Provide keyword arguments for Driver in dict. For example:
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
SELENIUMBASE_DRIVER_KWARGS = {
|
|
38
|
+
"browser": "chrome",
|
|
39
|
+
"uc": True,
|
|
40
|
+
"headless": True,
|
|
41
|
+
"block_images": True,
|
|
42
|
+
}
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
2. Add the `SeleniumBaseMiddleware` to the downloader middlewares:
|
|
46
|
+
```python
|
|
47
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
48
|
+
'scrapy_seleniumbase.SeleniumBaseMiddleware': 800
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
Use the `scrapy_seleniumbase.SeleniumBaseRequest` instead of the scrapy built-in
|
|
55
|
+
`Request` like below:
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from scrapy_seleniumbase import SeleniumBaseRequest
|
|
59
|
+
|
|
60
|
+
yield SeleniumBaseRequest(url=url, callback=self.parse_result)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The request will be handled by seleniumbase, and the request will have an
|
|
64
|
+
additional `meta` key, named `driver` containing the seleniumbase driver with
|
|
65
|
+
the request processed.
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
def parse_result(self, response):
|
|
69
|
+
print(response.request.meta['driver'].title)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
For more information about the available driver methods and attributes, refer to
|
|
73
|
+
the [selenium python documentation][1] (all vanilla selenium driver methods are
|
|
74
|
+
available) and [seleniumbase documentation][2] (look for "driver" specific
|
|
75
|
+
methods, located at the end of the page).
|
|
76
|
+
|
|
77
|
+
The `selector` response attribute works as usual (but contains the html processed
|
|
78
|
+
by the selenium driver).
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
def parse_result(self, response):
|
|
82
|
+
print(response.selector.xpath('//title/@text'))
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Additional arguments
|
|
86
|
+
|
|
87
|
+
The `scrapy_selenium.SeleniumBaseRequest` accept 5 additional arguments:
|
|
88
|
+
|
|
89
|
+
#### `wait_time` / `wait_until`
|
|
90
|
+
|
|
91
|
+
When used, webdriver will perform an [explicit wait][3] before returning the
|
|
92
|
+
response to the spider.
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from selenium.webdriver.common.by import By
|
|
96
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
97
|
+
|
|
98
|
+
yield SeleniumBaseRequest(
|
|
99
|
+
url=url,
|
|
100
|
+
callback=self.parse_result,
|
|
101
|
+
wait_time=10,
|
|
102
|
+
wait_until=EC.element_to_be_clickable((By.ID, 'someid'))
|
|
103
|
+
)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
#### `screenshot`
|
|
107
|
+
|
|
108
|
+
When used, webdriver will take a screenshot of the page and the binary data of
|
|
109
|
+
the .png captured will be added to the response `meta`:
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
yield SeleniumBaseRequest(
|
|
113
|
+
url=url,
|
|
114
|
+
callback=self.parse_result,
|
|
115
|
+
screenshot=True
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def parse_result(self, response):
|
|
120
|
+
with open('image.png', 'wb') as image_file:
|
|
121
|
+
image_file.write(response.meta['screenshot'])
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
#### `script`
|
|
125
|
+
|
|
126
|
+
When used, webdriver will execute custom JavaScript code.
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
yield SeleniumBaseRequest(
|
|
130
|
+
url=url,
|
|
131
|
+
callback=self.parse_result,
|
|
132
|
+
script='window.scrollTo(0, document.body.scrollHeight);',
|
|
133
|
+
)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
#### `driver_methods`
|
|
137
|
+
|
|
138
|
+
When used, seleniumbase webdriver will execute methods, provided as strings in a
|
|
139
|
+
list, before returning page's html.
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
def start_requests(self):
|
|
143
|
+
for url in self.start_urls:
|
|
144
|
+
yield SeleniumBaseRequest(
|
|
145
|
+
url=url,
|
|
146
|
+
driver_methods=['''.find_element("xpath","some_xpath").click()'''])
|
|
147
|
+
|
|
148
|
+
)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## License
|
|
152
|
+
|
|
153
|
+
This project is licensed under the MIT License. It is a fork
|
|
154
|
+
of [Quartz-Core/scrapy-seleniumbase](https://github.com/Quartz-Core/scrapy-seleniumbase)
|
|
155
|
+
which was originally released under the WTFPL.
|
|
156
|
+
|
|
157
|
+
[1]: http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webdriver
|
|
158
|
+
|
|
159
|
+
[2]: https://seleniumbase.io/help_docs/method_summary/#seleniumbase-methods-api-reference
|
|
160
|
+
|
|
161
|
+
[3]: http://selenium-python.readthedocs.io/waits.html#explicit-waits
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# scrapy-selenium-cdp
|
|
2
|
+
|
|
3
|
+
Scrapy downloader middleware that uses SeleniumBase's pure CDP mode to make
|
|
4
|
+
requests, allowing to bypass most anti-bot protections (e.g. CloudFlare).
|
|
5
|
+
|
|
6
|
+
Using SeleniumBase's pure CDP mode also makes the middleware more platform-independent
|
|
7
|
+
as no WebDriver is required.
|
|
8
|
+
|
|
9
|
+
🚧 Work in progress 🚧
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
pip install git+https://github.com/nyg/scrapy-seleniumbase-cdp
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Configuration
|
|
18
|
+
|
|
19
|
+
1. Provide keyword arguments for Driver in dict. For example:
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
SELENIUMBASE_DRIVER_KWARGS = {
|
|
23
|
+
"browser": "chrome",
|
|
24
|
+
"uc": True,
|
|
25
|
+
"headless": True,
|
|
26
|
+
"block_images": True,
|
|
27
|
+
}
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
2. Add the `SeleniumBaseMiddleware` to the downloader middlewares:
|
|
31
|
+
```python
|
|
32
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
33
|
+
'scrapy_seleniumbase.SeleniumBaseMiddleware': 800
|
|
34
|
+
}
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
Use the `scrapy_seleniumbase.SeleniumBaseRequest` instead of the scrapy built-in
|
|
40
|
+
`Request` like below:
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from scrapy_seleniumbase import SeleniumBaseRequest
|
|
44
|
+
|
|
45
|
+
yield SeleniumBaseRequest(url=url, callback=self.parse_result)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
The request will be handled by seleniumbase, and the request will have an
|
|
49
|
+
additional `meta` key, named `driver` containing the seleniumbase driver with
|
|
50
|
+
the request processed.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
def parse_result(self, response):
|
|
54
|
+
print(response.request.meta['driver'].title)
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
For more information about the available driver methods and attributes, refer to
|
|
58
|
+
the [selenium python documentation][1] (all vanilla selenium driver methods are
|
|
59
|
+
available) and [seleniumbase documentation][2] (look for "driver" specific
|
|
60
|
+
methods, located at the end of the page).
|
|
61
|
+
|
|
62
|
+
The `selector` response attribute works as usual (but contains the html processed
|
|
63
|
+
by the selenium driver).
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
def parse_result(self, response):
|
|
67
|
+
print(response.selector.xpath('//title/@text'))
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Additional arguments
|
|
71
|
+
|
|
72
|
+
The `scrapy_selenium.SeleniumBaseRequest` accept 5 additional arguments:
|
|
73
|
+
|
|
74
|
+
#### `wait_time` / `wait_until`
|
|
75
|
+
|
|
76
|
+
When used, webdriver will perform an [explicit wait][3] before returning the
|
|
77
|
+
response to the spider.
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from selenium.webdriver.common.by import By
|
|
81
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
82
|
+
|
|
83
|
+
yield SeleniumBaseRequest(
|
|
84
|
+
url=url,
|
|
85
|
+
callback=self.parse_result,
|
|
86
|
+
wait_time=10,
|
|
87
|
+
wait_until=EC.element_to_be_clickable((By.ID, 'someid'))
|
|
88
|
+
)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
#### `screenshot`
|
|
92
|
+
|
|
93
|
+
When used, webdriver will take a screenshot of the page and the binary data of
|
|
94
|
+
the .png captured will be added to the response `meta`:
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
yield SeleniumBaseRequest(
|
|
98
|
+
url=url,
|
|
99
|
+
callback=self.parse_result,
|
|
100
|
+
screenshot=True
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def parse_result(self, response):
|
|
105
|
+
with open('image.png', 'wb') as image_file:
|
|
106
|
+
image_file.write(response.meta['screenshot'])
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
#### `script`
|
|
110
|
+
|
|
111
|
+
When used, webdriver will execute custom JavaScript code.
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
yield SeleniumBaseRequest(
|
|
115
|
+
url=url,
|
|
116
|
+
callback=self.parse_result,
|
|
117
|
+
script='window.scrollTo(0, document.body.scrollHeight);',
|
|
118
|
+
)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
#### `driver_methods`
|
|
122
|
+
|
|
123
|
+
When used, seleniumbase webdriver will execute methods, provided as strings in a
|
|
124
|
+
list, before returning page's html.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
def start_requests(self):
|
|
128
|
+
for url in self.start_urls:
|
|
129
|
+
yield SeleniumBaseRequest(
|
|
130
|
+
url=url,
|
|
131
|
+
driver_methods=['''.find_element("xpath","some_xpath").click()'''])
|
|
132
|
+
|
|
133
|
+
)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
This project is licensed under the MIT License. It is a fork
|
|
139
|
+
of [Quartz-Core/scrapy-seleniumbase](https://github.com/Quartz-Core/scrapy-seleniumbase)
|
|
140
|
+
which was originally released under the WTFPL.
|
|
141
|
+
|
|
142
|
+
[1]: http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webdriver
|
|
143
|
+
|
|
144
|
+
[2]: https://seleniumbase.io/help_docs/method_summary/#seleniumbase-methods-api-reference
|
|
145
|
+
|
|
146
|
+
[3]: http://selenium-python.readthedocs.io/waits.html#explicit-waits
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "scrapy-seleniumbase-cdp"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
authors = [
|
|
5
|
+
{ name = "nyg" },
|
|
6
|
+
]
|
|
7
|
+
description = "Scrapy downloader middleware that uses SeleniumBase's pure CDP mode to make requests."
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.8"
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Intended Audience :: Developers",
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"Operating System :: OS Independent",
|
|
14
|
+
]
|
|
15
|
+
license = { text = "MIT" }
|
|
16
|
+
license-files = ["LICEN[CS]E*"]
|
|
17
|
+
|
|
18
|
+
[project.urls]
|
|
19
|
+
Homepage = "https://github.com/nyg/scrapy-seleniumbase-cdp"
|
|
20
|
+
Issues = "https://github.com/nyg/scrapy-seleniumbase-cdp/issues"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["hatchling"]
|
|
24
|
+
build-backend = "hatchling.build"
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# SeleniumBase middleware using the pure CDP mode instead of the UC mode.
|
|
2
|
+
#
|
|
3
|
+
# Uses the async/await API of SeleniumBase because the event loop of the pure
|
|
4
|
+
# CDP mode conflicts with Scrapy's own event loop. For the moment this means
|
|
5
|
+
# less features.
|
|
6
|
+
#
|
|
7
|
+
# The pure CDP mode does not require any WebDriver and can therefore run on
|
|
8
|
+
# platforms where no such drivers are available (e.g., Raspberry Pi)
|
|
9
|
+
#
|
|
10
|
+
# Based on https://github.com/Quartz-Core/scrapy-seleniumbase.
|
|
11
|
+
#
|
|
12
|
+
# Doc: https://github.com/seleniumbase/SeleniumBase/blob/master/help_docs/syntax_formats.md#sb_sf_24
|
|
13
|
+
# https://github.com/seleniumbase/SeleniumBase/discussions/3955
|
|
14
|
+
|
|
15
|
+
from importlib import import_module
|
|
16
|
+
|
|
17
|
+
from scrapy import signals
|
|
18
|
+
from scrapy.http import HtmlResponse
|
|
19
|
+
from scrapy_seleniumbase import SeleniumBaseRequest
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SeleniumBaseAsyncCDPMiddleware:
    """Scrapy downloader middleware that fetches pages through SeleniumBase's
    asynchronous pure-CDP driver.

    The driver is started when the spider opens and stopped when it closes.
    Only ``SeleniumBaseRequest`` instances are handled; any other request is
    passed through to Scrapy's default download handler.
    """

    # Explicit-wait timeout (seconds) used when the request does not carry
    # its own wait_time.
    DEFAULT_WAIT_TIME = 10

    def __init__(self, driver_kwargs):
        """Initialize the middleware.

        Parameters
        ----------
        driver_kwargs: dict
            Keyword arguments forwarded to ``cdp_driver.start_async`` when
            the driver is created.
        """
        # Lazy import so merely importing this module does not pull in
        # seleniumbase; it is only needed once the middleware is in use.
        from seleniumbase import cdp_driver
        self.start_async_driver = cdp_driver.start_async
        self.driver = None  # created in spider_opened
        self.driver_kwargs = driver_kwargs

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from crawler settings and hook the driver
        lifecycle into the spider_opened / spider_closed signals."""
        driver_kwargs = crawler.settings.get('SELENIUMBASE_DRIVER_KWARGS', {})
        middleware = cls(driver_kwargs)
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    async def process_request(self, request, spider):
        """Fetch ``request.url`` with the CDP driver and return an
        ``HtmlResponse`` built from the rendered DOM.

        Returns ``None`` for non-``SeleniumBaseRequest`` requests so Scrapy
        continues with its regular download handlers.
        """
        if not isinstance(request, SeleniumBaseRequest):
            return None

        page = await self.driver.get(request.url)

        if request.wait_until:
            # NOTE(review): wait_until is forwarded to page.select(), which
            # takes a CSS selector in CDP mode — not a selenium
            # expected_condition as the request docstring suggests; confirm.
            # A SeleniumBaseRequest always has a wait_time attribute (it may
            # be None), so test the value instead of hasattr() — the old
            # hasattr() branch was dead and let timeout=None through.
            timeout = request.wait_time if request.wait_time is not None else self.DEFAULT_WAIT_TIME
            try:
                await page.select(request.wait_until, timeout=timeout)
            except Exception as e:
                # Best effort: log and return the page as-is rather than
                # failing the whole request.
                spider.logger.warning(f'Element not found: {request.wait_until}, {e}')

        # Expose the live driver to the spider callback via request.meta.
        request.meta.update({'driver': self.driver})

        # Serialize the rendered DOM (not the raw network body).
        page_source = await page.evaluate('document.documentElement.outerHTML')
        body = page_source.encode()

        # Use the final in-browser URL (after any redirects) as the
        # response URL.
        return HtmlResponse(await page.evaluate('window.location.href'), body=body, encoding='utf-8', request=request)

    async def spider_opened(self, spider):
        """Start the CDP driver when the spider opens."""
        self.driver = await self.start_async_driver(**self.driver_kwargs)

    def spider_closed(self):
        """Shutdown the driver when the spider is closed."""
        # Guard against the driver never having started (e.g. the spider
        # errored out before spider_opened ran).
        if self.driver is not None:
            self.driver.stop()
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""This module contains the ``SeleniumBaseRequest`` class"""
|
|
2
|
+
|
|
3
|
+
from scrapy import Request
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SeleniumBaseRequest(Request):
    """Scrapy ``Request`` subclass carrying seleniumbase-specific options."""

    def __init__(
        self,
        wait_time=None,
        wait_until=None,
        screenshot=False,
        script=None,
        driver_methods=None,
        *args,
        **kwargs
    ):
        """Create a request to be handled by the seleniumbase middleware.

        Parameters
        ----------
        wait_time: int
            The number of seconds to wait.
        wait_until: method
            One of the "selenium.webdriver.support.expected_conditions". The
            response will not be returned until the given condition is
            fulfilled.
        screenshot: bool
            If True, a screenshot of the page will be taken and the data of
            the screenshot will be returned in the response "meta" attribute.
        script: str
            JavaScript code to execute.
        driver_methods: list
            List of seleniumbase driver methods as strings to execute.
            (e.g., [".find_element(...).click()", ...])
        """
        # Stash the middleware-specific options on the request instance;
        # everything else is forwarded to the base Request untouched.
        options = (
            ('wait_time', wait_time),
            ('wait_until', wait_until),
            ('screenshot', screenshot),
            ('script', script),
            ('driver_methods', driver_methods),
        )
        for attr_name, attr_value in options:
            setattr(self, attr_name, attr_value)

        super().__init__(*args, **kwargs)
|