python-proxy-headers 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- python_proxy_headers-0.2.0/PKG-INFO +176 -0
- python_proxy_headers-0.2.0/README.md +155 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/pyproject.toml +4 -2
- python_proxy_headers-0.2.0/python_proxy_headers/autoscraper_proxy.py +344 -0
- python_proxy_headers-0.2.0/python_proxy_headers/cloudscraper_proxy.py +213 -0
- python_proxy_headers-0.2.0/python_proxy_headers/pycurl_proxy.py +379 -0
- python_proxy_headers-0.2.0/python_proxy_headers.egg-info/PKG-INFO +176 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/python_proxy_headers.egg-info/SOURCES.txt +3 -0
- python_proxy_headers-0.1.0/PKG-INFO +0 -151
- python_proxy_headers-0.1.0/README.md +0 -133
- python_proxy_headers-0.1.0/python_proxy_headers.egg-info/PKG-INFO +0 -151
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/LICENSE +0 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/python_proxy_headers/__init__.py +0 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/python_proxy_headers/aiohttp_proxy.py +0 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/python_proxy_headers/httpx_proxy.py +0 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/python_proxy_headers/requests_adapter.py +0 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/python_proxy_headers/urllib3_proxy_manager.py +0 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/python_proxy_headers.egg-info/dependency_links.txt +0 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/python_proxy_headers.egg-info/top_level.txt +0 -0
- {python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/setup.cfg +0 -0
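
Version 0.2.0 adds three modules: `autoscraper_proxy`, `cloudscraper_proxy`, and `pycurl_proxy`. A rough, hypothetical post-upgrade smoke check might look like the sketch below; it assumes each new module guards its third-party import the way `autoscraper_proxy` is shown doing in its hunk further down, which this diff only confirms for that one module.

```python
# Hypothetical smoke check after upgrading to 0.2.0: the module names come from
# the file list above; nothing here is part of the package's own test harness.
import importlib

for name in ("pycurl_proxy", "cloudscraper_proxy", "autoscraper_proxy"):
    try:
        importlib.import_module(f"python_proxy_headers.{name}")
        print(f"{name}: importable")
    except ImportError as exc:
        # Expected when the backing library (pycurl, cloudscraper, autoscraper)
        # is not installed, consistent with the "no dependencies by default" note.
        print(f"{name}: optional dependency missing -> {exc}")
```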
python_proxy_headers-0.2.0/PKG-INFO
@@ -0,0 +1,176 @@
+Metadata-Version: 2.4
+Name: python-proxy-headers
+Version: 0.2.0
+Summary: Handle custom proxy headers for http & https requests in various python libraries
+Author-email: ProxyMesh <support@proxymesh.com>
+Project-URL: Homepage, https://github.com/proxymesh/python-proxy-headers
+Project-URL: Changelog, https://github.com/proxymesh/python-proxy-headers/commits/main/
+Project-URL: Issues, https://github.com/proxymesh/python-proxy-headers/issues
+Project-URL: Documentation, https://python-proxy-headers.readthedocs.io/en/latest/
+Project-URL: ProxyMesh, https://proxymesh.com
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: license-file
+
+# Python Proxy Headers
+
+[](https://python-proxy-headers.readthedocs.io/en/latest/?badge=latest)
+[](https://badge.fury.io/py/python-proxy-headers)
+
+Extensions for Python HTTP libraries to support **sending and receiving custom proxy headers** during HTTPS CONNECT tunneling.
+
+## The Problem
+
+When making HTTPS requests through a proxy, the connection is established via a CONNECT tunnel. During this process:
+
+1. **Sending headers to the proxy** - Most Python HTTP libraries don't provide an easy way to send custom headers (like `X-ProxyMesh-Country`) to the proxy server during the CONNECT handshake.
+
+2. **Receiving headers from the proxy** - The proxy's response headers from the CONNECT request are typically discarded, making it impossible to read custom headers (like `X-ProxyMesh-IP`) that the proxy sends back.
+
+This library solves both problems for popular Python HTTP libraries.
+
+## Supported Libraries
+
+| Library | Module | Use Case |
+|---------|--------|----------|
+| [urllib3](https://python-proxy-headers.readthedocs.io/en/latest/urllib3.html) | `urllib3_proxy_manager` | Low-level HTTP client |
+| [requests](https://python-proxy-headers.readthedocs.io/en/latest/requests.html) | `requests_adapter` | Simple HTTP requests |
+| [aiohttp](https://python-proxy-headers.readthedocs.io/en/latest/aiohttp.html) | `aiohttp_proxy` | Async HTTP client |
+| [httpx](https://python-proxy-headers.readthedocs.io/en/latest/httpx.html) | `httpx_proxy` | Modern HTTP client |
+| [pycurl](https://python-proxy-headers.readthedocs.io/en/latest/pycurl.html) | `pycurl_proxy` | libcurl bindings |
+| [cloudscraper](https://python-proxy-headers.readthedocs.io/en/latest/cloudscraper.html) | `cloudscraper_proxy` | Cloudflare bypass |
+| [autoscraper](https://python-proxy-headers.readthedocs.io/en/latest/autoscraper.html) | `autoscraper_proxy` | Automatic web scraping |
+
+## Installation
+
+```bash
+pip install python-proxy-headers
+```
+
+Then install the HTTP library you want to use (e.g., `pip install requests`).
+
+> **Note:** This package has no dependencies by default - install only what you need.
+
+## Quick Start
+
+### requests
+
+```python
+from python_proxy_headers.requests_adapter import ProxySession
+
+with ProxySession(proxy_headers={'X-ProxyMesh-Country': 'US'}) as session:
+    session.proxies = {'https': 'http://user:pass@proxy.example.com:8080'}
+    response = session.get('https://httpbin.org/ip')
+
+    # Proxy headers are merged into response.headers
+    print(response.headers.get('X-ProxyMesh-IP'))
+```
+
+### httpx
+
+```python
+from python_proxy_headers.httpx_proxy import get
+
+response = get(
+    'https://httpbin.org/ip',
+    proxy='http://user:pass@proxy.example.com:8080'
+)
+
+# Proxy CONNECT response headers are merged into response.headers
+print(response.headers.get('X-ProxyMesh-IP'))
+```
+
+### aiohttp
+
+```python
+import asyncio
+from python_proxy_headers.aiohttp_proxy import ProxyClientSession
+
+async def main():
+    async with ProxyClientSession() as session:
+        async with session.get(
+            'https://httpbin.org/ip',
+            proxy='http://user:pass@proxy.example.com:8080'
+        ) as response:
+            # Proxy headers merged into response.headers
+            print(response.headers.get('X-ProxyMesh-IP'))
+
+asyncio.run(main())
+```
+
+### pycurl (low-level)
+
+```python
+import pycurl
+from python_proxy_headers.pycurl_proxy import set_proxy_headers, HeaderCapture
+
+c = pycurl.Curl()
+c.setopt(pycurl.URL, 'https://httpbin.org/ip')
+c.setopt(pycurl.PROXY, 'http://proxy.example.com:8080')
+
+# Add these two lines to any existing pycurl code
+set_proxy_headers(c, {'X-ProxyMesh-Country': 'US'})
+capture = HeaderCapture(c)
+
+c.perform()
+
+print(capture.proxy_headers) # Headers from proxy CONNECT response
+c.close()
+```
+
+### cloudscraper
+
+```python
+from python_proxy_headers.cloudscraper_proxy import create_scraper
+
+# Drop-in replacement for cloudscraper.create_scraper()
+scraper = create_scraper(proxy_headers={'X-ProxyMesh-Country': 'US'})
+scraper.proxies = {'https': 'http://proxy.example.com:8080'}
+
+response = scraper.get('https://example.com')
+# All CloudScraper features (Cloudflare bypass) preserved
+```
+
+## Testing
+
+A test harness is included to verify proxy header functionality:
+
+```bash
+# Set your proxy
+export PROXY_URL='http://user:pass@proxy.example.com:8080'
+
+# Test all modules
+python test_proxy_headers.py
+
+# Test specific modules
+python test_proxy_headers.py requests httpx
+
+# Verbose output (show header values)
+python test_proxy_headers.py -v
+```
+
+## Documentation
+
+For detailed documentation, API reference, and more examples:
+
+- **Full Documentation:** [python-proxy-headers.readthedocs.io](https://python-proxy-headers.readthedocs.io/en/latest/)
+- **Example Code:** [proxy-examples for Python](https://github.com/proxymesh/proxy-examples/tree/main/python)
+
+## Related Projects
+
+- **[scrapy-proxy-headers](https://github.com/proxymesh/scrapy-proxy-headers)** - Proxy header support for Scrapy
+
+## About
+
+Created by [ProxyMesh](https://proxymesh.com) to help our customers use custom headers to control proxy behavior. Works with any proxy that supports custom headers.
+
+## License
+
+MIT License
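
The CONNECT behavior described under "The Problem" above can be seen with nothing but the standard library. The sketch below is illustrative only and is not part of the package: `http.client` will forward custom headers on the CONNECT request via `set_tunnel`, but the proxy's CONNECT reply is consumed during tunnel setup, so a header such as `X-ProxyMesh-IP` never reaches the caller; that gap is what the modules in this release expose. The proxy address is a placeholder.

```python
# Minimal stdlib illustration of the CONNECT handshake discussed in "The Problem".
# Placeholder proxy address; the X-ProxyMesh-* header names follow the README.
import http.client

conn = http.client.HTTPSConnection("proxy.example.com", 8080)
# Headers passed here are sent to the proxy on the CONNECT request.
conn.set_tunnel("httpbin.org", 443, headers={"X-ProxyMesh-Country": "US"})
conn.request("GET", "/ip")
resp = conn.getresponse()
# resp.headers holds only the target server's headers; the proxy's CONNECT
# reply (e.g. X-ProxyMesh-IP) is handled during tunnel setup and not exposed here.
print(resp.status, resp.headers.get("X-ProxyMesh-IP"))
conn.close()
```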
python_proxy_headers-0.2.0/README.md
@@ -0,0 +1,155 @@
+# Python Proxy Headers
+
+[](https://python-proxy-headers.readthedocs.io/en/latest/?badge=latest)
+[](https://badge.fury.io/py/python-proxy-headers)
+
+Extensions for Python HTTP libraries to support **sending and receiving custom proxy headers** during HTTPS CONNECT tunneling.
+
+## The Problem
+
+When making HTTPS requests through a proxy, the connection is established via a CONNECT tunnel. During this process:
+
+1. **Sending headers to the proxy** - Most Python HTTP libraries don't provide an easy way to send custom headers (like `X-ProxyMesh-Country`) to the proxy server during the CONNECT handshake.
+
+2. **Receiving headers from the proxy** - The proxy's response headers from the CONNECT request are typically discarded, making it impossible to read custom headers (like `X-ProxyMesh-IP`) that the proxy sends back.
+
+This library solves both problems for popular Python HTTP libraries.
+
+## Supported Libraries
+
+| Library | Module | Use Case |
+|---------|--------|----------|
+| [urllib3](https://python-proxy-headers.readthedocs.io/en/latest/urllib3.html) | `urllib3_proxy_manager` | Low-level HTTP client |
+| [requests](https://python-proxy-headers.readthedocs.io/en/latest/requests.html) | `requests_adapter` | Simple HTTP requests |
+| [aiohttp](https://python-proxy-headers.readthedocs.io/en/latest/aiohttp.html) | `aiohttp_proxy` | Async HTTP client |
+| [httpx](https://python-proxy-headers.readthedocs.io/en/latest/httpx.html) | `httpx_proxy` | Modern HTTP client |
+| [pycurl](https://python-proxy-headers.readthedocs.io/en/latest/pycurl.html) | `pycurl_proxy` | libcurl bindings |
+| [cloudscraper](https://python-proxy-headers.readthedocs.io/en/latest/cloudscraper.html) | `cloudscraper_proxy` | Cloudflare bypass |
+| [autoscraper](https://python-proxy-headers.readthedocs.io/en/latest/autoscraper.html) | `autoscraper_proxy` | Automatic web scraping |
+
+## Installation
+
+```bash
+pip install python-proxy-headers
+```
+
+Then install the HTTP library you want to use (e.g., `pip install requests`).
+
+> **Note:** This package has no dependencies by default - install only what you need.
+
+## Quick Start
+
+### requests
+
+```python
+from python_proxy_headers.requests_adapter import ProxySession
+
+with ProxySession(proxy_headers={'X-ProxyMesh-Country': 'US'}) as session:
+    session.proxies = {'https': 'http://user:pass@proxy.example.com:8080'}
+    response = session.get('https://httpbin.org/ip')
+
+    # Proxy headers are merged into response.headers
+    print(response.headers.get('X-ProxyMesh-IP'))
+```
+
+### httpx
+
+```python
+from python_proxy_headers.httpx_proxy import get
+
+response = get(
+    'https://httpbin.org/ip',
+    proxy='http://user:pass@proxy.example.com:8080'
+)
+
+# Proxy CONNECT response headers are merged into response.headers
+print(response.headers.get('X-ProxyMesh-IP'))
+```
+
+### aiohttp
+
+```python
+import asyncio
+from python_proxy_headers.aiohttp_proxy import ProxyClientSession
+
+async def main():
+    async with ProxyClientSession() as session:
+        async with session.get(
+            'https://httpbin.org/ip',
+            proxy='http://user:pass@proxy.example.com:8080'
+        ) as response:
+            # Proxy headers merged into response.headers
+            print(response.headers.get('X-ProxyMesh-IP'))
+
+asyncio.run(main())
+```
+
+### pycurl (low-level)
+
+```python
+import pycurl
+from python_proxy_headers.pycurl_proxy import set_proxy_headers, HeaderCapture
+
+c = pycurl.Curl()
+c.setopt(pycurl.URL, 'https://httpbin.org/ip')
+c.setopt(pycurl.PROXY, 'http://proxy.example.com:8080')
+
+# Add these two lines to any existing pycurl code
+set_proxy_headers(c, {'X-ProxyMesh-Country': 'US'})
+capture = HeaderCapture(c)
+
+c.perform()
+
+print(capture.proxy_headers) # Headers from proxy CONNECT response
+c.close()
+```
+
+### cloudscraper
+
+```python
+from python_proxy_headers.cloudscraper_proxy import create_scraper
+
+# Drop-in replacement for cloudscraper.create_scraper()
+scraper = create_scraper(proxy_headers={'X-ProxyMesh-Country': 'US'})
+scraper.proxies = {'https': 'http://proxy.example.com:8080'}
+
+response = scraper.get('https://example.com')
+# All CloudScraper features (Cloudflare bypass) preserved
+```
+
+## Testing
+
+A test harness is included to verify proxy header functionality:
+
+```bash
+# Set your proxy
+export PROXY_URL='http://user:pass@proxy.example.com:8080'
+
+# Test all modules
+python test_proxy_headers.py
+
+# Test specific modules
+python test_proxy_headers.py requests httpx
+
+# Verbose output (show header values)
+python test_proxy_headers.py -v
+```
+
+## Documentation
+
+For detailed documentation, API reference, and more examples:
+
+- **Full Documentation:** [python-proxy-headers.readthedocs.io](https://python-proxy-headers.readthedocs.io/en/latest/)
+- **Example Code:** [proxy-examples for Python](https://github.com/proxymesh/proxy-examples/tree/main/python)
+
+## Related Projects
+
+- **[scrapy-proxy-headers](https://github.com/proxymesh/scrapy-proxy-headers)** - Proxy header support for Scrapy
+
+## About
+
+Created by [ProxyMesh](https://proxymesh.com) to help our customers use custom headers to control proxy behavior. Works with any proxy that supports custom headers.
+
+## License
+
+MIT License
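
Between the Quick Start (which hard-codes the proxy URL) and the Testing section (which reads it from `PROXY_URL`), a natural combination is to feed the environment variable into `ProxySession`. The wiring below is my own sketch, not something the diff shows `test_proxy_headers.py` doing; the header names and httpbin URL follow the README, and it assumes `ProxySession` returns a standard requests response as the Quick Start implies.

```python
# Sketch: reuse the PROXY_URL environment variable from the Testing section
# with the ProxySession shown in the requests Quick Start.
import os

from python_proxy_headers.requests_adapter import ProxySession

proxy_url = os.environ["PROXY_URL"]  # e.g. http://user:pass@proxy.example.com:8080

with ProxySession(proxy_headers={"X-ProxyMesh-Country": "US"}) as session:
    session.proxies = {"https": proxy_url}
    response = session.get("https://httpbin.org/ip")
    # Per the README, the proxy's CONNECT reply headers are merged into response.headers.
    print(response.status_code, response.headers.get("X-ProxyMesh-IP"))
```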
{python_proxy_headers-0.1.0 → python_proxy_headers-0.2.0}/pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "python-proxy-headers"
-version = "0.1.0"
+version = "0.2.0"
 authors = [
   { name="ProxyMesh", email="support@proxymesh.com" },
 ]
-description = "Handle custom proxy headers for http requests in various python libraries"
+description = "Handle custom proxy headers for http & https requests in various python libraries"
 readme = "README.md"
 requires-python = ">=3.8"
 classifiers = [
@@ -24,3 +24,5 @@ classifiers = [
 Homepage = "https://github.com/proxymesh/python-proxy-headers"
 Changelog = "https://github.com/proxymesh/python-proxy-headers/commits/main/"
 Issues = "https://github.com/proxymesh/python-proxy-headers/issues"
+Documentation = "https://python-proxy-headers.readthedocs.io/en/latest/"
+ProxyMesh = "https://proxymesh.com"
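
These pyproject.toml changes are what generate the Version, Summary, and the two added Project-URL entries in the PKG-INFO shown earlier. A brief verification sketch against an installed copy, my own suggestion rather than part of the package, uses only `importlib.metadata` (Python 3.8+, matching `requires-python`):

```python
# Inspect the installed distribution's metadata, which is built from the
# pyproject.toml fields changed in these hunks (version, description, urls).
from importlib.metadata import metadata, version

print(version("python-proxy-headers"))           # expected: 0.2.0
meta = metadata("python-proxy-headers")
print(meta["Summary"])                           # the updated "http & https" description
for entry in meta.get_all("Project-URL") or []:  # includes Documentation and ProxyMesh
    print(entry)
```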
python_proxy_headers-0.2.0/python_proxy_headers/autoscraper_proxy.py
@@ -0,0 +1,344 @@
+"""
+AutoScraper extension for sending and receiving proxy headers.
+
+This module provides an AutoScraper subclass that enables:
+1. Sending custom headers to proxy servers during CONNECT
+2. Using our ProxySession for all HTTP requests
+
+Example usage:
+    from python_proxy_headers.autoscraper_proxy import ProxyAutoScraper
+
+    scraper = ProxyAutoScraper(proxy_headers={'X-ProxyMesh-Country': 'US'})
+
+    # Build with proxy
+    result = scraper.build(
+        url='https://example.com',
+        wanted_list=['Example Domain'],
+        request_args={'proxies': {'https': 'http://proxy:8080'}}
+    )
+
+    # Get results with proxy
+    result = scraper.get_result_similar(
+        url='https://other-example.com',
+        request_args={'proxies': {'https': 'http://proxy:8080'}}
+    )
+"""
+
+from typing import Dict, List, Optional, Any
+from urllib.parse import urlparse
+
+try:
+    from autoscraper import AutoScraper
+except ImportError:
+    raise ImportError(
+        "autoscraper is required for this module. "
+        "Install it with: pip install autoscraper"
+    )
+
+from .requests_adapter import ProxySession
+
+
+class ProxyAutoScraper(AutoScraper):
+    """
+    AutoScraper with proxy header support.
+
+    This class extends AutoScraper to use our ProxySession for HTTP requests,
+    enabling custom proxy headers to be sent during CONNECT tunneling.
+
+    Args:
+        proxy_headers: Dict of headers to send to proxy servers
+        stack_list: Initial stack list (rules) for the scraper
+
+    Example:
+        scraper = ProxyAutoScraper(proxy_headers={'X-ProxyMesh-Country': 'US'})
+
+        result = scraper.build(
+            url='https://finance.yahoo.com/quote/AAPL/',
+            wanted_list=['Apple Inc.'],
+            request_args={'proxies': {'https': 'http://proxy:8080'}}
+        )
+
+        # Use the learned rules on another page
+        result = scraper.get_result_similar(
+            url='https://finance.yahoo.com/quote/GOOG/',
+            request_args={'proxies': {'https': 'http://proxy:8080'}}
+        )
+    """
+
+    def __init__(
+        self,
+        proxy_headers: Optional[Dict[str, str]] = None,
+        stack_list: Optional[List] = None
+    ):
+        super().__init__(stack_list=stack_list)
+        self._proxy_headers = proxy_headers or {}
+        self._session: Optional[ProxySession] = None
+
+    def _get_session(self) -> ProxySession:
+        """Get or create the ProxySession."""
+        if self._session is None:
+            self._session = ProxySession(proxy_headers=self._proxy_headers)
+        return self._session
+
+    def set_proxy_headers(self, proxy_headers: Dict[str, str]):
+        """
+        Update the proxy headers.
+
+        This will close the current session and create a new one with
+        the updated headers on the next request.
+
+        Args:
+            proxy_headers: New proxy headers to use
+        """
+        self._proxy_headers = proxy_headers
+        if self._session is not None:
+            self._session.close()
+            self._session = None
+
+    def close(self):
+        """Close the underlying session."""
+        if self._session is not None:
+            self._session.close()
+            self._session = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args):
+        self.close()
+
+    @classmethod
+    def _fetch_html(cls, url, request_args=None):
+        """
+        Fetch HTML from URL using the standard requests.
+
+        Note: This is the class method from parent. For proxy header support,
+        use instance methods which use the ProxySession.
+        """
+        # Fall back to parent implementation for class method calls
+        return super()._fetch_html(url, request_args)
+
+    def _fetch_html_with_proxy(self, url: str, request_args: Optional[Dict] = None) -> str:
+        """
+        Fetch HTML from URL using ProxySession with proxy header support.
+
+        Args:
+            url: URL to fetch
+            request_args: Additional request arguments (proxies, headers, etc.)
+
+        Returns:
+            HTML content as string
+        """
+        request_args = request_args or {}
+
+        # Build headers
+        headers = dict(self.request_headers)
+        if url:
+            headers["Host"] = urlparse(url).netloc
+
+        user_headers = request_args.pop("headers", {})
+        headers.update(user_headers)
+
+        # Use our ProxySession
+        session = self._get_session()
+
+        # Copy session-level settings if not in request_args
+        if 'proxies' in request_args:
+            session.proxies.update(request_args.pop('proxies'))
+
+        res = session.get(url, headers=headers, **request_args)
+
+        # Handle encoding
+        if res.encoding == "ISO-8859-1" and "ISO-8859-1" not in res.headers.get(
+            "Content-Type", ""
+        ):
+            res.encoding = res.apparent_encoding
+
+        return res.text
+
+    def _get_soup_with_proxy(self, url=None, html=None, request_args=None):
+        """
+        Get BeautifulSoup object using ProxySession.
+
+        Args:
+            url: URL to fetch (optional if html is provided)
+            html: HTML string (optional if url is provided)
+            request_args: Additional request arguments
+
+        Returns:
+            BeautifulSoup object
+        """
+        from html import unescape
+        from bs4 import BeautifulSoup
+        from autoscraper.utils import normalize
+
+        if html:
+            html = normalize(unescape(html))
+            return BeautifulSoup(html, "lxml")
+
+        html = self._fetch_html_with_proxy(url, request_args)
+        html = normalize(unescape(html))
+
+        return BeautifulSoup(html, "lxml")
+
+    def build(
+        self,
+        url: Optional[str] = None,
+        wanted_list: Optional[List] = None,
+        wanted_dict: Optional[Dict] = None,
+        html: Optional[str] = None,
+        request_args: Optional[Dict] = None,
+        update: bool = False,
+        text_fuzz_ratio: float = 1.0,
+    ) -> List:
+        """
+        Build scraping rules with proxy header support.
+
+        Same as AutoScraper.build() but uses ProxySession for requests.
+
+        Parameters:
+            url: URL of the target web page
+            wanted_list: List of needed contents to be scraped
+            wanted_dict: Dict of needed contents (keys are aliases)
+            html: HTML string (alternative to URL)
+            request_args: Request arguments including proxies
+            update: If True, add to existing rules
+            text_fuzz_ratio: Fuzziness ratio for matching
+
+        Returns:
+            List of similar results
+        """
+        from html import unescape
+        from autoscraper.utils import normalize, unique_hashable, unique_stack_list
+
+        if not wanted_list and not (wanted_dict and any(wanted_dict.values())):
+            raise ValueError("No targets were supplied")
+
+        # Use our proxy-aware soup getter
+        soup = self._get_soup_with_proxy(url=url, html=html, request_args=request_args)
+
+        result_list = []
+
+        if update is False:
+            self.stack_list = []
+
+        if wanted_list:
+            wanted_dict = {"": wanted_list}
+
+        wanted_list = []
+
+        for alias, wanted_items in wanted_dict.items():
+            wanted_items = [normalize(w) for w in wanted_items]
+            wanted_list += wanted_items
+
+            for wanted in wanted_items:
+                children = self._get_children(soup, wanted, url, text_fuzz_ratio)
+
+                for child in children:
+                    result, stack = self._get_result_for_child(child, soup, url)
+                    stack["alias"] = alias
+                    result_list += result
+                    self.stack_list.append(stack)
+
+        result_list = [item.text for item in result_list]
+        result_list = unique_hashable(result_list)
+
+        self.stack_list = unique_stack_list(self.stack_list)
+        return result_list
+
+    def get_result_similar(
+        self,
+        url: Optional[str] = None,
+        html: Optional[str] = None,
+        soup=None,
+        request_args: Optional[Dict] = None,
+        grouped: bool = False,
+        group_by_alias: bool = False,
+        unique: Optional[bool] = None,
+        attr_fuzz_ratio: float = 1.0,
+        keep_blank: bool = False,
+        keep_order: bool = False,
+        contain_sibling_leaves: bool = False,
+    ):
+        """
+        Get similar results with proxy header support.
+
+        Same as AutoScraper.get_result_similar() but uses ProxySession.
+        """
+        if soup is None and url is not None:
+            soup = self._get_soup_with_proxy(url=url, html=html, request_args=request_args)
+
+        return super().get_result_similar(
+            url=url,
+            html=html,
+            soup=soup,
+            request_args=None, # Already fetched
+            grouped=grouped,
+            group_by_alias=group_by_alias,
+            unique=unique,
+            attr_fuzz_ratio=attr_fuzz_ratio,
+            keep_blank=keep_blank,
+            keep_order=keep_order,
+            contain_sibling_leaves=contain_sibling_leaves,
+        )
+
+    def get_result_exact(
+        self,
+        url: Optional[str] = None,
+        html: Optional[str] = None,
+        soup=None,
+        request_args: Optional[Dict] = None,
+        grouped: bool = False,
+        group_by_alias: bool = False,
+        unique: Optional[bool] = None,
+        attr_fuzz_ratio: float = 1.0,
+        keep_blank: bool = False,
+    ):
+        """
+        Get exact results with proxy header support.
+
+        Same as AutoScraper.get_result_exact() but uses ProxySession.
+        """
+        if soup is None and url is not None:
+            soup = self._get_soup_with_proxy(url=url, html=html, request_args=request_args)
+
+        return super().get_result_exact(
+            url=url,
+            html=html,
+            soup=soup,
+            request_args=None, # Already fetched
+            grouped=grouped,
+            group_by_alias=group_by_alias,
+            unique=unique,
+            attr_fuzz_ratio=attr_fuzz_ratio,
+            keep_blank=keep_blank,
+        )
+
+    def get_result(
+        self,
+        url: Optional[str] = None,
+        html: Optional[str] = None,
+        request_args: Optional[Dict] = None,
+        grouped: bool = False,
+        group_by_alias: bool = False,
+        unique: Optional[bool] = None,
+        attr_fuzz_ratio: float = 1.0,
+    ):
+        """
+        Get similar and exact results with proxy header support.
+
+        Same as AutoScraper.get_result() but uses ProxySession.
+        """
+        soup = self._get_soup_with_proxy(url=url, html=html, request_args=request_args)
+
+        args = dict(
+            url=url,
+            soup=soup,
+            grouped=grouped,
+            group_by_alias=group_by_alias,
+            unique=unique,
+            attr_fuzz_ratio=attr_fuzz_ratio,
+        )
+        similar = self.get_result_similar(**args)
+        exact = self.get_result_exact(**args)
+        return similar, exact
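
The module docstring above demonstrates `build()` and `get_result_similar()`, but not the context-manager protocol (`__enter__`/`__exit__`) or `set_proxy_headers()`, both of which the class defines. The following is a sketch based only on the methods shown in this hunk, with placeholder URLs and proxy address.

```python
# Sketch combining ProxyAutoScraper's context-manager support with
# set_proxy_headers(), both defined in the module above.
from python_proxy_headers.autoscraper_proxy import ProxyAutoScraper

request_args = {"proxies": {"https": "http://proxy.example.com:8080"}}

with ProxyAutoScraper(proxy_headers={"X-ProxyMesh-Country": "US"}) as scraper:
    # Learn extraction rules through the proxy.
    scraper.build(
        url="https://example.com",
        wanted_list=["Example Domain"],
        request_args=request_args,
    )

    # Switch the proxy header for subsequent requests; per set_proxy_headers(),
    # the next request creates a fresh ProxySession with the new headers.
    scraper.set_proxy_headers({"X-ProxyMesh-Country": "DE"})
    results = scraper.get_result_similar(
        url="https://example.org",
        request_args=request_args,
    )
    print(results)
```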