askpablos-scrapy-api 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- askpablos_scrapy_api-0.1.0/PKG-INFO +103 -0
- askpablos_scrapy_api-0.1.0/README.md +91 -0
- askpablos_scrapy_api-0.1.0/askpablos_scrapy_api/__init__.py +1 -0
- askpablos_scrapy_api-0.1.0/askpablos_scrapy_api/middleware.py +98 -0
- askpablos_scrapy_api-0.1.0/askpablos_scrapy_api.egg-info/PKG-INFO +103 -0
- askpablos_scrapy_api-0.1.0/askpablos_scrapy_api.egg-info/SOURCES.txt +9 -0
- askpablos_scrapy_api-0.1.0/askpablos_scrapy_api.egg-info/dependency_links.txt +1 -0
- askpablos_scrapy_api-0.1.0/askpablos_scrapy_api.egg-info/requires.txt +2 -0
- askpablos_scrapy_api-0.1.0/askpablos_scrapy_api.egg-info/top_level.txt +1 -0
- askpablos_scrapy_api-0.1.0/pyproject.toml +21 -0
- askpablos_scrapy_api-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: askpablos-scrapy-api
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scrapy middleware to route requests through AskPablos Proxy API
|
|
5
|
+
Author-email: Fawad Ali <fawadstar6@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/fawadss1/askpablos-scrapy-api
|
|
8
|
+
Requires-Python: >=3.7
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: scrapy>=2.6.0
|
|
11
|
+
Requires-Dist: requests>=2.25.0
|
|
12
|
+
|
|
13
|
+
# AskPablos Scrapy API Middleware
|
|
14
|
+
|
|
15
|
+
**AskPablosScrapyAPI** is a Scrapy downloader middleware that allows your spiders to transparently route specific requests through an external proxy API like AskPablos. It supports headless browser mode and rotating proxies on a per-request basis, while integrating cleanly with Scrapy's native settings system.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 🚀 Features
|
|
20
|
+
|
|
21
|
+
- ✅ Only activates for requests that explicitly ask for proxy routing
|
|
22
|
+
- ✅ Supports rotating proxies and headless browser rendering
|
|
23
|
+
- ✅ Compatible with per-spider `custom_settings`
|
|
24
|
+
- ✅ Automatically signs requests using HMAC
|
|
25
|
+
- ✅ Clean plug-and-play design for reuse across Scrapy projects
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## 📦 Installation
|
|
30
|
+
|
|
31
|
+
Install via pip:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install askpablos-scrapy-api
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Or directly from the repository:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install git+https://github.com/fawadss1/askpablos-scrapy-api.git
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## 🔧 Quick Setup
|
|
46
|
+
|
|
47
|
+
1. Add the middleware to your Scrapy project settings:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
# settings.py
|
|
51
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
52
|
+
"askpablos_scrapy_api.AskPablosScrapyAPI": 543,
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
# API credentials
|
|
56
|
+
API_KEY = "your-api-key"
|
|
57
|
+
SECRET_KEY = "your-secret-key"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
2. Use in your spider by adding `askpablos_api_map` to the request meta:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
def start_requests(self):
|
|
64
|
+
yield scrapy.Request(
|
|
65
|
+
url="https://example.com",
|
|
66
|
+
callback=self.parse,
|
|
67
|
+
meta={
|
|
68
|
+
"askpablos_api_map": {
|
|
69
|
+
"browser": True, # Use headless browser
|
|
70
|
+
"rotate_proxy": True # Use rotating proxy IP
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## 📚 Documentation
|
|
79
|
+
|
|
80
|
+
For detailed usage instructions and advanced configurations:
|
|
81
|
+
|
|
82
|
+
- [Usage Guide](https://github.com/fawadss1/askpablos-scrapy-api/blob/main/docs/usage.md)
|
|
83
|
+
- [FAQ](https://github.com/fawadss1/askpablos-scrapy-api/blob/main/docs/faq.md)
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## 📋 Requirements
|
|
88
|
+
|
|
89
|
+
- Python 3.7+
|
|
90
|
+
- Scrapy 2.6+
|
|
91
|
+
- Valid AskPablos Proxy API credentials
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## 📝 License
|
|
96
|
+
|
|
97
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## 👤 Author
|
|
102
|
+
|
|
103
|
+
Fawad Ali ([@fawadss1](https://github.com/fawadss1))
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# AskPablos Scrapy API Middleware
|
|
2
|
+
|
|
3
|
+
**AskPablosScrapyAPI** is a Scrapy downloader middleware that allows your spiders to transparently route specific requests through an external proxy API like AskPablos. It supports headless browser mode and rotating proxies on a per-request basis, while integrating cleanly with Scrapy's native settings system.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## 🚀 Features
|
|
8
|
+
|
|
9
|
+
- ✅ Only activates for requests that explicitly ask for proxy routing
|
|
10
|
+
- ✅ Supports rotating proxies and headless browser rendering
|
|
11
|
+
- ✅ Compatible with per-spider `custom_settings`
|
|
12
|
+
- ✅ Automatically signs requests using HMAC
|
|
13
|
+
- ✅ Clean plug-and-play design for reuse across Scrapy projects
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## 📦 Installation
|
|
18
|
+
|
|
19
|
+
Install via pip:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install askpablos-scrapy-api
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Or directly from the repository:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install git+https://github.com/fawadss1/askpablos-scrapy-api.git
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 🔧 Quick Setup
|
|
34
|
+
|
|
35
|
+
1. Add the middleware to your Scrapy project settings:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
# settings.py
|
|
39
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
40
|
+
"askpablos_scrapy_api.AskPablosScrapyAPI": 543,
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
# API credentials
|
|
44
|
+
API_KEY = "your-api-key"
|
|
45
|
+
SECRET_KEY = "your-secret-key"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
2. Use in your spider by adding `askpablos_api_map` to the request meta:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
def start_requests(self):
|
|
52
|
+
yield scrapy.Request(
|
|
53
|
+
url="https://example.com",
|
|
54
|
+
callback=self.parse,
|
|
55
|
+
meta={
|
|
56
|
+
"askpablos_api_map": {
|
|
57
|
+
"browser": True, # Use headless browser
|
|
58
|
+
"rotate_proxy": True # Use rotating proxy IP
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## 📚 Documentation
|
|
67
|
+
|
|
68
|
+
For detailed usage instructions and advanced configurations:
|
|
69
|
+
|
|
70
|
+
- [Usage Guide](https://github.com/fawadss1/askpablos-scrapy-api/blob/main/docs/usage.md)
|
|
71
|
+
- [FAQ](https://github.com/fawadss1/askpablos-scrapy-api/blob/main/docs/faq.md)
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## 📋 Requirements
|
|
76
|
+
|
|
77
|
+
- Python 3.7+
|
|
78
|
+
- Scrapy 2.6+
|
|
79
|
+
- Valid AskPablos Proxy API credentials
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## 📝 License
|
|
84
|
+
|
|
85
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## 👤 Author
|
|
90
|
+
|
|
91
|
+
Fawad Ali ([@fawadss1](https://github.com/fawadss1))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .middleware import AskPablosScrapyAPI
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import base64
|
|
3
|
+
import hmac
|
|
4
|
+
import hashlib
|
|
5
|
+
import requests
|
|
6
|
+
from scrapy.http import HtmlResponse
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AskPablosScrapyAPI:
    """Scrapy downloader middleware that routes selected requests through the AskPablos proxy API.

    The middleware acts **only** on requests whose meta carries a non-empty dict:

        meta = {
            "askpablos_api_map": {
                "browser": True,       # Optional: use headless browser
                "rotate_proxy": True   # Optional: use rotating proxy IP
            }
        }

    Requests without the ``askpablos_api_map`` key, or whose value is empty or
    not a dict, are left untouched (``process_request`` returns ``None``).

    Configuration (via settings.py or the spider's ``custom_settings``):
        API_KEY = "<your API key>"
        SECRET_KEY = "<your secret key>"
        ASKPABLOS_API_URL = "<proxy endpoint>"   # Optional; defaults to ``API_URL``

    NOTE(review): the proxy call is a synchronous ``requests.post`` with a 30s
    timeout, so it presumably stalls Scrapy's event loop while it waits --
    confirm this is acceptable for the expected concurrency.
    """

    # Default proxy endpoint. This is a private-network address over plain
    # HTTP; set ASKPABLOS_API_URL in the Scrapy settings to point elsewhere.
    API_URL = "http://10.10.10.178:7500/api/proxy/"

    def __init__(self, api_key, secret_key):
        """Store the credentials used to authenticate and sign proxy calls."""
        self.api_key = api_key
        self.secret_key = secret_key

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's settings.

        Reads ``API_KEY`` and ``SECRET_KEY`` (both required) and the optional
        ``ASKPABLOS_API_URL`` endpoint override.

        Raises:
            ValueError: if ``API_KEY`` or ``SECRET_KEY`` is missing or empty.
        """
        api_key = crawler.settings.get("API_KEY")
        secret_key = crawler.settings.get("SECRET_KEY")

        if not api_key or not secret_key:
            raise ValueError("API_KEY and SECRET_KEY must be defined in settings.")

        middleware = cls(api_key=api_key, secret_key=secret_key)

        api_url = crawler.settings.get("ASKPABLOS_API_URL")
        if api_url:
            # Instance attribute shadows the class-level default, so the
            # constructor signature and the class constant stay unchanged.
            middleware.API_URL = api_url

        return middleware

    def _sign(self, message):
        """Return the base64-encoded HMAC-SHA256 of *message*, keyed with the secret key."""
        digest = hmac.new(
            self.secret_key.encode(),
            message.encode(),
            hashlib.sha256,
        ).digest()
        return base64.b64encode(digest).decode()

    def process_request(self, request, spider):
        """Fetch ``request.url`` through the proxy API when the request opts in.

        Returns:
            An ``HtmlResponse`` built from the ``body`` field of the API's
            JSON reply, or ``None`` when the request did not opt in or the
            proxy call failed. Per Scrapy's middleware contract a ``None``
            return lets the request continue down the normal download path,
            i.e. on failure the URL is fetched WITHOUT the proxy.
        """
        proxy_cfg = request.meta.get("askpablos_api_map")

        # Opt-in only: anything that is not a non-empty dict skips proxying.
        if not isinstance(proxy_cfg, dict) or not proxy_cfg:
            return None

        payload = {
            "url": request.url,
            # NOTE(review): always "GET", regardless of request.method --
            # confirm whether non-GET requests are meant to be supported.
            "method": "GET",
            "browser": proxy_cfg.get("browser", False),
            "rotateProxy": proxy_cfg.get("rotate_proxy", False),
        }

        try:
            # Compact separators: the signature is computed over these exact
            # bytes, and the same string is sent as the request body.
            request_json = json.dumps(payload, separators=(',', ':'))

            headers = {
                "Content-Type": "application/json",
                "X-API-Key": self.api_key,
                "X-Signature": self._sign(request_json),
            }

            response = requests.post(self.API_URL, data=request_json, headers=headers, timeout=30)
            response.raise_for_status()

            try:
                proxy_response = response.json()
            except ValueError:
                spider.logger.error(f"[AskPablos API] Invalid JSON response from {self.API_URL}")
                return None

            # A JSON reply that is not an object cannot carry a "body" field.
            html_body = proxy_response.get("body") if isinstance(proxy_response, dict) else None
            if not html_body:
                spider.logger.error(f"[AskPablos API] No 'body' in response for {request.url}")
                return None

            return HtmlResponse(
                url=request.url,
                body=html_body,
                encoding="utf-8",
                request=request,
            )

        except Exception as e:
            # Deliberate best-effort boundary: log and fall back to Scrapy's
            # normal download instead of failing the request.
            spider.logger.error(f"[AskPablos API] Error processing {request.url}: {e}")
            return None
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: askpablos-scrapy-api
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scrapy middleware to route requests through AskPablos Proxy API
|
|
5
|
+
Author-email: Fawad Ali <fawadstar6@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/fawadss1/askpablos-scrapy-api
|
|
8
|
+
Requires-Python: >=3.7
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
Requires-Dist: scrapy>=2.6.0
|
|
11
|
+
Requires-Dist: requests>=2.25.0
|
|
12
|
+
|
|
13
|
+
# AskPablos Scrapy API Middleware
|
|
14
|
+
|
|
15
|
+
**AskPablosScrapyAPI** is a Scrapy downloader middleware that allows your spiders to transparently route specific requests through an external proxy API like AskPablos. It supports headless browser mode and rotating proxies on a per-request basis, while integrating cleanly with Scrapy's native settings system.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## 🚀 Features
|
|
20
|
+
|
|
21
|
+
- ✅ Only activates for requests that explicitly ask for proxy routing
|
|
22
|
+
- ✅ Supports rotating proxies and headless browser rendering
|
|
23
|
+
- ✅ Compatible with per-spider `custom_settings`
|
|
24
|
+
- ✅ Automatically signs requests using HMAC
|
|
25
|
+
- ✅ Clean plug-and-play design for reuse across Scrapy projects
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## 📦 Installation
|
|
30
|
+
|
|
31
|
+
Install via pip:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install askpablos-scrapy-api
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Or directly from the repository:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install git+https://github.com/fawadss1/askpablos-scrapy-api.git
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## 🔧 Quick Setup
|
|
46
|
+
|
|
47
|
+
1. Add the middleware to your Scrapy project settings:
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
# settings.py
|
|
51
|
+
DOWNLOADER_MIDDLEWARES = {
|
|
52
|
+
"askpablos_scrapy_api.AskPablosScrapyAPI": 543,
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
# API credentials
|
|
56
|
+
API_KEY = "your-api-key"
|
|
57
|
+
SECRET_KEY = "your-secret-key"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
2. Use in your spider by adding `askpablos_api_map` to the request meta:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
def start_requests(self):
|
|
64
|
+
yield scrapy.Request(
|
|
65
|
+
url="https://example.com",
|
|
66
|
+
callback=self.parse,
|
|
67
|
+
meta={
|
|
68
|
+
"askpablos_api_map": {
|
|
69
|
+
"browser": True, # Use headless browser
|
|
70
|
+
"rotate_proxy": True # Use rotating proxy IP
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## 📚 Documentation
|
|
79
|
+
|
|
80
|
+
For detailed usage instructions and advanced configurations:
|
|
81
|
+
|
|
82
|
+
- [Usage Guide](https://github.com/fawadss1/askpablos-scrapy-api/blob/main/docs/usage.md)
|
|
83
|
+
- [FAQ](https://github.com/fawadss1/askpablos-scrapy-api/blob/main/docs/faq.md)
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## 📋 Requirements
|
|
88
|
+
|
|
89
|
+
- Python 3.7+
|
|
90
|
+
- Scrapy 2.6+
|
|
91
|
+
- Valid AskPablos Proxy API credentials
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## 📝 License
|
|
96
|
+
|
|
97
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## 👤 Author
|
|
102
|
+
|
|
103
|
+
Fawad Ali ([@fawadss1](https://github.com/fawadss1))
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
askpablos_scrapy_api/__init__.py
|
|
4
|
+
askpablos_scrapy_api/middleware.py
|
|
5
|
+
askpablos_scrapy_api.egg-info/PKG-INFO
|
|
6
|
+
askpablos_scrapy_api.egg-info/SOURCES.txt
|
|
7
|
+
askpablos_scrapy_api.egg-info/dependency_links.txt
|
|
8
|
+
askpablos_scrapy_api.egg-info/requires.txt
|
|
9
|
+
askpablos_scrapy_api.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
askpablos_scrapy_api
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "askpablos-scrapy-api"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Scrapy middleware to route requests through AskPablos Proxy API"
|
|
5
|
+
authors = [
|
|
6
|
+
{ name="Fawad Ali", email="fawadstar6@gmail.com" }
|
|
7
|
+
]
|
|
8
|
+
license = {text = "MIT"}
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.7"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"scrapy>=2.6.0",
|
|
13
|
+
"requests>=2.25.0"
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.urls]
|
|
17
|
+
Homepage = "https://github.com/fawadss1/askpablos-scrapy-api"
|
|
18
|
+
|
|
19
|
+
[build-system]
|
|
20
|
+
requires = ["setuptools>=61.0"]
|
|
21
|
+
build-backend = "setuptools.build_meta"
|