orbweaver-tools 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orbweaver_tools-1.1.0/LICENSE +22 -0
- orbweaver_tools-1.1.0/PKG-INFO +124 -0
- orbweaver_tools-1.1.0/README.md +96 -0
- orbweaver_tools-1.1.0/orb/__init__.py +14 -0
- orbweaver_tools-1.1.0/orb/common/__init__.py +0 -0
- orbweaver_tools-1.1.0/orb/common/design/__init__.py +0 -0
- orbweaver_tools-1.1.0/orb/common/design/welcome_page.py +99 -0
- orbweaver_tools-1.1.0/orb/common/proxies/__init__.py +0 -0
- orbweaver_tools-1.1.0/orb/common/proxies/get_proxies.py +111 -0
- orbweaver_tools-1.1.0/orb/common/proxies/test_proxies.py +31 -0
- orbweaver_tools-1.1.0/orb/common/user_agents/__init__.py +0 -0
- orbweaver_tools-1.1.0/orb/common/user_agents/user_agents.py +62 -0
- orbweaver_tools-1.1.0/orb/common/vpn/__init__.py +5 -0
- orbweaver_tools-1.1.0/orb/common/vpn/pia.py +262 -0
- orbweaver_tools-1.1.0/orb/scraper/__init__.py +0 -0
- orbweaver_tools-1.1.0/orb/scraper/utils.py +45 -0
- orbweaver_tools-1.1.0/orb/spinner/__init__.py +0 -0
- orbweaver_tools-1.1.0/orb/spinner/builder/__init__.py +0 -0
- orbweaver_tools-1.1.0/orb/spinner/core/__init__.py +0 -0
- orbweaver_tools-1.1.0/orb/spinner/core/driver.py +144 -0
- orbweaver_tools-1.1.0/orb/spinner/utils.py +205 -0
- orbweaver_tools-1.1.0/orb/utils/__init__.py +4 -0
- orbweaver_tools-1.1.0/orb/utils/decorators.py +112 -0
- orbweaver_tools-1.1.0/pyproject.toml +42 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
|
|
2
|
+
MIT License
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2025 Tom Freeman
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: orbweaver-tools
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Stealth-oriented utilities for web scraping with rotating user agents, proxies, and Selenium helpers.
|
|
5
|
+
Home-page: https://github.com/Tom3man/orb-weaver
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: web-scraping,selenium,proxies,automation
|
|
8
|
+
Author: Tom Freeman
|
|
9
|
+
Requires-Python: >=3.10,<4.0
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Requires-Dist: bs4 (>=0.0.1,<0.0.2)
|
|
20
|
+
Requires-Dist: fake-useragent (>=1.4.0,<2.0.0)
|
|
21
|
+
Requires-Dist: pandas (>=2.1.4,<3.0.0)
|
|
22
|
+
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
|
23
|
+
Requires-Dist: selenium (>=4.16.0,<5.0.0)
|
|
24
|
+
Requires-Dist: webdriver-manager (>=4.0.1,<5.0.0)
|
|
25
|
+
Project-URL: Repository, https://github.com/Tom3man/orb-weaver
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# Orb Weaver
|
|
29
|
+
|
|
30
|
+
Utilities for stealth-oriented web scraping with Selenium helpers, rotating headers, and proxy support.
|
|
31
|
+
|
|
32
|
+
## Features
|
|
33
|
+
|
|
34
|
+
- Selenium driver bootstrap via `OrbDriver`
|
|
35
|
+
- Randomized user-agent and request headers
|
|
36
|
+
- Proxy harvesting and validation utilities
|
|
37
|
+
- Human-like browser interaction helpers (`slow_type`, random clicking, scroll, viewport changes)
|
|
38
|
+
- Optional PIA VPN integration for IP rotation
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
From PyPI (after publishing):
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install orbweaver-tools
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
From source with Poetry:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
poetry install
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Quick Start
|
|
55
|
+
|
|
56
|
+
### Build a Selenium driver
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from orb.spinner.core.driver import OrbDriver
|
|
60
|
+
|
|
61
|
+
orb_driver = OrbDriver(use_pia=False)
|
|
62
|
+
driver = orb_driver.get_webdriver(url="https://example.com")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Send spoofed requests
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from bs4 import BeautifulSoup
|
|
69
|
+
from orb.scraper.utils import spoof_request
|
|
70
|
+
|
|
71
|
+
response = spoof_request("https://example.com", use_proxies=False)
|
|
72
|
+
soup = BeautifulSoup(response.content, "html.parser")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Human-like interactions
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from selenium.webdriver.common.by import By
|
|
79
|
+
from orb.spinner.utils import slow_type, human_clicking
|
|
80
|
+
|
|
81
|
+
input_box = driver.find_element(By.ID, "search")
|
|
82
|
+
slow_type(input_box, "hello world", send_keys=True)
|
|
83
|
+
|
|
84
|
+
button = driver.find_element(By.ID, "submit")
|
|
85
|
+
human_clicking(driver, button)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Development
|
|
89
|
+
|
|
90
|
+
Run tests:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
poetry run pytest -q
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Build package artifacts:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
poetry build
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Publish to PyPI
|
|
103
|
+
|
|
104
|
+
1. Create an account on [PyPI](https://pypi.org/).
|
|
105
|
+
2. Create an API token and configure Poetry credentials:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
poetry config pypi-token.pypi <your-token>
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
3. Publish:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
poetry publish --build
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
MIT. See [`LICENSE`](LICENSE).
|
|
120
|
+
|
|
121
|
+
## Notes
|
|
122
|
+
|
|
123
|
+
Use these tools responsibly and only against systems where you have permission to automate or scrape.
|
|
124
|
+
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# Orb Weaver
|
|
2
|
+
|
|
3
|
+
Utilities for stealth-oriented web scraping with Selenium helpers, rotating headers, and proxy support.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- Selenium driver bootstrap via `OrbDriver`
|
|
8
|
+
- Randomized user-agent and request headers
|
|
9
|
+
- Proxy harvesting and validation utilities
|
|
10
|
+
- Human-like browser interaction helpers (`slow_type`, random clicking, scroll, viewport changes)
|
|
11
|
+
- Optional PIA VPN integration for IP rotation
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
From PyPI (after publishing):
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install orbweaver-tools
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
From source with Poetry:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
poetry install
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
### Build a Selenium driver
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from orb.spinner.core.driver import OrbDriver
|
|
33
|
+
|
|
34
|
+
orb_driver = OrbDriver(use_pia=False)
|
|
35
|
+
driver = orb_driver.get_webdriver(url="https://example.com")
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Send spoofed requests
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from bs4 import BeautifulSoup
|
|
42
|
+
from orb.scraper.utils import spoof_request
|
|
43
|
+
|
|
44
|
+
response = spoof_request("https://example.com", use_proxies=False)
|
|
45
|
+
soup = BeautifulSoup(response.content, "html.parser")
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Human-like interactions
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from selenium.webdriver.common.by import By
|
|
52
|
+
from orb.spinner.utils import slow_type, human_clicking
|
|
53
|
+
|
|
54
|
+
input_box = driver.find_element(By.ID, "search")
|
|
55
|
+
slow_type(input_box, "hello world", send_keys=True)
|
|
56
|
+
|
|
57
|
+
button = driver.find_element(By.ID, "submit")
|
|
58
|
+
human_clicking(driver, button)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Development
|
|
62
|
+
|
|
63
|
+
Run tests:
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
poetry run pytest -q
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Build package artifacts:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
poetry build
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Publish to PyPI
|
|
76
|
+
|
|
77
|
+
1. Create an account on [PyPI](https://pypi.org/).
|
|
78
|
+
2. Create an API token and configure Poetry credentials:
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
poetry config pypi-token.pypi <your-token>
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
3. Publish:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
poetry publish --build
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## License
|
|
91
|
+
|
|
92
|
+
MIT. See [`LICENSE`](LICENSE).
|
|
93
|
+
|
|
94
|
+
## Notes
|
|
95
|
+
|
|
96
|
+
Use these tools responsibly and only against systems where you have permission to automate or scrape.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import logging
import os
from importlib.metadata import PackageNotFoundError, version

# Absolute directory of this package (orb/) and the repository root one level up.
# REPO_PATH is used elsewhere (e.g. welcome_page) as a scratch location for files.
MODULE_PATH = os.path.dirname(os.path.realpath(__file__))
REPO_PATH = os.path.dirname(MODULE_PATH)

# Resolve the installed distribution's version; fall back to a placeholder when
# running from an uninstalled source checkout.
try:
    __version__ = version("orbweaver-tools")
except PackageNotFoundError:
    __version__ = "0.0.0"

# Library convention: attach a NullHandler so that importing applications decide
# whether and how "orb" log records are emitted.
log = logging.getLogger("orb")
log.addHandler(logging.NullHandler())
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import socket
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
from selenium.webdriver.remote.webdriver import WebDriver
|
|
7
|
+
|
|
8
|
+
from orb import REPO_PATH
|
|
9
|
+
from orb.spinner.utils import get_user_agent
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def create_welcome_page(
    proxy_info: Optional[str] = None,
    user_agent: Optional[str] = None,
) -> str:
    """
    Create a welcome page HTML content with IP, proxy, and user agent information.

    Args:
        proxy_info (str, optional): Proxy information. Defaults to None.
        user_agent (str, optional): User agent information. Defaults to None.

    Returns:
        str: The HTML content of the welcome page.
    """
    ip_section = f"""
    <h2>IP Information</h2>
    <p>Local IP Address: {get_local_ip()}</p>
    <p>Public IP Address: {get_public_ip()}</p>
    """

    # Each optional section collapses to an empty string when its value is absent.
    if proxy_info:
        proxy_section = f"""
    <h2>Proxy Information</h2>
    <p>Proxy: {proxy_info}</p>
    """
    else:
        proxy_section = ""

    if user_agent:
        agent_section = f"""
    <h2>User Agent Information</h2>
    <p>User Agent: {user_agent}</p>
    """
    else:
        agent_section = ""

    return f"""
    <html>
    <body>
    <h1>Welcome to the Orb Weaver Project!</h1>
    {ip_section}
    {proxy_section}
    {agent_section}
    </body>
    </html>
    """
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_local_ip() -> str:
    """
    Get the local IP address of the machine.

    Resolves this host's own name through the system resolver.

    Returns:
        str: The local IP address.
    """
    hostname = socket.gethostname()
    return socket.gethostbyname(hostname)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def get_public_ip() -> str:
    """
    Get the public IP address of the machine.

    Queries the ipify web service, which echoes back the caller's
    public-facing IP address as plain text.

    Returns:
        str: The public IP address.

    Raises:
        requests.exceptions.RequestException: If the lookup fails or times out.
    """
    # requests has no default timeout, so without one this call could hang
    # indefinitely when the service is unreachable.
    response = requests.get('https://api.ipify.org', timeout=10)
    return response.text
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def build_welcome_page(
    driver: WebDriver,
    proxy_info: Optional[str] = None,
) -> None:
    """
    Render a welcome page in the active WebDriver session.

    The page (IP, proxy, and user agent details) is written to a temporary
    HTML file in the repository root, loaded by the driver, and deleted
    immediately afterwards so no sensitive information is stored locally.

    Args:
        driver (WebDriver): The WebDriver instance used to display the page.
        proxy_info (str, optional): Proxy information to display. Defaults to None.
    """
    html = create_welcome_page(
        proxy_info=proxy_info,
        user_agent=get_user_agent(driver=driver),
    )

    file_path = f"{REPO_PATH}/proxy_info.html"
    with open(file_path, 'w') as handle:
        handle.write(html)

    # Load the local file into the browser, then remove it from disk.
    driver.get(f'file://{file_path}')
    os.remove(file_path)
|
|
File without changes
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import requests
|
|
7
|
+
from bs4 import BeautifulSoup, Tag
|
|
8
|
+
|
|
9
|
+
from orb.common.proxies.test_proxies import test_proxy
|
|
10
|
+
|
|
11
|
+
log = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class GetProxies:
    """
    A class for retrieving and working with proxy information.

    Scrapes https://free-proxy-list.net/ and exposes a validated proxy
    dictionary suitable for use with ``requests``.
    """

    PROXY_SITE = "https://free-proxy-list.net/"
    # Number of randomly sampled proxies to validate before giving up.
    MAX_RETRIES = 5

    def __init__(self) -> None:
        """
        Initializes the GetProxies object and sets the current date and time.
        """
        todays_datetime = datetime.now()
        self.date_now = todays_datetime.strftime("%Y-%m-%d")
        self.time_now = todays_datetime.strftime("%H:%M")

    def request_proxies(self) -> requests.Response:
        """
        Sends a request to the proxy URL and returns the response object.

        Returns:
            requests.Response: The response object from the request.
        """
        # Bounded timeout so a slow listing site cannot hang the caller
        # (requests has no default timeout).
        return requests.get(self.PROXY_SITE, timeout=10)

    def parse_requests(self) -> BeautifulSoup:
        """
        Parses the returned request from request_proxies and returns a BeautifulSoup object.

        Returns:
            BeautifulSoup: The BeautifulSoup object representing the parsed HTML.
        """
        return BeautifulSoup(self.request_proxies().content, 'html.parser')

    def extract_table_html(self) -> Tag:
        """
        Extracts the raw HTML of the table containing the proxy information.

        Returns:
            Tag: The tag of the first <table> element on the page.
        """
        return self.parse_requests().find('table')

    def return_proxy_table(self, https_only: bool = True) -> pd.DataFrame:
        """
        Iterates through the HTML table to build a pandas DataFrame of proxies.

        The first table row supplies the column headers (upper-cased, spaces
        replaced by underscores); every following row becomes one record.

        Args:
            https_only (bool, optional): Whether to return only proxies with HTTPS support.

        Returns:
            pd.DataFrame: The pandas DataFrame containing the proxy information.
        """
        rows = self.extract_table_html().find_all('tr')
        headers = [
            td.text.upper().replace(" ", "_") for td in rows[0].find_all(['th', 'td'])
        ]
        # Collect all records first and build the DataFrame once: appending
        # via df.loc[len(df)] copies the frame on every row (O(n^2)).
        records = [
            [td.text for td in tr.find_all(['th', 'td'])] for tr in rows[1:]
        ]
        df = pd.DataFrame(records, columns=headers)

        if https_only:
            return df[df['HTTPS'] == 'yes']
        return df

    def build_proxy_dict(self) -> None:
        """
        Builds a dictionary of HTTP and HTTPS proxy values from the proxy table DataFrame.

        Samples one random HTTPS-capable proxy and stores it on ``self.proxies``.
        """
        proxy_table = self.return_proxy_table(https_only=True)
        proxy_row = proxy_table.sample(1)
        ip_address = proxy_row['IP_ADDRESS'].values[0]
        port = proxy_row['PORT'].values[0]

        self.proxies = {
            "http": f"{ip_address}:{port}",
            "https": f"{ip_address}:{port}",
        }

    @property
    def proxy_dict(self) -> Dict[str, str]:
        """
        Property that returns the proxy dictionary.

        Returns:
            Dict[str, str]: The dictionary containing HTTP and HTTPS proxy values.

        Raises:
            RuntimeError: If a working proxy cannot be found after maximum retries.
        """
        # Retry with a fresh random proxy up to MAX_RETRIES times; the docstring
        # always promised retries, but a single attempt was previously made.
        for _ in range(self.MAX_RETRIES):
            self.build_proxy_dict()
            if test_proxy(proxies=self.proxies):
                return self.proxies
        raise RuntimeError("Failed to find a working proxy.")
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Dict
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
log = logging.getLogger(__name__)
|
|
7
|
+
__test__ = False
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_proxy(proxies: Dict[str, str]) -> bool:
    """
    Test the functionality of a proxy by making a request to a sample URL.

    Args:
        proxies (Dict[str, str]): Dictionary containing HTTP and HTTPS proxies.

    Returns:
        bool: True if the proxy is working, False otherwise.
    """
    url = 'http://www.example.com'
    try:
        response = requests.get(url, proxies=proxies, timeout=5)
    except requests.exceptions.RequestException:
        # Connection-level failure: the proxy never answered.
        log.error("Unable to connect to the proxy.")
        return False

    if response.status_code != 200:
        log.error(f"{proxies['https']} Proxy is NOT working!")
        return False

    log.info(f"{proxies['https']} Proxy is working!")
    return True
|
|
File without changes
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import random
|
|
2
|
+
from typing import Dict
|
|
3
|
+
|
|
4
|
+
from fake_useragent import UserAgent
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class GetUserAgent:
    """
    A class for retrieving random User-Agent headers for web scraping.

    Provides randomized, browser-like request headers (user agent,
    accept-language, referer, and assorted hint headers).
    """

    def __init__(self) -> None:
        """
        Initializes the GetUserAgent object.

        Sets up the UserAgent instance plus the pools of accept-languages
        and referer URLs used to randomize the headers.
        """
        self.user_agent = UserAgent()
        self.accept_languages = ["en-US", "en-GB", "fr-FR", "es-ES"]
        self.referer_urls = [
            "https://www.google.com",
            "https://www.bing.com",
            "https://search.yahoo.com",
            "https://en.wikipedia.org",
            "https://www.reddit.com",
            "https://twitter.com",
            "https://www.facebook.com",
            "https://www.youtube.com",
            "https://www.amazon.com",
            "https://www.cnn.com",
            "https://www.bbc.com",
            "https://www.reuters.com",
        ]

    def get_random_referer(self) -> str:
        """
        Returns a random referer URL.

        Returns:
            str: A randomly chosen referer URL.
        """
        return random.choice(self.referer_urls)

    @property
    def headers_dict(self) -> Dict[str, str]:
        """
        Returns a dictionary containing randomized request headers.

        Every value is a plain string (the previous annotation
        Dict[str, "UserAgent"] was incorrect: only 'User-Agent' comes from
        fake-useragent, and `.random` yields a str).

        Returns:
            Dict[str, str]: Headers with a random user agent, a randomly
            chosen referer URL, and randomized auxiliary header values.
        """
        return {
            "User-Agent": self.user_agent.random,
            "Accept-Language": random.choice(self.accept_languages),
            "Referer": self.get_random_referer(),
            "Cache-Control": "max-age=0" if random.random() < 0.5 else "no-cache",
            "Upgrade-Insecure-Requests": "1" if random.random() < 0.5 else "0",
            "DNT": "1" if random.random() < 0.3 else "0",
            "X-Forwarded-For": ".".join(str(random.randint(0, 255)) for _ in range(4)),
            "X-Requested-With": "XMLHttpRequest" if random.random() < 0.5 else "",
            "X-Frame-Options": "DENY" if random.random() < 0.5 else "SAMEORIGIN",
            "Connection": "keep-alive" if random.random() < 0.8 else "close",
        }
|