orbweaver-tools 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+
2
+ MIT License
3
+
4
+ Copyright (c) 2025 Tom Freeman
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
@@ -0,0 +1,124 @@
1
+ Metadata-Version: 2.1
2
+ Name: orbweaver-tools
3
+ Version: 1.1.0
4
+ Summary: Stealth-oriented utilities for web scraping with rotating user agents, proxies, and Selenium helpers.
5
+ Home-page: https://github.com/Tom3man/orb-weaver
6
+ License: MIT
7
+ Keywords: web-scraping,selenium,proxies,automation
8
+ Author: Tom Freeman
9
+ Requires-Python: >=3.10,<4.0
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Internet :: WWW/HTTP
18
+ Classifier: Topic :: Software Development :: Libraries
19
+ Requires-Dist: bs4 (>=0.0.1,<0.0.2)
20
+ Requires-Dist: fake-useragent (>=1.4.0,<2.0.0)
21
+ Requires-Dist: pandas (>=2.1.4,<3.0.0)
22
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
23
+ Requires-Dist: selenium (>=4.16.0,<5.0.0)
24
+ Requires-Dist: webdriver-manager (>=4.0.1,<5.0.0)
25
+ Project-URL: Repository, https://github.com/Tom3man/orb-weaver
26
+ Description-Content-Type: text/markdown
27
+
28
+ # Orb Weaver
29
+
30
+ Utilities for stealth-oriented web scraping with Selenium helpers, rotating headers, and proxy support.
31
+
32
+ ## Features
33
+
34
+ - Selenium driver bootstrap via `OrbDriver`
35
+ - Randomized user-agent and request headers
36
+ - Proxy harvesting and validation utilities
37
+ - Human-like browser interaction helpers (`slow_type`, random clicking, scroll, viewport changes)
38
+ - Optional PIA VPN integration for IP rotation
39
+
40
+ ## Installation
41
+
42
+ From PyPI (after publish):
43
+
44
+ ```bash
45
+ pip install orbweaver-tools
46
+ ```
47
+
48
+ From source with Poetry:
49
+
50
+ ```bash
51
+ poetry install
52
+ ```
53
+
54
+ ## Quick Start
55
+
56
+ ### Build a Selenium driver
57
+
58
+ ```python
59
+ from orb.spinner.core.driver import OrbDriver
60
+
61
+ orb_driver = OrbDriver(use_pia=False)
62
+ driver = orb_driver.get_webdriver(url="https://example.com")
63
+ ```
64
+
65
+ ### Send spoofed requests
66
+
67
+ ```python
68
+ from bs4 import BeautifulSoup
69
+ from orb.scraper.utils import spoof_request
70
+
71
+ response = spoof_request("https://example.com", use_proxies=False)
72
+ soup = BeautifulSoup(response.content, "html.parser")
73
+ ```
74
+
75
+ ### Human-like interactions
76
+
77
+ ```python
78
+ from selenium.webdriver.common.by import By
79
+ from orb.spinner.utils import slow_type, human_clicking
80
+
81
+ input_box = driver.find_element(By.ID, "search")
82
+ slow_type(input_box, "hello world", send_keys=True)
83
+
84
+ button = driver.find_element(By.ID, "submit")
85
+ human_clicking(driver, button)
86
+ ```
87
+
88
+ ## Development
89
+
90
+ Run tests:
91
+
92
+ ```bash
93
+ poetry run pytest -q
94
+ ```
95
+
96
+ Build package artifacts:
97
+
98
+ ```bash
99
+ poetry build
100
+ ```
101
+
102
+ ## Publish to PyPI
103
+
104
+ 1. Create an account on [PyPI](https://pypi.org/).
105
+ 2. Create an API token and configure Poetry credentials:
106
+
107
+ ```bash
108
+ poetry config pypi-token.pypi <your-token>
109
+ ```
110
+
111
+ 3. Publish:
112
+
113
+ ```bash
114
+ poetry publish --build
115
+ ```
116
+
117
+ ## License
118
+
119
+ MIT. See [`LICENSE`](LICENSE).
120
+
121
+ ## Notes
122
+
123
+ Use these tools responsibly and only against systems where you have permission to automate or scrape.
124
+
@@ -0,0 +1,96 @@
1
+ # Orb Weaver
2
+
3
+ Utilities for stealth-oriented web scraping with Selenium helpers, rotating headers, and proxy support.
4
+
5
+ ## Features
6
+
7
+ - Selenium driver bootstrap via `OrbDriver`
8
+ - Randomized user-agent and request headers
9
+ - Proxy harvesting and validation utilities
10
+ - Human-like browser interaction helpers (`slow_type`, random clicking, scroll, viewport changes)
11
+ - Optional PIA VPN integration for IP rotation
12
+
13
+ ## Installation
14
+
15
+ From PyPI (after publish):
16
+
17
+ ```bash
18
+ pip install orbweaver-tools
19
+ ```
20
+
21
+ From source with Poetry:
22
+
23
+ ```bash
24
+ poetry install
25
+ ```
26
+
27
+ ## Quick Start
28
+
29
+ ### Build a Selenium driver
30
+
31
+ ```python
32
+ from orb.spinner.core.driver import OrbDriver
33
+
34
+ orb_driver = OrbDriver(use_pia=False)
35
+ driver = orb_driver.get_webdriver(url="https://example.com")
36
+ ```
37
+
38
+ ### Send spoofed requests
39
+
40
+ ```python
41
+ from bs4 import BeautifulSoup
42
+ from orb.scraper.utils import spoof_request
43
+
44
+ response = spoof_request("https://example.com", use_proxies=False)
45
+ soup = BeautifulSoup(response.content, "html.parser")
46
+ ```
47
+
48
+ ### Human-like interactions
49
+
50
+ ```python
51
+ from selenium.webdriver.common.by import By
52
+ from orb.spinner.utils import slow_type, human_clicking
53
+
54
+ input_box = driver.find_element(By.ID, "search")
55
+ slow_type(input_box, "hello world", send_keys=True)
56
+
57
+ button = driver.find_element(By.ID, "submit")
58
+ human_clicking(driver, button)
59
+ ```
60
+
61
+ ## Development
62
+
63
+ Run tests:
64
+
65
+ ```bash
66
+ poetry run pytest -q
67
+ ```
68
+
69
+ Build package artifacts:
70
+
71
+ ```bash
72
+ poetry build
73
+ ```
74
+
75
+ ## Publish to PyPI
76
+
77
+ 1. Create an account on [PyPI](https://pypi.org/).
78
+ 2. Create an API token and configure Poetry credentials:
79
+
80
+ ```bash
81
+ poetry config pypi-token.pypi <your-token>
82
+ ```
83
+
84
+ 3. Publish:
85
+
86
+ ```bash
87
+ poetry publish --build
88
+ ```
89
+
90
+ ## License
91
+
92
+ MIT. See [`LICENSE`](LICENSE).
93
+
94
+ ## Notes
95
+
96
+ Use these tools responsibly and only against systems where you have permission to automate or scrape.
@@ -0,0 +1,14 @@
1
import logging
import os
from importlib.metadata import PackageNotFoundError, version

# Absolute path of this package directory, and of the repository root above it.
MODULE_PATH = os.path.dirname(os.path.realpath(__file__))
REPO_PATH = os.path.dirname(MODULE_PATH)

# Resolve the installed distribution's version; fall back to a sentinel when
# running from a source checkout that has not been pip/poetry installed.
try:
    __version__ = version("orbweaver-tools")
except PackageNotFoundError:
    __version__ = "0.0.0"

# Library convention: attach a NullHandler so that importing applications
# decide whether and where "orb" log records are emitted.
log = logging.getLogger("orb")
log.addHandler(logging.NullHandler())
File without changes
File without changes
@@ -0,0 +1,99 @@
1
+ import os
2
+ import socket
3
+ from typing import Optional
4
+
5
+ import requests
6
+ from selenium.webdriver.remote.webdriver import WebDriver
7
+
8
+ from orb import REPO_PATH
9
+ from orb.spinner.utils import get_user_agent
10
+
11
+
12
def create_welcome_page(
    proxy_info: Optional[str] = None,
    user_agent: Optional[str] = None,
) -> str:
    """
    Build the HTML for a welcome page showing IP, proxy, and user-agent info.

    Args:
        proxy_info (str, optional): Proxy information to display. Defaults to None.
        user_agent (str, optional): User agent string to display. Defaults to None.

    Returns:
        str: The HTML content of the welcome page.
    """
    ip_information = f"""
    <h2>IP Information</h2>
    <p>Local IP Address: {get_local_ip()}</p>
    <p>Public IP Address: {get_public_ip()}</p>
    """

    # Optional sections collapse to empty strings when no value was supplied.
    proxy_info_section = ""
    if proxy_info:
        proxy_info_section = f"""
    <h2>Proxy Information</h2>
    <p>Proxy: {proxy_info}</p>
    """

    user_agent_section = ""
    if user_agent:
        user_agent_section = f"""
    <h2>User Agent Information</h2>
    <p>User Agent: {user_agent}</p>
    """

    return f"""
    <html>
    <body>
    <h1>Welcome to the Orb Weaver Project!</h1>
    {ip_information}
    {proxy_info_section}
    {user_agent_section}
    </body>
    </html>
    """
53
+
54
+
55
def get_local_ip() -> str:
    """
    Get the local IP address of the machine.

    Returns:
        str: The IPv4 address that the machine's hostname resolves to.
    """
    hostname = socket.gethostname()
    return socket.gethostbyname(hostname)
63
+
64
+
65
def get_public_ip() -> str:
    """
    Get the public IP address of the machine via the ipify web service.

    Returns:
        str: The public IP address as reported by api.ipify.org.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the service responds with an HTTP error status.
    """
    # A timeout stops this call from hanging indefinitely on a dead network,
    # and raise_for_status() prevents an HTML error page from being returned
    # to callers as if it were an IP address.
    response = requests.get('https://api.ipify.org', timeout=10)
    response.raise_for_status()
    return response.text
74
+
75
+
76
def build_welcome_page(
    driver: WebDriver,
    proxy_info: Optional[str] = None,
) -> None:
    """
    Render a welcome page in the given WebDriver showing IP/proxy/user-agent info.

    The page is written to a temporary HTML file under the repository root,
    loaded by the active driver, and then deleted so no sensitive
    information is kept on disk.

    Args:
        driver (WebDriver): The WebDriver instance used to render the page.
        proxy_info (str, optional): Proxy information to display. Defaults to None.
    """
    page_content = create_welcome_page(
        user_agent=get_user_agent(driver=driver),
        proxy_info=proxy_info,
    )

    file_path = os.path.join(REPO_PATH, "proxy_info.html")
    with open(file_path, 'w') as file:
        file.write(page_content)

    # Delete the file even if the driver fails to load it, so sensitive
    # proxy/IP details never linger on disk.
    try:
        driver.get(f'file://{file_path}')
    finally:
        os.remove(file_path)
File without changes
@@ -0,0 +1,111 @@
1
+ import logging
2
+ from datetime import datetime
3
+ from typing import Dict
4
+
5
+ import pandas as pd
6
+ import requests
7
+ from bs4 import BeautifulSoup, Tag
8
+
9
+ from orb.common.proxies.test_proxies import test_proxy
10
+
11
+ log = logging.getLogger(__name__)
12
+
13
+
14
class GetProxies:
    """
    Retrieve free proxies from free-proxy-list.net and expose a validated
    HTTP/HTTPS proxy pair.
    """

    PROXY_SITE = "https://free-proxy-list.net/"
    # Seconds to wait for the proxy-list site before giving up.
    REQUEST_TIMEOUT = 10
    # Number of random proxies to validate before declaring failure.
    MAX_RETRIES = 5

    def __init__(self) -> None:
        """
        Initializes the GetProxies object and records the current date and time.
        """
        todays_datetime = datetime.now()
        self.date_now = todays_datetime.strftime("%Y-%m-%d")
        self.time_now = todays_datetime.strftime("%H:%M")

    def request_proxies(self) -> requests.Response:
        """
        Sends a request to the proxy URL and returns the response object.

        Returns:
            requests.Response: The response object from the request.
        """
        # Timeout prevents an unresponsive site from hanging the caller.
        return requests.get(self.PROXY_SITE, timeout=self.REQUEST_TIMEOUT)

    def parse_requests(self) -> BeautifulSoup:
        """
        Parses the response from request_proxies into a BeautifulSoup object.

        Returns:
            BeautifulSoup: The parsed HTML document.
        """
        return BeautifulSoup(self.request_proxies().content, 'html.parser')

    def extract_table_html(self) -> Tag:
        """
        Extracts the HTML table containing the proxy information.

        Returns:
            Tag: The <table> element of the proxy listing.

        Raises:
            RuntimeError: If the fetched page contains no table (e.g. the
                site changed its layout or returned an error page).
        """
        table = self.parse_requests().find('table')
        if table is None:
            raise RuntimeError("No proxy table found on the proxy-list page.")
        return table

    def return_proxy_table(self, https_only: bool = True) -> pd.DataFrame:
        """
        Builds a pandas DataFrame of proxies from the HTML table.

        Args:
            https_only (bool, optional): Whether to return only proxies with
                HTTPS support. Defaults to True.

        Returns:
            pd.DataFrame: One row per proxy; columns come from the table header.
        """
        headers = None
        rows = []
        for tr in self.extract_table_html().find_all('tr'):
            cells = [td.text for td in tr.find_all(['th', 'td'])]
            if headers is None:
                # First row is the header row; normalise names to UPPER_SNAKE.
                headers = [cell.upper().replace(" ", "_") for cell in cells]
                continue
            rows.append(cells)

        # Building the frame once is much cheaper than row-by-row .loc inserts.
        df = pd.DataFrame(rows, columns=headers)
        if https_only:
            return df[df['HTTPS'] == 'yes']
        return df

    def build_proxy_dict(self) -> None:
        """
        Samples one HTTPS-capable proxy and stores it on ``self.proxies`` as
        a requests-style mapping with 'http' and 'https' entries.
        """
        proxy_table = self.return_proxy_table(https_only=True)
        proxy_row = proxy_table.sample(1)
        ip_address = proxy_row['IP_ADDRESS'].values[0]
        port = proxy_row['PORT'].values[0]

        self.proxies = {
            "http": f"{ip_address}:{port}",
            "https": f"{ip_address}:{port}",
        }

    @property
    def proxy_dict(self) -> Dict[str, str]:
        """
        Property that returns a validated proxy dictionary.

        Samples and tests up to MAX_RETRIES random proxies, returning the
        first one that passes ``test_proxy``.

        Returns:
            Dict[str, str]: The dictionary containing HTTP and HTTPS proxy values.

        Raises:
            RuntimeError: If a working proxy cannot be found after maximum retries.
        """
        # The docstring promises retries; previously only a single proxy was
        # ever attempted before raising.
        for attempt in range(self.MAX_RETRIES):
            self.build_proxy_dict()
            if test_proxy(proxies=self.proxies):
                return self.proxies
            log.info(
                "Proxy attempt %d/%d failed; retrying.",
                attempt + 1, self.MAX_RETRIES,
            )
        raise RuntimeError("Failed to find a working proxy.")
@@ -0,0 +1,31 @@
1
+ import logging
2
+ from typing import Dict
3
+
4
+ import requests
5
+
6
+ log = logging.getLogger(__name__)
7
+ __test__ = False
8
+
9
+
10
def test_proxy(proxies: Dict[str, str]) -> bool:
    """
    Test the functionality of a proxy by making a request to a sample URL.

    Args:
        proxies (Dict[str, str]): Dictionary containing HTTP and HTTPS proxies.

    Returns:
        bool: True if the proxy is working, False otherwise.
    """
    target = 'http://www.example.com'
    # Only the network call can raise; handle that narrowly.
    try:
        response = requests.get(target, proxies=proxies, timeout=5)
    except requests.exceptions.RequestException:
        log.error("Unable to connect to the proxy.")
        return False

    if response.status_code == 200:
        log.info(f"{proxies['https']} Proxy is working!")
        return True
    log.error(f"{proxies['https']} Proxy is NOT working!")
    return False
@@ -0,0 +1,62 @@
1
+ import random
2
+ from typing import Dict
3
+
4
+ from fake_useragent import UserAgent
5
+
6
+
7
class GetUserAgent:
    """
    A class for retrieving random, browser-like HTTP request headers for
    web scraping.
    """

    def __init__(self) -> None:
        """
        Initializes the GetUserAgent object.
        Sets up the UserAgent instance and the pools of header values that
        ``headers_dict`` samples from.
        """
        self.user_agent = UserAgent()
        self.accept_languages = ["en-US", "en-GB", "fr-FR", "es-ES"]
        self.referer_urls = [
            "https://www.google.com",
            "https://www.bing.com",
            "https://search.yahoo.com",
            "https://en.wikipedia.org",
            "https://www.reddit.com",
            "https://twitter.com",
            "https://www.facebook.com",
            "https://www.youtube.com",
            "https://www.amazon.com",
            "https://www.cnn.com",
            "https://www.bbc.com",
            "https://www.reuters.com",
        ]

    def get_random_referer(self) -> str:
        """
        Returns a random referer URL.

        Returns:
            str: A randomly chosen referer URL.
        """
        return random.choice(self.referer_urls)

    @property
    def headers_dict(self) -> Dict[str, str]:
        """
        Returns a dictionary of randomized request headers.

        Note: the return annotation was previously ``Dict[str, "UserAgent"]``,
        but every value in the mapping is a plain string.

        Returns:
            Dict[str, str]: Headers with a random User-Agent, a random referer,
            and assorted randomized browser-fingerprint fields.
        """
        return {
            "User-Agent": self.user_agent.random,
            "Accept-Language": random.choice(self.accept_languages),
            "Referer": self.get_random_referer(),
            "Cache-Control": "max-age=0" if random.random() < 0.5 else "no-cache",
            "Upgrade-Insecure-Requests": "1" if random.random() < 0.5 else "0",
            "DNT": "1" if random.random() < 0.3 else "0",
            "X-Forwarded-For": ".".join(str(random.randint(0, 255)) for _ in range(4)),
            "X-Requested-With": "XMLHttpRequest" if random.random() < 0.5 else "",
            "X-Frame-Options": "DENY" if random.random() < 0.5 else "SAMEORIGIN",
            "Connection": "keep-alive" if random.random() < 0.8 else "close",
        }
@@ -0,0 +1,5 @@
1
from .pia import PiaVpn

# __all__ entries must be strings naming public attributes; listing the class
# object itself breaks `from ... import *` semantics and tooling that reads
# __all__ (the original had `__all__ = [PiaVpn]`).
__all__ = [
    "PiaVpn",
]