abstract-webtools 0.1.6.95__py3-none-any.whl → 0.1.6.97__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
abstract_webtools/k2s_downloader.py (new file)
@@ -0,0 +1,128 @@
+ import os
+ import re
+ import time
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.common.keys import Keys
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from abstract_security import *
+ from abstract_webtools import *
+ DOWNLOAD_DIR = os.path.abspath("./downloads")
+ os.makedirs(DOWNLOAD_DIR, exist_ok=True)
+ class K2SDownloader:
+     def __init__(self):
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         })
+         self.driver = self._init_driver()
+         self.logged_in = False
+
+     def _init_driver(self):
+         options = webdriver.ChromeOptions()
+         options.add_argument("--disable-blink-features=AutomationControlled")
+         options.add_argument("--headless")
+         return webdriver.Chrome(options=options)
+
+     def login(self):
+         userName = get_env_value('userName')
+         passWord = get_env_value('passWord')
+
+         self.driver.get("https://k2s.cc/auth/login")
+         time.sleep(3)
+
+
+         email_input = self.driver.find_element(By.NAME, "email")
+         password_input = self.driver.find_element(By.NAME, "input-password-auto-complete-on")
+         email_input.send_keys(userName)
+         password_input.send_keys(passWord)
+         password_input.send_keys(Keys.RETURN)
+
+         #WebDriverWait(self.driver, 20).until(
+         #    EC.presence_of_element_located((By.XPATH, "//a[contains(text(), 'Logout')]"))
+         #)
+         self.logged_in = True
+         print("Login successful")
+         #except Exception as e:
+         #    print(f"Login failed: {e}")
+         #    with open('login_error.html', 'w', encoding='utf-8') as f:
+         #        f.write(self.driver.page_source)
+
+     def download_file(self, url):
+         if not self.logged_in:
+             self.login()
+
+         print(f"Navigating to: {url}")
+         self.driver.get(url)
+         time.sleep(5)
+
+         if 'captcha' in self.driver.page_source.lower():
+             print("CAPTCHA detected. Manual intervention required.")
+             return
+
+         try:
+             download_button = WebDriverWait(self.driver, 30).until(
+                 EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[href*="/download"], button[class*="download"]'))
+             )
+             print("Download button found; attempting to click or fetch URL")
+             download_url = download_button.get_attribute('href')
+
+             if download_url:
+                 response = self.session.get(download_url, stream=True)
+                 file_name = self._extract_filename(response, download_url)
+                 file_path = os.path.join(DOWNLOAD_DIR, file_name)
+
+                 with open(file_path, 'wb') as f:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         f.write(chunk)
+                 print(f"Downloaded: {file_path}")
+                 return file_path
+             else:
+                 download_button.click()
+                 print("Button clicked. Waiting for download...")
+                 time.sleep(30)  # adjust as needed
+         except Exception as e:
+             print(f"Download failed for {url}: {e}")
+
+     def _extract_filename(self, response, url):
+         cd = response.headers.get('Content-Disposition', '')
+         if 'filename=' in cd:
+             return cd.split('filename=')[-1].strip('"')
+         return url.split('/')[-1].split('?')[0]
+
+ class dlsManager:
+     def __init__(self, downloader):
+         self.downloader = downloader
+         self.all_dls = []
+
+     def is_prev_dl(self, data):
+         k2s_link = data.get('k2s')
+         for prev_data in self.all_dls:
+             if prev_data.get('k2s') == k2s_link:
+                 return True
+         self.all_dls.append(data)
+         return False
+
+     def dl_k2s_link(self, k2s_link):
+         if k2s_link:
+             print(f"Downloading: {k2s_link}")
+             self.downloader.download_file(k2s_link)
+             time.sleep(10)
+
+
+ def get_soup(url):
+     try:
+         resp = requests.get(url)
+         resp.raise_for_status()
+         return BeautifulSoup(resp.text, 'html.parser')
+     except Exception as e:
+         print(f"Failed to fetch soup for {url}: {e}")
+         return None
+
+ def get_k2s_link(soup):
+     match = re.search(r'https://k2s\.cc/file/[^"<]+', str(soup))
+     return match.group(0) if match else None
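
The new module ties a `requests` session to a headless Selenium login and adds simple link de-duplication. A minimal usage sketch of the pieces above (the page URL is hypothetical, and the credentials are assumed to be resolvable by `get_env_value` from the environment):

```python
# Usage sketch for the new k2s_downloader module; the page URL below is
# hypothetical, and 'userName'/'passWord' must be resolvable by get_env_value.
from abstract_webtools.k2s_downloader import (
    K2SDownloader, dlsManager, get_soup, get_k2s_link
)

downloader = K2SDownloader()   # starts headless Chrome; login happens lazily
dls = dlsManager(downloader)   # remembers which k2s links were already fetched

soup = get_soup("https://example.com/some-post")
if soup is not None:
    k2s_link = get_k2s_link(soup)  # first https://k2s.cc/file/... match, or None
    if k2s_link and not dls.is_prev_dl({"k2s": k2s_link}):
        dls.dl_k2s_link(k2s_link)  # calls login() on the first download
```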
abstract_webtools-0.1.6.97.dist-info/METADATA (new file)
@@ -0,0 +1,196 @@
+ Metadata-Version: 2.4
+ Name: abstract_webtools
+ Version: 0.1.6.97
+ Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
+ Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
+ Author: putkoff
+ Author-email: partners@abstractendeavors.com
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ Requires-Dist: abstract_utilities>=0.2.2.30
+ Requires-Dist: PySimpleGUI>=4.60.5
+ Requires-Dist: urllib3>=2.0.4
+ Requires-Dist: requests>=2.31.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Unknown Package (vUnknown Version)
+
+ No description available
+
+ ## Installation
+
+ ```bash
+ pip install Unknown Package
+ ```
+
+ ## Dependencies
+
+ None
+
+ ## Modules
+
+ ### src/abstract_webtools/url_grabber.py
+
+ Description of script based on prompt: You are analyzing a Python script 'url_grabber.py' (mock response)
+
+ ### src/abstract_webtools/managers/__init__.py
+
+ Description of script based on prompt: You are analyzing a Python script '__init__.py' lo (mock response)
+
+ ### src/abstract_webtools/managers/cipherManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'cipherManager.p (mock response)
+
+ ### src/abstract_webtools/managers/linkManager/__init__.py
+
+ Description of script based on prompt: You are analyzing a Python script '__init__.py' lo (mock response)
+
+ ### src/abstract_webtools/managers/videoDownloader.py
+
+ Description of script based on prompt: You are analyzing a Python script 'videoDownloader (mock response)
+
+ ### src/abstract_webtools/managers/tlsAdapter.py
+
+ Description of script based on prompt: You are analyzing a Python script 'tlsAdapter.py' (mock response)
+
+ ### src/abstract_webtools/managers/soupManager/soupManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'soupManager.py' (mock response)
+
+ ### src/abstract_webtools/managers/urlManager/__init__.py
+
+ Description of script based on prompt: You are analyzing a Python script '__init__.py' lo (mock response)
+
+ ### src/abstract_webtools/url_grabber_new.py
+
+ Description of script based on prompt: You are analyzing a Python script 'url_grabber_new (mock response)
+
+ ### src/abstract_webtools/managers/requestManager/__init__.py
+
+ Description of script based on prompt: You are analyzing a Python script '__init__.py' lo (mock response)
+
+ ### src/abstract_webtools/managers/get_test.py
+
+ Description of script based on prompt: You are analyzing a Python script 'get_test.py' lo (mock response)
+
+ ### src/abstract_webtools/__init__.py
+
+ Description of script based on prompt: You are analyzing a Python script '__init__.py' lo (mock response)
+
+ ### src/abstract_webtools/managers/seleniumManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'seleniumManager (mock response)
+
+ ### src/abstract_webtools/managers/crawlManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'crawlManager.py (mock response)
+
+ ### src/abstract_webtools/managers/requestManager/requestManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'requestManager. (mock response)
+
+ ### src/abstract_webtools/managers/videoDownloader2.py
+
+ Description of script based on prompt: You are analyzing a Python script 'videoDownloader (mock response)
+
+ ### src/abstract_webtools/managers/soupManager/__init__.py
+
+ Description of script based on prompt: You are analyzing a Python script '__init__.py' lo (mock response)
+
+ ### src/abstract_webtools/main.py
+
+ Description of script based on prompt: You are analyzing a Python script 'main.py' locate (mock response)
+
+ ### src/testit.py
+
+ Description of script based on prompt: You are analyzing a Python script 'testit.py' loca (mock response)
+
+ ### src/abstract_webtools/soup_gui.py
+
+ Description of script based on prompt: You are analyzing a Python script 'soup_gui.py' lo (mock response)
+
+ ### src/abstract_webtools/managers/curlMgr.py
+
+ Description of script based on prompt: You are analyzing a Python script 'curlMgr.py' loc (mock response)
+
+ ### src/abstract_webtools/managers/sslManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'sslManager.py' (mock response)
+
+ ### src/abstract_webtools/managers/dynamicRateLimiter.py
+
+ Description of script based on prompt: You are analyzing a Python script 'dynamicRateLimi (mock response)
+
+ ### src/abstract_webtools/managers/mySocketClient.py
+
+ Description of script based on prompt: You are analyzing a Python script 'mySocketClient. (mock response)
+
+ ### src/abstract_webtools/managers/linkManager/linkManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'linkManager.py' (mock response)
+
+ ### src/abstract_webtools/managers/userAgentManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'userAgentManage (mock response)
+
+ ### src/abstract_webtools/extention_list.py
+
+ Description of script based on prompt: You are analyzing a Python script 'extention_list. (mock response)
+
+ ### src/abstract_webtools/managers/networkManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'networkManager. (mock response)
+
+ ### src/abstract_webtools/managers/domainManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'domainManager.p (mock response)
+
+ ### src/abstract_webtools/managers/urlManager/urlManager.py
+
+ Description of script based on prompt: You are analyzing a Python script 'urlManager.py' (mock response)
+
+ ### src/abstract_webtools/big_user_agent_list.py
+
+ Description of script based on prompt: You are analyzing a Python script 'big_user_agent_ (mock response)
+
+ ### src/abstract_webtools/abstract_usurpit.py
+
+ Description of script based on prompt: You are analyzing a Python script 'abstract_usurpi (mock response)
+
+ ### src/abstract_webtools/managers/soupManager/asoueces.py
+
+ Description of script based on prompt: You are analyzing a Python script 'asoueces.py' lo (mock response)
+
+ ### src/abstract_webtools/managers/crawlmgr2.py
+
+ Description of script based on prompt: You are analyzing a Python script 'crawlmgr2.py' l (mock response)
+
+ ### src/abstract_webtools/abstract_webtools.py
+
+ Description of script based on prompt: You are analyzing a Python script 'abstract_webtoo (mock response)
+
+ ### src/abstract_webtools/find_dirs.py
+
+ Description of script based on prompt: You are analyzing a Python script 'find_dirs.py' l (mock response)
+
+ ### src/abstract_webtools/domain_identifier.py
+
+ Description of script based on prompt: You are analyzing a Python script 'domain_identifi (mock response)
+
+ ### src/abstract_webtools/managers/allss\.py
+
+ Description of script based on prompt: You are analyzing a Python script 'allss\.py' loca (mock response)
+
abstract_webtools-0.1.6.97.dist-info/RECORD
@@ -6,6 +6,7 @@ abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSj
  abstract_webtools/domain_identifier.py,sha256=AvWlGD7C19rySa_J_Brxi3kz43LMWvGsshuuZNg7MvI,3320
  abstract_webtools/extention_list.py,sha256=gRSO4nMbuuXDYzd-ss4s64sS80ZHmUoazMCpgoKG5vE,4884
  abstract_webtools/find_dirs.py,sha256=BlE4ruzMABqmv03NcutZ1j5N3pCc-Q4uNEAMpNolZCQ,2609
+ abstract_webtools/k2s_downloader.py,sha256=gju2y5nF-hbyU-z5-cnb3_YZ8YrOZDGts3LhMhdAaTU,4592
  abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
  abstract_webtools/soup_gui.py,sha256=n95YAps1R6DpMwR4UbthSqQby0C5WHUa9tsW-f2qpLg,5184
  abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE70B8,10441
@@ -41,7 +42,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
  abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
  abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
  abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
- abstract_webtools-0.1.6.95.dist-info/METADATA,sha256=U51Gqn0eYzH7i9l8R7ULbKZiycrUNkq6prTyT5RXtQo,16029
- abstract_webtools-0.1.6.95.dist-info/WHEEL,sha256=ck4Vq1_RXyvS4Jt6SI0Vz6fyVs4GWg7AINwpsaGEgPE,91
- abstract_webtools-0.1.6.95.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
- abstract_webtools-0.1.6.95.dist-info/RECORD,,
+ abstract_webtools-0.1.6.97.dist-info/METADATA,sha256=gLBSYLiRZdHP65FYPDVjiXwhwan7TgVjRxtdrHoVncM,7288
+ abstract_webtools-0.1.6.97.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+ abstract_webtools-0.1.6.97.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+ abstract_webtools-0.1.6.97.dist-info/RECORD,,
abstract_webtools-0.1.6.97.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.0.0)
+ Generator: setuptools (80.3.1)
  Root-Is-Purelib: true
  Tag: py3-none-any

abstract_webtools-0.1.6.95.dist-info/METADATA (deleted)
@@ -1,455 +0,0 @@
- Metadata-Version: 2.4
- Name: abstract_webtools
- Version: 0.1.6.95
- Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
- Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
- Author: putkoff
- Author-email: partners@abstractendeavors.com
- Classifier: Development Status :: 3 - Alpha
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.11
- Requires-Python: >=3.6
- Description-Content-Type: text/markdown
- Requires-Dist: abstract_utilities>=0.2.2.30
- Requires-Dist: PySimpleGUI>=4.60.5
- Requires-Dist: urllib3>=2.0.4
- Requires-Dist: requests>=2.31.0
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: requires-dist
- Dynamic: requires-python
- Dynamic: summary
-
- # Abstract WebTools
- Provides utilities for inspecting and parsing web content, including React components and URL utilities, with enhanced capabilities for managing HTTP requests and TLS configurations.
-
- - **Features**:
-   - URL Validation: Ensures URL correctness and attempts different URL variations.
-   - HTTP Request Manager: Custom HTTP request handling, including tailored user agents and improved TLS security through a custom adapter.
-   - Source Code Acquisition: Retrieves the source code of specified websites.
-   - React Component Parsing: Extracts JavaScript and JSX source code from web pages.
-   - Comprehensive Link Extraction: Collects all internal links from a specified website.
-   - Web Content Analysis: Extracts and categorizes various web content components such as HTML elements, attribute values, attribute names, and class names.
-
- ### abstract_webtools.py
- **Description:**
- Abstract WebTools offers a suite of utilities designed for web content inspection and parsing. One of its standout features is its ability to analyze URLs, ensuring their validity and automatically attempting different URL variations to obtain correct website access. It boasts a custom HTTP request management system that tailors user-agent strings and employs a specialized TLS adapter for heightened security. The toolkit also provides robust capabilities for extracting source code, including detecting React components on web pages. Additionally, it offers functionalities for extracting all internal website links and performing in-depth web content analysis. This makes Abstract WebTools an indispensable tool for web developers, cybersecurity professionals, and digital analysts.
- ![image](https://github.com/AbstractEndeavors/abstract_essentials/assets/57512254/0451d8ea-996f-4de5-9e6c-92a606aae4ef)
-
- - **Dependencies**:
-   - `requests`
-   - `ssl`
-   - `HTTPAdapter` from `requests.adapters`
-   - `PoolManager` from `urllib3.poolmanager`
-   - `ssl_` from `urllib3.util`
-   - `urlparse`, `urljoin` from `urllib.parse`
-   - `BeautifulSoup` from `bs4`
-
-
- # UrlManager
-
- The `UrlManager` is a Python class designed to handle and manipulate URLs. It provides methods for cleaning and normalizing URLs, determining the correct version of a URL, extracting URL components, and more. This class is particularly useful for web scraping, web crawling, or any application where URL management is essential.
-
- ## Usage
-
- To use the `UrlManager` class, first import it into your Python script:
-
- ```python
- from abstract_webtools import UrlManager
- ```
-
- ### Initializing a UrlManager Object
-
- You can create a `UrlManager` object by providing an initial URL and an optional `requests` session. If no URL is provided, it defaults to 'www.example.com':
-
- ```python
- url_manager = UrlManager(url='https://www.example.com')
- ```
-
- ### URL Cleaning and Normalization
-
- The `clean_url` method takes a URL and returns a list of potential URL variations, including versions with and without 'www.', 'http://', and 'https://':
-
- ```python
- cleaned_urls = url_manager.clean_url()
- ```
-
- ### Getting the Correct URL
-
- The `get_correct_url` method tries each possible URL variation with an HTTP request to determine the correct version of the URL:
-
- ```python
- correct_url = url_manager.get_correct_url()
- ```
-
- ### Updating the URL
-
- You can update the URL associated with the `UrlManager` object using the `update_url` method:
-
- ```python
- url_manager.update_url('https://www.example2.com')
- ```
-
- ### Extracting URL Components
-
- The `url_to_pieces` method extracts various components of the URL, such as protocol, domain name, path, and query:
-
- ```python
- url_manager.url_to_pieces()
- print(url_manager.protocol)
- print(url_manager.domain_name)
- print(url_manager.path)
- print(url_manager.query)
- ```
-
- ### Additional Utility Methods
-
- - `get_domain_name(url)`: Returns the domain name (netloc) of a given URL.
- - `is_valid_url(url)`: Checks if a URL is valid.
- - `make_valid(href, url)`: Ensures a relative or incomplete URL is valid by joining it with a base URL.
- - `get_relative_href(url, href)`: Converts a relative URL to an absolute URL based on a base URL.
-
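A short sketch of the methods just listed (behavior inferred from the descriptions above; the exact return values are assumptions, not verified output):

```python
# Sketch of the utility methods listed above; argument order follows the
# documented signatures, and the expected results are inferred, not verified.
from abstract_webtools import UrlManager

url_manager = UrlManager(url='https://www.example.com')

url_manager.get_domain_name('https://www.example.com/blog?x=1')
# expected: 'www.example.com' (the netloc)
url_manager.is_valid_url('not-a-url')
# expected: False
url_manager.make_valid('/about', 'https://www.example.com')
# expected: 'https://www.example.com/about' (urljoin semantics)
```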
- ## Compatibility Note
-
- The `get_domain` method is kept for compatibility but is inconsistent. Use it only for "webpage_url_domain." Similarly, `url_basename`, `base_url`, and `urljoin` methods are available for URL manipulation.
-
- ## Example
-
- Here's a quick example of using the `UrlManager` class:
-
- ```python
- from abstract_webtools import UrlManager
-
- url_manager = UrlManager(url='https://www.example.com')
- cleaned_urls = url_manager.clean_url()
- correct_url = url_manager.get_correct_url()
- url_manager.update_url('https://www.example2.com')
-
- print(f"Cleaned URLs: {cleaned_urls}")
- print(f"Correct URL: {correct_url}")
- ```
-
- ## Dependencies
-
- The `UrlManager` class relies on the `requests` library for making HTTP requests. Ensure you have the `requests` library installed in your Python environment.
- # SafeRequest
-
- The `SafeRequest` class is a versatile Python utility designed to handle HTTP requests with enhanced safety features. It integrates with other managers like `UrlManager`, `NetworkManager`, and `UserAgentManager` to manage various aspects of the request, such as user-agent, SSL/TLS settings, proxies, headers, and more.
-
- ## Usage
-
- To use the `SafeRequest` class, first import it into your Python script:
-
- ```python
- from abstract_webtools import SafeRequest
- ```
-
- ### Initializing a SafeRequest Object
-
- You can create a `SafeRequest` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
-
- ```python
- safe_request = SafeRequest(url='https://www.example.com')
- ```
-
- ### Updating URL and UrlManager
-
- You can update the URL associated with the `SafeRequest` object using the `update_url` method, which also updates the underlying `UrlManager`:
-
- ```python
- safe_request.update_url('https://www.example2.com')
- ```
-
- You can also update the `UrlManager` directly:
-
- ```python
- from abstract_webtools import UrlManager
-
- url_manager = UrlManager(url='https://www.example3.com')
- safe_request.update_url_manager(url_manager)
- ```
-
- ### Making HTTP Requests
-
- The `SafeRequest` class makes HTTP requests through the `try_request` method, which handles retries, timeouts, and rate limiting:
-
- ```python
- response = safe_request.try_request()
- if response:
-     ...  # process the response here
- ```
-
- ### Accessing Response Data
-
- You can access the response data in various formats:
-
- - `safe_request.source_code`: HTML source code as a string.
- - `safe_request.source_code_bytes`: HTML source code as bytes.
- - `safe_request.source_code_json`: JSON data from the response (if the content type is JSON).
- - `safe_request.react_source_code`: JavaScript and JSX source code extracted from `<script>` tags.
-
- ### Customizing Request Configuration
-
- The `SafeRequest` class provides several options for customizing the request, such as headers, user-agent, proxies, SSL/TLS settings, and more. These can be set during initialization or updated later.
-
- ### Handling Rate Limiting
-
- The class can handle rate limiting scenarios by implementing rate limiters and waiting between requests.
-
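The underlying idea reduces to a few lines; a standalone sketch of sleep-based spacing between requests (not `SafeRequest`'s actual implementation, which is internal and configurable):

```python
# Standalone illustration of waiting between requests; SafeRequest's own
# rate limiter is internal and configurable, this only shows the pattern.
import time
import requests

MIN_INTERVAL = 1.0  # seconds to leave between consecutive requests
_last_request = 0.0

def rate_limited_get(url, **kwargs):
    global _last_request
    remaining = MIN_INTERVAL - (time.monotonic() - _last_request)
    if remaining > 0:
        time.sleep(remaining)  # wait out the rest of the interval
    _last_request = time.monotonic()
    return requests.get(url, **kwargs)
```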
- ### Error Handling
-
- The `SafeRequest` class handles various request-related exceptions and provides error messages for easier debugging.
-
- ## Dependencies
-
- The `SafeRequest` class relies on the `requests` library for making HTTP requests. Ensure you have the `requests` library installed in your Python environment:
-
- ```bash
- pip install requests
- ```
-
- ## Example
-
- Here's a quick example of using the `SafeRequest` class:
-
- ```python
- from abstract_webtools import SafeRequest
-
- safe_request = SafeRequest(url='https://www.example.com')
- response = safe_request.try_request()
- if response:
-     print(f"Response status code: {response.status_code}")
-     print(f"HTML source code: {safe_request.source_code}")
- ```
-
- # SoupManager
-
- The `SoupManager` class is a Python utility designed to simplify web scraping by providing easy access to the BeautifulSoup library. It allows you to parse and manipulate HTML or XML source code from a URL or provided source code.
-
- ## Usage
-
- To use the `SoupManager` class, first import it into your Python script:
-
- ```python
- from abstract_webtools import SoupManager
- ```
-
- ### Initializing a SoupManager Object
-
- You can create a `SoupManager` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
-
- ```python
- soup_manager = SoupManager(url='https://www.example.com')
- ```
-
- ### Updating URL and Request Manager
-
- You can update the URL associated with the `SoupManager` object using the `update_url` method, which also updates the underlying `UrlManager` and `SafeRequest`:
-
- ```python
- soup_manager.update_url('https://www.example2.com')
- ```
-
- You can also update the source code directly:
-
- ```python
- source_code = '<html>...</html>'
- soup_manager.update_source_code(source_code)
- ```
-
- ### Accessing and Parsing HTML
-
- The `SoupManager` class provides easy access to the BeautifulSoup object, allowing you to search, extract, and manipulate HTML elements easily. You can use methods like `find_all`, `get_class`, `has_attributes`, and more to work with the HTML content.
-
- ```python
- elements = soup_manager.find_all(tag='a')
- ```
-
- ### Extracting Links
-
- The class also includes methods for extracting all website links from the HTML source code:
-
- ```python
- all_links = soup_manager.all_links
- ```
-
- ### Extracting Meta Tags
-
- You can extract meta tags from the HTML source code using the `meta_tags` property:
-
- ```python
- meta_tags = soup_manager.meta_tags
- ```
-
- ### Customizing Parsing
-
- You can customize the parsing behavior by specifying the parser type during initialization or updating it:
-
- ```python
- soup_manager.update_parse_type('lxml')
- ```
-
- ## Dependencies
-
- The `SoupManager` class relies on the `BeautifulSoup` library for parsing HTML or XML. Ensure you have the `beautifulsoup4` library installed in your Python environment:
-
- ```bash
- pip install beautifulsoup4
- ```
-
- ## Example
-
- Here's a quick example of using the `SoupManager` class:
-
- ```python
- from abstract_webtools import SoupManager
-
- soup_manager = SoupManager(url='https://www.example.com')
- all_links = soup_manager.all_links
- print(f"All Links: {all_links}")
- ```
- # LinkManager
-
- The `LinkManager` class is a Python utility designed to simplify the extraction and management of links (URLs) and associated data from HTML source code. It leverages other classes like `UrlManager`, `SafeRequest`, and `SoupManager` to facilitate link extraction and manipulation.
-
- ## Usage
-
- To use the `LinkManager` class, first import it into your Python script:
-
- ```python
- from abstract_webtools import LinkManager
- ```
-
- ### Initializing a LinkManager Object
-
- You can create a `LinkManager` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
-
- ```python
- link_manager = LinkManager(url='https://www.example.com')
- ```
-
- ### Updating URL and Request Manager
-
- You can update the URL associated with the `LinkManager` object using the `update_url` method, which also updates the underlying `UrlManager`, `SafeRequest`, and `SoupManager`:
-
- ```python
- link_manager.update_url('https://www.example2.com')
- ```
-
- ### Accessing Extracted Links
-
- The `LinkManager` class provides easy access to extracted links and associated data:
-
- ```python
- all_links = link_manager.all_desired_links
- ```
-
- ### Customizing Link Extraction
-
- You can customize the link extraction behavior by specifying various parameters during initialization or updating them:
-
- ```python
- link_manager.update_desired(
-     img_attr_value_desired=['thumbnail', 'image'],
-     img_attr_value_undesired=['icon'],
-     link_attr_value_desired=['blog', 'article'],
-     link_attr_value_undesired=['archive'],
-     image_link_tags='img',
-     img_link_attrs='src',
-     link_tags='a',
-     link_attrs='href',
-     strict_order_tags=True,
-     associated_data_attr=['data-title', 'alt', 'title'],
-     get_img=['data-title', 'alt', 'title']
- )
- ```
-
- ## Dependencies
-
- The `LinkManager` class relies on other classes within the `abstract_webtools` module, such as `UrlManager`, `SafeRequest`, and `SoupManager`. Ensure you have these classes and their dependencies correctly set up in your Python environment.
-
- ## Example
-
- Here's a quick example of using the `LinkManager` class:
-
- ```python
- from abstract_webtools import LinkManager
-
- link_manager = LinkManager(url='https://www.example.com')
- all_links = link_manager.all_desired_links
- print(f"All Links: {all_links}")
- ```
- ## Overall Use Cases
- ```python
- from abstract_webtools import UrlManager, SafeRequest, SoupManager, LinkManager, VideoDownloader
-
- # --- UrlManager: Manages and manipulates URLs for web scraping/crawling ---
- url = "example.com"
- url_manager = UrlManager(url=url)
-
- # --- SafeRequest: Safely handles HTTP requests by managing user-agent, SSL/TLS, proxies, headers, etc. ---
- request_manager = SafeRequest(
-     url_manager=url_manager,
-     proxies={'http': '8.219.195.47', 'https': '8.219.197.111'},
-     timeout=(3.05, 70)
- )
-
- # --- SoupManager: Simplifies web scraping with easy access to BeautifulSoup ---
- soup_manager = SoupManager(
-     url_manager=url_manager,
-     request_manager=request_manager
- )
-
- # --- LinkManager: Extracts and manages links and associated data from HTML source code ---
- link_manager = LinkManager(
-     url_manager=url_manager,
-     soup_manager=soup_manager,
-     link_attr_value_desired=['/view_video.php?viewkey='],
-     link_attr_value_undesired=['phantomjs']
- )
-
- # Download videos from provided links (list or string)
- video_manager = VideoDownloader(link=link_manager.all_desired_links).download()
-
- # Use them individually, with default dependencies for basic inputs:
- standalone_soup = SoupManager(url=url).soup
- standalone_links = LinkManager(url=url).all_desired_links
-
- # Updating methods for manager classes
- url_1 = 'thedailydialectics.com'
- print(f"updating URL to {url_1}")
- url_manager.update_url(url=url_1)
- request_manager.update_url(url=url_1)
- soup_manager.update_url(url=url_1)
- link_manager.update_url(url=url_1)
-
- # Updating URL manager references
- request_manager.update_url_manager(url_manager=url_manager)
- soup_manager.update_url_manager(url_manager=url_manager)
- link_manager.update_url_manager(url_manager=url_manager)
-
- # Updating source code for managers
- source_code_bytes = request_manager.source_code_bytes
- soup_manager.update_source_code(source_code=source_code_bytes)
- link_manager.update_source_code(source_code=source_code_bytes)
- ```
- ## License
-
- This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
- #### Module Information
- - **Author**: putkoff
- - **Author Email**: partners@abstractendeavors.com
- - **Github**: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
- - **PYPI**: https://pypi.org/project/abstract-webtools
- - **Part of**: abstract_essentials
- - **Date**: 10/10/2023
- - **Version**: 0.1.4.54
- ---
-