abstract-webtools 0.1.4.55__py3-none-any.whl → 0.1.4.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/abstract_webtools.py +16 -16
- abstract_webtools-0.1.4.56.dist-info/METADATA +448 -0
- {abstract_webtools-0.1.4.55.dist-info → abstract_webtools-0.1.4.56.dist-info}/RECORD +6 -6
- abstract_webtools-0.1.4.55.dist-info/METADATA +0 -329
- {abstract_webtools-0.1.4.55.dist-info → abstract_webtools-0.1.4.56.dist-info}/LICENSE +0 -0
- {abstract_webtools-0.1.4.55.dist-info → abstract_webtools-0.1.4.56.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.4.55.dist-info → abstract_webtools-0.1.4.56.dist-info}/top_level.txt +0 -0
abstract_webtools/abstract_webtools.py

@@ -388,7 +388,7 @@ class MySocketClient():
     def get_instance(ip_address='local_host',port=22,domain_name="example.com"):
         if MySocketClientSingleton._instance is None:
             MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain_name=domain_name)
-        elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or
+        elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or UrlManagerSingleton._instance.domain_name != domain_name:
             MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain_name=domain_name)
         return MySocketClient
 def safe_json_loads(data):
@@ -402,13 +402,13 @@ def convert_to_json(obj):
     if isinstance(obj, str):
         return safe_json_loads(obj)
     return None
-class
+class UrlManager:
     def __init__(self, url=None, session=requests):
         if url==None:
            url='www.example.com'
         self.url = url
         self.session = session
-        # These methods seem essential for setting up the
+        # These methods seem essential for setting up the UrlManager object.
         self.clean_urls = self.clean_url()
         self.correct_url = self.get_correct_url()
         self.url_to_pieces()
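As the restored `__init__` shows, constructing a `UrlManager` immediately runs `clean_url()` and `get_correct_url()`, so the URL variations and the resolved URL are available right after construction. A minimal sketch of that behaviour (illustrative only; it simply exercises the attributes set in the `__init__` above):

```python
from abstract_webtools import UrlManager

# __init__ populates these eagerly via clean_url() and get_correct_url().
url_manager = UrlManager(url='example.com')   # scheme deliberately omitted
print(url_manager.clean_urls)    # candidate variations (http/https, with/without www.)
print(url_manager.correct_url)   # the first variation that answered an HTTP request
```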
@@ -482,7 +482,7 @@ class URLManager:
             print(e)
             return None
     def update_url(self,url):
-        # These methods seem essential for setting up the
+        # These methods seem essential for setting up the UrlManager object.
         self.url = url
         self.clean_urls = self.clean_url()
         self.correct_url = self.get_correct_url()
@@ -556,15 +556,15 @@ class URLManager:
                 r'^(?:https?:)?//', base):
             return None
         return urllib.parse.urljoin(base, path)
-class
+class UrlManagerSingleton:
     _instance = None
     @staticmethod
     def get_instance(url=None,session=requests):
-        if
-
-        elif
-
-        return
+        if UrlManagerSingleton._instance is None:
+            UrlManagerSingleton._instance = UrlManager(url,session=session)
+        elif UrlManagerSingleton._instance.session != session or UrlManagerSingleton._instance.url != url:
+            UrlManagerSingleton._instance = UrlManager(url,session=session)
+        return UrlManagerSingleton._instance
 class SafeRequest:
     def __init__(self,
                  url=None,
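The restored `UrlManagerSingleton.get_instance` caches one `UrlManager` and only rebuilds it when the requested `url` or `session` differs. A short illustration of that behaviour (a sketch, assuming the singleton class is importable from the package like the manager classes are):

```python
from abstract_webtools import UrlManagerSingleton

first = UrlManagerSingleton.get_instance(url='https://www.example.com')
second = UrlManagerSingleton.get_instance(url='https://www.example.com')
assert first is second            # same url and session: the cached instance is reused

third = UrlManagerSingleton.get_instance(url='https://www.example2.com')
assert third is not second        # a different url forces get_instance to rebuild
```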
@@ -590,7 +590,7 @@ class SafeRequest:
                  max_retries=None,
                  request_wait_limit=None):
         if url_manager == None:
-            url_manager =
+            url_manager = UrlManager(url=url)
         self.url_manager=url_manager
         if network_manager == None:
             network_manager=NetworkManager(user_agent_manager=user_agent_manager,ssl_manager=ssl_manager, tls_adapter=tls_adapter,user_agent=user_agent,proxies=proxies,auth=auth,cookies=cookies,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
@@ -798,17 +798,17 @@ class SafeRequestSingleton:
     @staticmethod
     def get_instance(url=None,headers:dict=None,max_retries=3,last_request_time=None,request_wait_limit=1.5):
         if SafeRequestSingleton._instance is None:
-            SafeRequestSingleton._instance = SafeRequest(url,url_manager=
+            SafeRequestSingleton._instance = SafeRequest(url,url_manager=UrlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
         elif SafeRequestSingleton._instance.url != url or SafeRequestSingleton._instance.headers != headers or SafeRequestSingleton._instance.max_retries != max_retries or SafeRequestSingleton._instance.request_wait_limit != request_wait_limit:
-            SafeRequestSingleton._instance = SafeRequest(url,url_manager=
+            SafeRequestSingleton._instance = SafeRequest(url,url_manager=UrlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
         return SafeRequestSingleton._instance
 class SoupManager:
     def __init__(self,url=None,source_code=None,url_manager=None,request_manager=None, parse_type="html.parser"):
         self.soup=[]
         self.url=url
         if url_manager == None:
-            url_manager=
-        if self.url != None and url_manager != None and url_manager.url !=
+            url_manager=UrlManager(url=self.url)
+        if self.url != None and url_manager != None and url_manager.url != UrlManager(url=url).url:
             url_manager.update_url(url=self.url)
         self.url_manager= url_manager
         self.url=self.url_manager.url
@@ -1228,7 +1228,7 @@ class LinkManager:
     def __init__(self,url="https://example.com",source_code=None,url_manager=None,request_manager=None,soup_manager=None,image_link_tags='img',img_link_attrs='src',link_tags='a',link_attrs='href',strict_order_tags=False,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title']):
         self.url=url
         if url_manager==None:
-            url_manager=
+            url_manager=UrlManager(url=url)
         self.url_manager= url_manager
         self.url=self.url_manager.url
         if request_manager==None:
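Taken together, the hunks above restore the same fallback in `SafeRequest`, `SoupManager`, and `LinkManager`: when no `url_manager` is passed, each constructor builds its own `UrlManager` from the bare `url`, so the managers work standalone or with an explicitly shared `UrlManager`. A sketch of both styles (illustrative; it mirrors the usage section of the new METADATA below):

```python
from abstract_webtools import UrlManager, SafeRequest, SoupManager, LinkManager

# Standalone: each manager silently builds its own UrlManager from the url.
standalone_links = LinkManager(url='https://www.example.com').all_desired_links

# Shared: one UrlManager is passed explicitly, so every manager resolves the same URL.
url_manager = UrlManager(url='https://www.example.com')
request_manager = SafeRequest(url_manager=url_manager)
soup_manager = SoupManager(url_manager=url_manager, request_manager=request_manager)
link_manager = LinkManager(url_manager=url_manager, soup_manager=soup_manager)
```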
abstract_webtools-0.1.4.56.dist-info/METADATA (new file)

@@ -0,0 +1,448 @@
+Metadata-Version: 2.1
+Name: abstract-webtools
+Version: 0.1.4.56
+Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
+Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
+Author: putkoff
+Author-email: partners@abstractendeavors.com
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Requires-Python: >=3.6
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: abstract-webtools (>=0.1.0)
+Requires-Dist: abstract-utilities (>=0.2.0.0)
+Requires-Dist: PySimpleGUI (>=4.60.5)
+Requires-Dist: urllib3 (>=2.0.4)
+Requires-Dist: requests (>=2.31.0)
+
+# Abstract WebTools
+Provides utilities for inspecting and parsing web content, including React components and URL utilities, with enhanced capabilities for managing HTTP requests and TLS configurations.
+
+- **Features**:
+- URL Validation: Ensures URL correctness and attempts different URL variations.
+- HTTP Request Manager: Custom HTTP request handling, including tailored user agents and improved TLS security through a custom adapter.
+- Source Code Acquisition: Retrieves the source code of specified websites.
+- React Component Parsing: Extracts JavaScript and JSX source code from web pages.
+- Comprehensive Link Extraction: Collects all internal links from a specified website.
+- Web Content Analysis: Extracts and categorizes various web content components such as HTML elements, attribute values, attribute names, and class names.
+
+### abstract_webtools.py
+**Description:**
+Abstract WebTools offers a suite of utilities designed for web content inspection and parsing. One of its standout features is its ability to analyze URLs, ensuring their validity and automatically attempting different URL variations to obtain correct website access. It boasts a custom HTTP request management system that tailors user-agent strings and employs a specialized TLS adapter for heightened security. The toolkit also provides robust capabilities for extracting source code, including detecting React components on web pages. Additionally, it offers functionalities for extracting all internal website links and performing in-depth web content analysis. This makes Abstract WebTools an indispensable tool for web developers, cybersecurity professionals, and digital analysts.
+
+
+- **Dependencies**:
+- `requests`
+- `ssl`
+- `HTTPAdapter` from `requests.adapters`
+- `PoolManager` from `urllib3.poolmanager`
+- `ssl_` from `urllib3.util`
+- `urlparse`, `urljoin` from `urllib.parse`
+- `BeautifulSoup` from `bs4`
+
+
+# UrlManager
+
+The `UrlManager` is a Python class designed to handle and manipulate URLs. It provides methods for cleaning and normalizing URLs, determining the correct version of a URL, extracting URL components, and more. This class is particularly useful for web scraping, web crawling, or any application where URL management is essential.
+
+## Usage
+
+To use the `UrlManager` class, first import it into your Python script:
+
+```python
+from abstract_webtools import UrlManager
+```
+
+### Initializing a UrlManager Object
+
+You can create a `UrlManager` object by providing an initial URL and an optional `requests` session. If no URL is provided, it defaults to 'www.example.com':
+
+```python
+url_manager = UrlManager(url='https://www.example.com')
+```
+
+### URL Cleaning and Normalization
+
+The `clean_url` method takes a URL and returns a list of potential URL variations, including versions with and without 'www.', 'http://', and 'https://':
+
+```python
+cleaned_urls = url_manager.clean_url()
+```
+
+### Getting the Correct URL
+
+The `get_correct_url` method tries each possible URL variation with an HTTP request to determine the correct version of the URL:
+
+```python
+correct_url = url_manager.get_correct_url()
+```
+
+### Updating the URL
+
+You can update the URL associated with the `UrlManager` object using the `update_url` method:
+
+```python
+url_manager.update_url('https://www.example2.com')
+```
+
+### Extracting URL Components
+
+The `url_to_pieces` method extracts various components of the URL, such as protocol, domain name, path, and query:
+
+```python
+url_manager.url_to_pieces()
+print(url_manager.protocol)
+print(url_manager.domain_name)
+print(url_manager.path)
+print(url_manager.query)
+```
+
+### Additional Utility Methods
+
+- `get_domain_name(url)`: Returns the domain name (netloc) of a given URL.
+- `is_valid_url(url)`: Checks if a URL is valid.
+- `make_valid(href, url)`: Ensures a relative or incomplete URL is valid by joining it with a base URL.
+- `get_relative_href(url, href)`: Converts a relative URL to an absolute URL based on a base URL.
+
+## Compatibility Note
+
+The `get_domain` method is kept for compatibility but is inconsistent. Use it only for "webpage_url_domain." Similarly, `url_basename`, `base_url`, and `urljoin` methods are available for URL manipulation.
+
+## Example
+
+Here's a quick example of using the `UrlManager` class:
+
+```python
+from abstract_webtools import UrlManager
+
+url_manager = UrlManager(url='https://www.example.com')
+cleaned_urls = url_manager.clean_url()
+correct_url = url_manager.get_correct_url()
+url_manager.update_url('https://www.example2.com')
+
+print(f"Cleaned URLs: {cleaned_urls}")
+print(f"Correct URL: {correct_url}")
+```
+
+## Dependencies
+
+The `UrlManager` class relies on the `requests` library for making HTTP requests. Ensure you have the `requests` library installed in your Python environment.
+# SafeRequest
+
+The `SafeRequest` class is a versatile Python utility designed to handle HTTP requests with enhanced safety features. It integrates with other managers like `URLManager`, `NetworkManager`, and `UserAgentManager` to manage various aspects of the request, such as user-agent, SSL/TLS settings, proxies, headers, and more.
+
+## Usage
+
+To use the `SafeRequest` class, first import it into your Python script:
+
+```python
+from abstract_webtools import SafeRequest
+```
+
+### Initializing a SafeRequest Object
+
+You can create a `SafeRequest` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
+
+```python
+safe_request = SafeRequest(url='https://www.example.com')
+```
+
+### Updating URL and URLManager
+
+You can update the URL associated with the `SafeRequest` object using the `update_url` method, which also updates the underlying `URLManager`:
+
+```python
+safe_request.update_url('https://www.example2.com')
+```
+
+You can also update the `URLManager` directly:
+
+```python
+from url_manager import URLManager
+
+url_manager = URLManager(url='https://www.example3.com')
+safe_request.update_url_manager(url_manager)
+```
+
+### Making HTTP Requests
+
+The `SafeRequest` class handles making HTTP requests using the `try_request` method. It handles retries, timeouts, and rate limiting:
+
+```python
+response = safe_request.try_request()
+if response:
+    # Process the response here
+```
+
+### Accessing Response Data
+
+You can access the response data in various formats:
+
+- `safe_request.source_code`: HTML source code as a string.
+- `safe_request.source_code_bytes`: HTML source code as bytes.
+- `safe_request.source_code_json`: JSON data from the response (if the content type is JSON).
+- `safe_request.react_source_code`: JavaScript and JSX source code extracted from `<script>` tags.
+
+### Customizing Request Configuration
+
+The `SafeRequest` class provides several options for customizing the request, such as headers, user-agent, proxies, SSL/TLS settings, and more. These can be set during initialization or updated later.
+
+### Handling Rate Limiting
+
+The class can handle rate limiting scenarios by implementing rate limiters and waiting between requests.
+
+### Error Handling
+
+The `SafeRequest` class handles various request-related exceptions and provides error messages for easier debugging.
+
+## Dependencies
+
+The `SafeRequest` class relies on the `requests` library for making HTTP requests. Ensure you have the `requests` library installed in your Python environment:
+
+```bash
+pip install requests
+```
+
+## Example
+
+Here's a quick example of using the `SafeRequest` class:
+
+```python
+from abstract_webtools import SafeRequest
+
+safe_request = SafeRequest(url='https://www.example.com')
+response = safe_request.try_request()
+if response:
+    print(f"Response status code: {response.status_code}")
+    print(f"HTML source code: {safe_request.source_code}")
+```
+
+# SoupManager
+
+The `SoupManager` class is a Python utility designed to simplify web scraping by providing easy access to the BeautifulSoup library. It allows you to parse and manipulate HTML or XML source code from a URL or provided source code.
+
+## Usage
+
+To use the `SoupManager` class, first import it into your Python script:
+
+```python
+from abstract_webtools import SoupManager
+```
+
+### Initializing a SoupManager Object
+
+You can create a `SoupManager` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
+
+```python
+soup_manager = SoupManager(url='https://www.example.com')
+```
+
+### Updating URL and Request Manager
+
+You can update the URL associated with the `SoupManager` object using the `update_url` method, which also updates the underlying `URLManager` and `SafeRequest`:
+
+```python
+soup_manager.update_url('https://www.example2.com')
+```
+
+You can also update the source code directly:
+
+```python
+source_code = '<html>...</html>'
+soup_manager.update_source_code(source_code)
+```
+
+### Accessing and Parsing HTML
+
+The `SoupManager` class provides easy access to the BeautifulSoup object, allowing you to search, extract, and manipulate HTML elements easily. You can use methods like `find_all`, `get_class`, `has_attributes`, and more to work with the HTML content.
+
+```python
+elements = soup_manager.find_all(tag='a')
+```
+
+### Extracting Links
+
+The class also includes methods for extracting all website links from the HTML source code:
+
+```python
+all_links = soup_manager.all_links
+```
+
+### Extracting Meta Tags
+
+You can extract meta tags from the HTML source code using the `meta_tags` property:
+
+```python
+meta_tags = soup_manager.meta_tags
+```
+
+### Customizing Parsing
+
+You can customize the parsing behavior by specifying the parser type during initialization or updating it:
+
+```python
+soup_manager.update_parse_type('lxml')
+```
+
+## Dependencies
+
+The `SoupManager` class relies on the `BeautifulSoup` library for parsing HTML or XML. Ensure you have the `beautifulsoup4` library installed in your Python environment:
+
+```bash
+pip install beautifulsoup4
+```
+
+## Example
+
+Here's a quick example of using the `SoupManager` class:
+
+```python
+from abstract_webtools import SoupManager
+
+soup_manager = SoupManager(url='https://www.example.com')
+all_links = soup_manager.all_links
+print(f"All Links: {all_links}")
+```
+# LinkManager
+
+The `LinkManager` class is a Python utility designed to simplify the extraction and management of links (URLs) and associated data from HTML source code. It leverages other classes like `URLManager`, `SafeRequest`, and `SoupManager` to facilitate link extraction and manipulation.
+
+## Usage
+
+To use the `LinkManager` class, first import it into your Python script:
+
+```python
+from abstract_webtools import LinkManager
+```
+
+### Initializing a LinkManager Object
+
+You can create a `LinkManager` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
+
+```python
+link_manager = LinkManager(url='https://www.example.com')
+```
+
+### Updating URL and Request Manager
+
+You can update the URL associated with the `LinkManager` object using the `update_url` method, which also updates the underlying `URLManager`, `SafeRequest`, and `SoupManager`:
+
+```python
+link_manager.update_url('https://www.example2.com')
+```
+
+### Accessing Extracted Links
+
+The `LinkManager` class provides easy access to extracted links and associated data:
+
+```python
+all_links = link_manager.all_desired_links
+```
+
+### Customizing Link Extraction
+
+You can customize the link extraction behavior by specifying various parameters during initialization or updating them:
+
+```python
+link_manager.update_desired(
+    img_attr_value_desired=['thumbnail', 'image'],
+    img_attr_value_undesired=['icon'],
+    link_attr_value_desired=['blog', 'article'],
+    link_attr_value_undesired=['archive'],
+    image_link_tags='img',
+    img_link_attrs='src',
+    link_tags='a',
+    link_attrs='href',
+    strict_order_tags=True,
+    associated_data_attr=['data-title', 'alt', 'title'],
+    get_img=['data-title', 'alt', 'title']
+)
+```
+
+## Dependencies
+
+The `LinkManager` class relies on other classes within the `abstract_webtools` module, such as `URLManager`, `SafeRequest`, and `SoupManager`. Ensure you have these classes and their dependencies correctly set up in your Python environment.
+
+## Example
+
+Here's a quick example of using the `LinkManager` class:
+
+```python
+from abstract_webtools import LinkManager
+
+link_manager = LinkManager(url='https://www.example.com')
+all_links = link_manager.all_desired_links
+print(f"All Links: {all_links}")
+```
+## Overall Usecases
+```python
+from abstract_webtools import URLManager, SafeRequest, SoupManager, LinkManager, VideoDownloader
+
+# --- URLManager: Manages and manipulates URLs for web scraping/crawling ---
+url = "example.com"
+url_manager = URLManager(url=url)
+
+# --- SafeRequest: Safely handles HTTP requests by managing user-agent, SSL/TLS, proxies, headers, etc. ---
+request_manager = SafeRequest(
+    url_manager=url_manager,
+    proxies={'8.219.195.47', '8.219.197.111'},
+    timeout=(3.05, 70)
+)
+
+# --- SoupManager: Simplifies web scraping with easy access to BeautifulSoup ---
+soup_manager = SoupManager(
+    url_manager=url_manager,
+    request_manager=request_manager
+)
+
+# --- LinkManager: Extracts and manages links and associated data from HTML source code ---
+link_manager = LinkManager(
+    url_manager=url_manager,
+    soup_manager=soup_manager,
+    link_attr_value_desired=['/view_video.php?viewkey='],
+    link_attr_value_undesired=['phantomjs']
+)
+
+# Download videos from provided links (list or string)
+video_manager = VideoDownloader(link=link_manager.all_desired_links).download()
+
+# Use them individually, with default dependencies for basic inputs:
+standalone_soup = SoupManager(url=url).soup
+standalone_links = LinkManager(url=url).all_desired_links
+
+# Updating methods for manager classes
+url_1 = 'thedailydialectics.com'
+print(f"updating URL to {url_1}")
+url_manager.update_url(url=url_1)
+request_manager.update_url(url=url_1)
+soup_manager.update_url(url=url_1)
+link_manager.update_url(url=url_1)
+
+# Updating URL manager references
+request_manager.update_url_manager(url_manager=url_manager)
+soup_manager.update_url_manager(url_manager=url_manager)
+link_manager.update_url_manager(url_manager=url_manager)
+
+# Updating source code for managers
+source_code_bytes = request_manager.source_code_bytes
+soup_manager.update_source_code(source_code=source_code_bytes)
+link_manager.update_source_code(source_code=source_code_bytes)
+```
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+#### Module Information
+-**Author**: putkoff
+-**Author Email**: partners@abstractendeavors.com
+-**Github**: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
+-**PYPI**: https://pypi.org/project/abstract-webtools
+-**Part of**: abstract_essentials
+-**Date**: 10/10/2023
+-**Version**: 0.1.4.54
+---
+
{abstract_webtools-0.1.4.55.dist-info → abstract_webtools-0.1.4.56.dist-info}/RECORD

@@ -1,6 +1,6 @@
 abstract_webtools/__init__.py,sha256=2SWEfdPDHqqjUYsOQYlaOHF644ZYcO160nWKiAjga4w,34
 abstract_webtools/abstract_crawler.py,sha256=e8jVVv1_EB8poqlrdQaJ19z9Z0t8un5uc-DKnj1Ud5s,8002
-abstract_webtools/abstract_webtools.py,sha256=
+abstract_webtools/abstract_webtools.py,sha256=ob9IjHg4xVcgIQQCeDGBqVmu8OCSRhXUSR3ljORDgi0,76215
 abstract_webtools/abstract_webtools2.py,sha256=dlhhgmUTaN_NgkT6GcJMVBLuXjmW38gAOeCrKxYqytk,30685
 abstract_webtools/big_user_agent_list.py,sha256=2Jzlg8rzEiwqFO0yaC8yGmGXQGZ_RWaHr3Apm6Gn1Yw,113552
 abstract_webtools/dfgdsf.py,sha256=T1pj-ne_qVfaAdu1MIdtW3q3UZqNP78Kt0OMhz4Musk,1355
@@ -10,8 +10,8 @@ abstract_webtools/sou.py,sha256=8HjmcpXJFi_kC2O-SVGebUIFY5I5B9bPP9L8BAiWhfk,4526
 abstract_webtools/soup.py,sha256=i9Z2EL8dufbzZGP_mrJom1DCQPUOfgfVlSbiHiPnCGo,4793
 abstract_webtools/test_var.py,sha256=EnxfhiLqNaG9MIHPyIbobufRzo4z1p0UIm8ArZybeRU,72
 abstract_webtools/vido_test.py,sha256=wP0YGj-dWBh76g7xpvC88nOtqfeTp2hdQ-mp4ywjjXg,147
-abstract_webtools-0.1.4.
-abstract_webtools-0.1.4.
-abstract_webtools-0.1.4.
-abstract_webtools-0.1.4.
-abstract_webtools-0.1.4.
+abstract_webtools-0.1.4.56.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
+abstract_webtools-0.1.4.56.dist-info/METADATA,sha256=3R9juuedqawW4j8vITEc1ZXN_soHsp_q0wEEnF-Isqc,15908
+abstract_webtools-0.1.4.56.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+abstract_webtools-0.1.4.56.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.4.56.dist-info/RECORD,,
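For readers cross-checking these entries: each RECORD line is the file path, the urlsafe-base64 SHA-256 of the file with trailing `=` padding stripped, and the file size in bytes. A small sketch of how the updated `abstract_webtools.py` line could be re-derived from an unpacked wheel (illustrative; the path assumes the wheel has been extracted into the current directory):

```python
import base64
import hashlib
from pathlib import Path

# Recompute "path,sha256=<digest>,<size>" as it appears in RECORD.
data = Path('abstract_webtools/abstract_webtools.py').read_bytes()
digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=').decode()
print(f"abstract_webtools/abstract_webtools.py,sha256={digest},{len(data)}")
```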
abstract_webtools-0.1.4.55.dist-info/METADATA (removed)

@@ -1,329 +0,0 @@
-Metadata-Version: 2.1
-Name: abstract-webtools
-Version: 0.1.4.55
-Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
-Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
-Author: putkoff
-Author-email: partners@abstractendeavors.com
-Classifier: Development Status :: 3 - Alpha
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Requires-Python: >=3.6
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: abstract-webtools (>=0.1.0)
-Requires-Dist: abstract-utilities (>=0.2.0.0)
-Requires-Dist: PySimpleGUI (>=4.60.5)
-Requires-Dist: urllib3 (>=2.0.4)
-Requires-Dist: requests (>=2.31.0)
-
-# Abstract WebTools
-Provides utilities for inspecting and parsing web content, including React components and URL utilities, with enhanced capabilities for managing HTTP requests and TLS configurations.
-
-- **Features**:
-- URL Validation: Ensures URL correctness and attempts different URL variations.
-- HTTP Request Manager: Custom HTTP request handling, including tailored user agents and improved TLS security through a custom adapter.
-- Source Code Acquisition: Retrieves the source code of specified websites.
-- React Component Parsing: Extracts JavaScript and JSX source code from web pages.
-- Comprehensive Link Extraction: Collects all internal links from a specified website.
-- Web Content Analysis: Extracts and categorizes various web content components such as HTML elements, attribute values, attribute names, and class names.
-
-### abstract_webtools.py
-**Description:**
-Abstract WebTools offers a suite of utilities designed for web content inspection and parsing. One of its standout features is its ability to analyze URLs, ensuring their validity and automatically attempting different URL variations to obtain correct website access. It boasts a custom HTTP request management system that tailors user-agent strings and employs a specialized TLS adapter for heightened security. The toolkit also provides robust capabilities for extracting source code, including detecting React components on web pages. Additionally, it offers functionalities for extracting all internal website links and performing in-depth web content analysis. This makes Abstract WebTools an indispensable tool for web developers, cybersecurity professionals, and digital analysts.
-
-
-- **Dependencies**:
-- `requests`
-- `ssl`
-- `HTTPAdapter` from `requests.adapters`
-- `PoolManager` from `urllib3.poolmanager`
-- `ssl_` from `urllib3.util`
-- `urlparse`, `urljoin` from `urllib.parse`
-- `BeautifulSoup` from `bs4`
-
-#### **Functions**:
-##### **Classes**:
-
-- ###### `TLSAdapter(HTTPAdapter: int)`
-- **Description**: A custom HTTPAdapter class that sets TLS/SSL options and ciphers.
-- **Attributes**:
-- `ssl_options (int)`: The TLS/SSL options to use when creating the SSL context.
-- **Methods**:
-- `ssl_options(self) -> int`
-- **Purpose**: Returns the SSL options to be used when creating the SSL context.
-- **Returns**: The SSL options.
-- `__init__(self, ssl_options:int=0, *args, **kwargs) -> None`
-- **Purpose**: Initializes the TLSAdapter with the specified SSL options.
-- **Arguments**:
-- `ssl_options (int, optional)`: The TLS/SSL options to use when creating the SSL context. Defaults to 0.
-- `add_string_list(self, ls: (list or str), delim: str = '', string: str = '') -> str`
-- **Purpose**: Concatenates the elements of a list into a single string with the given delimiter.
-- **Arguments**:
-- `ls (list or str)`: The list of elements or a comma-separated string.
-- `delim (str, optional)`: The delimiter to use when concatenating elements. Defaults to an empty string.
-- `string (str, optional)`: The initial string to append elements. Defaults to an empty string.
-- **Returns**: The concatenated string.
-- `get_ciphers(self) -> list`
-- **Purpose**: Returns a list of preferred TLS/SSL ciphers.
-- **Returns**: A list of TLS/SSL ciphers.
-- `create_ciphers_string(self, ls: list = None) -> str`
-- **Purpose**: Creates a colon-separated string of TLS/SSL ciphers from a list of ciphers.
-- **Arguments**:
-- `ls (list, optional)`: The list of TLS/SSL ciphers to use. Defaults to None, in which case it uses the default list.
-- **Returns**: The colon-separated string of TLS/SSL ciphers.
-- `init_poolmanager(self, *args, **kwargs) -> None`
-- **Purpose**: Initializes the pool manager with the custom SSL context and ciphers.
-- **Description**: This method leverages the given TLS/SSL ciphers and options to set up the pool manager with an appropriate SSL context.
-
-- ##### `get_status(url:str) -> int`
-- **Purpose**: Gets the HTTP status code of the given URL.
-- **Arguments**:
-- `url`: The URL to check the status of.
-- **Returns**: The HTTP status code of the URL, or None if the request fails.
-
-- ##### `clean_url(url:str) -> list`
-- **Purpose**: Cleans the given URL and returns a list of possible variations.
-- **Arguments**:
-- `url`: The URL to clean.
-- **Returns**: A list of possible URL variations, including 'http://' and 'https://' prefixes.
-
-- ##### `get_correct_url(url: str, session: type(requests.Session) = requests) -> (str or bool)`
-- **Purpose**: Gets the correct URL from the possible variations by trying each one with an HTTP request.
-- **Arguments**:
-- `url`: The URL to find the correct version of.
-- `session`: The requests session to use for making HTTP requests. Defaults to requests.
-- **Returns**: The correct version of the URL if found, or None if none of the variations are valid.
-
-- ##### `try_request(url: str, session: type(requests.Session) = requests) -> (str or bool)`
-- **Purpose**: Tries to make an HTTP request to the given URL using the provided session.
-- **Arguments**:
-- `url`: The URL to make the request to.
-- `session`: The requests session to use for making HTTP requests. Defaults to requests.
-- **Returns**: The response object if the request is successful, or None if the request fails.
-
-- ##### `is_valid(url:str) -> bool`
-- **Purpose**: Checks whether `url` is a valid URL.
-- **Arguments**:
-- `url`: The URL to check.
-- **Returns**: True if the URL is valid, False otherwise.
-
-- ##### `desktop_user_agents() -> list`
-- **Purpose**: Returns a list of popular desktop user-agent strings for various browsers.
-- **Returns**: A list of desktop user-agent strings.
-
-- ##### `get_user_agent(user_agent=desktop_user_agents()[0]) -> dict`
-- **Purpose**: Returns the user-agent header dictionary with the specified user-agent.
-- **Arguments**:
-- `user_agent`: The user-agent string to be used. Defaults to the first user-agent in the list.
-- **Returns**: A dictionary containing the 'user-agent' header.
-
-
-
-- ##### `get_Source_code(url: str = 'https://www.example.com', user_agent= desktop_user_agents()[0]) -> str`
-- **Purpose**: Fetches the source code of the specified URL using a custom user-agent.
-- **Arguments**:
-- `url (str, optional)`: The URL to fetch the source code from. Defaults to 'https://www.example.com'.
-- `user_agent (str, optional)`: The user-agent to use for the request. Defaults to the first user-agent in the list.
-- **Returns**: The source code of the URL if the request is successful, or None if the request fails.
-
-- ##### `parse_react_source(url:str) -> list`
-- **Purpose**: Fetches the source code of the specified URL and extracts JavaScript and JSX source code (React components).
-- **Arguments**:
-- `url (str)`: The URL to fetch the source code from.
-- **Returns**: A list of strings containing JavaScript and JSX source code found in <script> tags.
-
-- ##### `get_all_website_links(url:str) -> list`
-- **Purpose**: Returns all URLs that are found on the specified URL and belong to the same website.
-- **Arguments**:
-- `url (str)`: The URL to search for links.
-- **Returns**: A list of URLs that belong to the same website as the specified URL.
-
-- ##### `parse_all(url:str) -> dict`
-- **Purpose**: Parses the source code of the specified URL and extracts information about HTML elements, attribute values, attribute names, and class names.
-- **Arguments**:
-- `url (str)`: The URL to fetch the source code from.
-- **Returns**: A dict containing keys: [element_types, attribute_values, attribute_names, class_names] with values as lists for keys element types, attribute values, attribute names, and class names found in the source code.
-
-- ##### `extract_elements(url:str=None, source_code:str=None, element_type=None, attribute_name=None, class_name=None) -> list`
-- **Purpose**: Extracts portions of the source code from the specified URL based on provided filters.
-- **Arguments**:
-- `url (str, optional)`: The URL to fetch the source code from.
-- `source_code (str, optional)`: The source code of the desired domain.
-- `element_type (str, optional)`: The HTML element type to filter by. Defaults to None.
-- `attribute_name (str, optional)`: The attribute name to filter by. Defaults to None.
-- `class_name (str, optional)`: The class name to filter by. Defaults to None.
-- **Returns**: list: A list of strings containing portions of the source code that match the provided filters, or None if url and source_code are not provided.
-
-
-#### Usage
-
-##### Get Status Code
-
-The `get_status` function fetches the status code of the URL.
-
-```python
-from abstract_webtools import clean_url
-
-urls = clean_url('https://example.com')
-print(urls) # Output: ['https://example.com', 'http://example.com']
-tps://example.com'
-```
-
-##### Try Request
-
-The `try_request` function makes HTTP requests to a URL and returns the response if successful.
-
-```python
-from abstract_webtools import try_request
-
-response = try_request('https://www.example.com')
-print(response) # Output: <Response [200]>
-```
-
-##### Is Valid URL
-
-The `is_valid` function checks whether a given URL is valid.
-
-```python
-from abstract_webtools import is_valid
-
-valid = is_valid('https://www.example.com')
-print(valid) # Output: True
-```
-
-##### Get Source Code
-
-The `get_Source_code` function fetches the source code of a URL with a custom user-agent.
-
-```python
-from abstract_webtools import get_Source_code
-
-source_code = get_Source_code('https://www.example.com')
-print(source_code) # Output: HTML source code of the URL
-```
-
-##### Parse React Source
-
-The `parse_react_source` function fetches the source code of a URL and extracts JavaScript and JSX source code (React components).
-
-```python
-from abstract_webtools import parse_react_source
-
-react_code = parse_react_source('https://www.example.com')
-print(react_code) # Output: List of JavaScript and JSX source code found in <script> tags
-```
-
-##### Get All Website Links
-
-The `get_all_website_links` function returns all URLs found on a specified URL that belong to the same website.
-
-```python
-from abstract_webtools import get_all_website_links
-
-links = get_all_website_links('https://www.example.com')
-print(links) # Output: List of URLs belonging to the same website as the specified URL
-```
-
-##### Parse All
-
-The `parse_all` function fetches the source code of a URL and extracts information about HTML elements, attribute values, attribute names, and class names.
-
-```python
-from abstract_webtools import parse_all
-
-HTML_components = parse_all('https://www.example.com')
-print(HTML_components["element_types"]) # Output: List of HTML element types
-print(HTML_components["attribute_values"]) # Output: List of attribute values
-print(HTML_components["attribute_names"]) # Output: List of attribute names
-print(HTML_components["class_names"]) # Output: List of class names
-```
-
-##### Extract Elements
-
-The `extract_elements` function fetches the source code of a URL and extracts portions of the source code based on provided filters.
-
-```python
-from abstract_webtools import extract_elements
-
-elements = extract_elements('https://www.example.com', element_type='div', attribute_name='class', class_name='container')
-print(elements) # Output: List of HTML elements that match the provided filters
-```
-##### Manager System
-```python
-from abstract_webtools import URLManager,SafeRequest,SoupManager,LinkManager,VideoDownloader
-url = "example.com"
-url_manager = URLManager(url=url)
-request_manager = SafeRequest(url_manager=url_manager,
-                              proxies={'8.219.195.47','8.219.197.111'},
-                              timeout=(3.05, 70))
-soup_manager = SoupManager(url_manager=url_manager,
-                           request_manager=request_manager)
-link_manager = LinkManager(url_manager=url_manager,
-                           soup_manager=soup_manager,
-                           link_attr_value_desired=['/view_video.php?viewkey='],
-                           link_attr_value_undesired=['phantomjs'])
-video_manager = VideoDownloader(link=link_manager.all_desired_links).download()
-
-or you can use them individually, they each have their dependencies on eachother defaulted for basic inputs:
-#standalone
-url_manager = URLManager(url=url)
-working_url = url_manager.url
-#standalone
-request_manager = SafeRequest(url=url)
-source_code = request_manager.source_code
-#standalone
-soup_manager = SoupManager(url=url)
-soup = soup_manager.soup
-#standalone
-link_manager = LinkManager(url=url)
-all_href_links = link_manager.all_desired_links
-all_src_image_links = link_manager.all_desired_image_links
-link_manager.update_desired(link_tags=["li","a"],link_attrs=["href","src"],strict_order_tags=False)
-filtered_link_list = link_manager.all_desired_links
-
-##provides all values within the filtered parameters
-all_desired_raw_links = link_manager.find_all_desired(tag='a',attr='href',strict_order_tags=False,attr_value_desired=['/view_video.php?viewkey='],,attr_value_undesired=['phantomjs'],associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title'])
-##provides all values within the filtered parameters once more filtered by attaching the parent domain to the value and checking for validity of the url
-all_desired_valid_urls = find_all_desired_links(tag='a',attr='href',strict_order_tags=False,attr_value_desired=None,attr_value_undesired=['phantomjs'],associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title'])
-
-#associated_data_attr and get_img
-these 2 parameters act as extras, the last value in find_all_desired will be a json list of all values produced in the previose, however they will have key values associated for any associated data that was determined to be associated with it based on the additional filter parameters
-
-
-#standalone updates
-link_manager.update_desired(link_tags=["li","a"],link_attrs=["href","src"],strict_order_tags=False)
-updated_link_list = link_manager.all_desired_links
-
-# any of the managers can be updated with the specific parameters that are attributed to them , they will then reinitialize maintaining the coherent structure it begain with
-url_1='thedailydialectics.com'
-print(f"updating url to {url_2}")
-url_manager.update_url(url=url_2)
-request_manager.update_url(url=url_2)
-soup_manager.update_url(url=url_2)
-link_manager.update_url(url=url_2)
-
-print(f"updating url_manager to {url_1} and updating url managers")
-url_manager.update_url(url=url)
-request_manager.update_url_manager(url_manager=url_manager)
-soup_manager.update_url_manager(url_manager=url_manager)
-link_manager.update_url_manager(url_manager=url_manager)
-
-source_code_bytes = request_manager.source_code_bytes
-print(f"updating source_code to example.com source_code_bytes")
-soup_manager.update_source_code(source_code=source_code_bytes)
-link_manager.update_source_code(source_code=source_code_bytes)
-```
-
-#### Module Information
--**Author**: putkoff
--**Author Email**: partners@abstractendeavors.com
--**Github**: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
--**PYPI**: https://pypi.org/project/abstract-webtools
--**Part of**: abstract_essentials
--**Date**: 10/10/2023
--**Version**: 0.1.4.54
----
-

{abstract_webtools-0.1.4.55.dist-info → abstract_webtools-0.1.4.56.dist-info}/LICENSE: File without changes
{abstract_webtools-0.1.4.55.dist-info → abstract_webtools-0.1.4.56.dist-info}/WHEEL: File without changes
{abstract_webtools-0.1.4.55.dist-info → abstract_webtools-0.1.4.56.dist-info}/top_level.txt: File without changes