abstract-webtools 0.1.4.54__py3-none-any.whl → 0.1.4.56__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -388,7 +388,7 @@ class MySocketClient():
      def get_instance(ip_address='local_host',port=22,domain_name="example.com"):
          if MySocketClientSingleton._instance is None:
              MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain_name=domain_name)
-         elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or URLManagerSingleton._instance.domain_name != domain_name:
+         elif MySocketClientSingleton._instance.ip_address != ip_address or MySocketClientSingleton._instance.port != port or UrlManagerSingleton._instance.domain_name != domain_name:
              MySocketClientSingleton._instance = MySocketClient(ip_address=ip_address,port=port,domain_name=domain_name)
          return MySocketClient
  def safe_json_loads(data):
@@ -402,13 +402,13 @@ def convert_to_json(obj):
      if isinstance(obj, str):
          return safe_json_loads(obj)
      return None
- class URLManager:
+ class UrlManager:
      def __init__(self, url=None, session=requests):
          if url==None:
              url='www.example.com'
          self.url = url
          self.session = session
-         # These methods seem essential for setting up the URLManager object.
+         # These methods seem essential for setting up the UrlManager object.
          self.clean_urls = self.clean_url()
          self.correct_url = self.get_correct_url()
          self.url_to_pieces()
@@ -482,7 +482,7 @@ class URLManager:
          print(e)
          return None
      def update_url(self,url):
-         # These methods seem essential for setting up the URLManager object.
+         # These methods seem essential for setting up the UrlManager object.
          self.url = url
          self.clean_urls = self.clean_url()
          self.correct_url = self.get_correct_url()
@@ -556,15 +556,15 @@ class URLManager:
              r'^(?:https?:)?//', base):
          return None
      return urllib.parse.urljoin(base, path)
- class URLManagerSingleton:
+ class UrlManagerSingleton:
      _instance = None
      @staticmethod
      def get_instance(url=None,session=requests):
-         if URLManagerSingleton._instance is None:
-             URLManagerSingleton._instance = URLManager(url,session=session)
-         elif URLManagerSingleton._instance.session != session or URLManagerSingleton._instance.url != url:
-             URLManagerSingleton._instance = URLManager(url,session=session)
-         return URLManagerSingleton._instance
+         if UrlManagerSingleton._instance is None:
+             UrlManagerSingleton._instance = UrlManager(url,session=session)
+         elif UrlManagerSingleton._instance.session != session or UrlManagerSingleton._instance.url != url:
+             UrlManagerSingleton._instance = UrlManager(url,session=session)
+         return UrlManagerSingleton._instance
  class SafeRequest:
      def __init__(self,
                   url=None,
@@ -590,7 +590,7 @@ class SafeRequest:
                   max_retries=None,
                   request_wait_limit=None):
          if url_manager == None:
-             url_manager = URLManager(url=url)
+             url_manager = UrlManager(url=url)
          self.url_manager=url_manager
          if network_manager == None:
              network_manager=NetworkManager(user_agent_manager=user_agent_manager,ssl_manager=ssl_manager, tls_adapter=tls_adapter,user_agent=user_agent,proxies=proxies,auth=auth,cookies=cookies,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
@@ -798,17 +798,17 @@ class SafeRequestSingleton:
      @staticmethod
      def get_instance(url=None,headers:dict=None,max_retries=3,last_request_time=None,request_wait_limit=1.5):
          if SafeRequestSingleton._instance is None:
-             SafeRequestSingleton._instance = SafeRequest(url,url_manager=URLManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
+             SafeRequestSingleton._instance = SafeRequest(url,url_manager=UrlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
          elif SafeRequestSingleton._instance.url != url or SafeRequestSingleton._instance.headers != headers or SafeRequestSingleton._instance.max_retries != max_retries or SafeRequestSingleton._instance.request_wait_limit != request_wait_limit:
-             SafeRequestSingleton._instance = SafeRequest(url,url_manager=URLManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
+             SafeRequestSingleton._instance = SafeRequest(url,url_manager=UrlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
          return SafeRequestSingleton._instance
  class SoupManager:
      def __init__(self,url=None,source_code=None,url_manager=None,request_manager=None, parse_type="html.parser"):
          self.soup=[]
          self.url=url
          if url_manager == None:
-             url_manager=URLManager(url=self.url)
-         if self.url != None and url_manager != None and url_manager.url != URLManager(url=url).url:
+             url_manager=UrlManager(url=self.url)
+         if self.url != None and url_manager != None and url_manager.url != UrlManager(url=url).url:
              url_manager.update_url(url=self.url)
          self.url_manager= url_manager
          self.url=self.url_manager.url
@@ -1228,7 +1228,7 @@ class LinkManager:
      def __init__(self,url="https://example.com",source_code=None,url_manager=None,request_manager=None,soup_manager=None,image_link_tags='img',img_link_attrs='src',link_tags='a',link_attrs='href',strict_order_tags=False,img_attr_value_desired=None,img_attr_value_undesired=None,link_attr_value_desired=None,link_attr_value_undesired=None,associated_data_attr=["data-title",'alt','title'],get_img=["data-title",'alt','title']):
          self.url=url
          if url_manager==None:
-             url_manager=URLManager(url=url)
+             url_manager=UrlManager(url=url)
          self.url_manager= url_manager
          self.url=self.url_manager.url
          if request_manager==None:
@@ -0,0 +1,448 @@
+ Metadata-Version: 2.1
+ Name: abstract-webtools
+ Version: 0.1.4.56
+ Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
+ Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
+ Author: putkoff
+ Author-email: partners@abstractendeavors.com
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Requires-Python: >=3.6
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: abstract-webtools (>=0.1.0)
+ Requires-Dist: abstract-utilities (>=0.2.0.0)
+ Requires-Dist: PySimpleGUI (>=4.60.5)
+ Requires-Dist: urllib3 (>=2.0.4)
+ Requires-Dist: requests (>=2.31.0)
+
+ # Abstract WebTools
+ Provides utilities for inspecting and parsing web content, including React components and URL utilities, with enhanced capabilities for managing HTTP requests and TLS configurations.
+
+ - **Features**:
+   - URL Validation: Ensures URL correctness and attempts different URL variations.
+   - HTTP Request Manager: Custom HTTP request handling, including tailored user agents and improved TLS security through a custom adapter.
+   - Source Code Acquisition: Retrieves the source code of specified websites.
+   - React Component Parsing: Extracts JavaScript and JSX source code from web pages.
+   - Comprehensive Link Extraction: Collects all internal links from a specified website.
+   - Web Content Analysis: Extracts and categorizes various web content components such as HTML elements, attribute values, attribute names, and class names.
+
+ ### abstract_webtools.py
+ **Description:**
+ Abstract WebTools offers a suite of utilities designed for web content inspection and parsing. One of its standout features is its ability to analyze URLs, ensuring their validity and automatically attempting different URL variations to obtain correct website access. It boasts a custom HTTP request management system that tailors user-agent strings and employs a specialized TLS adapter for heightened security. The toolkit also provides robust capabilities for extracting source code, including detecting React components on web pages. Additionally, it offers functionalities for extracting all internal website links and performing in-depth web content analysis. This makes Abstract WebTools an indispensable tool for web developers, cybersecurity professionals, and digital analysts.
+ ![image](https://github.com/AbstractEndeavors/abstract_essentials/assets/57512254/0451d8ea-996f-4de5-9e6c-92a606aae4ef)
+
+ - **Dependencies**:
+   - `requests`
+   - `ssl`
+   - `HTTPAdapter` from `requests.adapters`
+   - `PoolManager` from `urllib3.poolmanager`
+   - `ssl_` from `urllib3.util`
+   - `urlparse`, `urljoin` from `urllib.parse`
+   - `BeautifulSoup` from `bs4`
+
+
+ # UrlManager
+
+ The `UrlManager` is a Python class designed to handle and manipulate URLs. It provides methods for cleaning and normalizing URLs, determining the correct version of a URL, extracting URL components, and more. This class is particularly useful for web scraping, web crawling, or any application where URL management is essential.
+
+ ## Usage
+
+ To use the `UrlManager` class, first import it into your Python script:
+
+ ```python
+ from abstract_webtools import UrlManager
+ ```
+
+ ### Initializing a UrlManager Object
+
+ You can create a `UrlManager` object by providing an initial URL and an optional `requests` session. If no URL is provided, it defaults to 'www.example.com':
+
+ ```python
+ url_manager = UrlManager(url='https://www.example.com')
+ ```
+
+ ### URL Cleaning and Normalization
+
+ The `clean_url` method takes a URL and returns a list of potential URL variations, including versions with and without 'www.', 'http://', and 'https://':
+
+ ```python
+ cleaned_urls = url_manager.clean_url()
+ ```
+
+ ### Getting the Correct URL
+
+ The `get_correct_url` method tries each possible URL variation with an HTTP request to determine the correct version of the URL:
+
+ ```python
+ correct_url = url_manager.get_correct_url()
+ ```
+
+ ### Updating the URL
+
+ You can update the URL associated with the `UrlManager` object using the `update_url` method:
+
+ ```python
+ url_manager.update_url('https://www.example2.com')
+ ```
+
+ ### Extracting URL Components
+
+ The `url_to_pieces` method extracts various components of the URL, such as protocol, domain name, path, and query:
+
+ ```python
+ url_manager.url_to_pieces()
+ print(url_manager.protocol)
+ print(url_manager.domain_name)
+ print(url_manager.path)
+ print(url_manager.query)
+ ```
+
+ ### Additional Utility Methods
+
+ - `get_domain_name(url)`: Returns the domain name (netloc) of a given URL.
+ - `is_valid_url(url)`: Checks if a URL is valid.
+ - `make_valid(href, url)`: Ensures a relative or incomplete URL is valid by joining it with a base URL.
+ - `get_relative_href(url, href)`: Converts a relative URL to an absolute URL based on a base URL.
+
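+ A minimal sketch of these helpers in use (method names come from the list above; the commented values are illustrative assumptions, not documented output):
+
+ ```python
+ from abstract_webtools import UrlManager
+
+ url_manager = UrlManager(url='https://www.example.com')
+
+ # Netloc of an arbitrary URL
+ print(url_manager.get_domain_name('https://www.example.com/page'))   # 'www.example.com'
+
+ # Simple URL validation
+ print(url_manager.is_valid_url('https://www.example.com'))           # True
+
+ # Join a relative href with a base URL
+ print(url_manager.make_valid('/about', 'https://www.example.com'))   # 'https://www.example.com/about'
+ ```
+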
+ ## Compatibility Note
+
+ The `get_domain` method is kept for compatibility but is inconsistent. Use it only for "webpage_url_domain." Similarly, `url_basename`, `base_url`, and `urljoin` methods are available for URL manipulation.
+
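+ A hypothetical sketch of these compatibility helpers (the call signatures below are assumptions inferred from the method names, not documented API):
+
+ ```python
+ # Kept for backward compatibility; prefer get_domain_name for new code
+ domain = url_manager.get_domain(url_manager.url)
+
+ # Presumably returns the last path component of a URL, e.g. 'page.html'
+ name = url_manager.url_basename('https://www.example.com/docs/page.html')
+ ```
+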
+ ## Example
+
+ Here's a quick example of using the `UrlManager` class:
+
+ ```python
+ from abstract_webtools import UrlManager
+
+ url_manager = UrlManager(url='https://www.example.com')
+ cleaned_urls = url_manager.clean_url()
+ correct_url = url_manager.get_correct_url()
+ url_manager.update_url('https://www.example2.com')
+
+ print(f"Cleaned URLs: {cleaned_urls}")
+ print(f"Correct URL: {correct_url}")
+ ```
+
+ ## Dependencies
+
+ The `UrlManager` class relies on the `requests` library for making HTTP requests. Ensure you have the `requests` library installed in your Python environment.
+
+ # SafeRequest
+
+ The `SafeRequest` class is a versatile Python utility designed to handle HTTP requests with enhanced safety features. It integrates with other managers like `UrlManager`, `NetworkManager`, and `UserAgentManager` to manage various aspects of the request, such as user-agent, SSL/TLS settings, proxies, headers, and more.
+
+ ## Usage
+
+ To use the `SafeRequest` class, first import it into your Python script:
+
+ ```python
+ from abstract_webtools import SafeRequest
+ ```
+
+ ### Initializing a SafeRequest Object
+
+ You can create a `SafeRequest` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
+
+ ```python
+ safe_request = SafeRequest(url='https://www.example.com')
+ ```
+
+ ### Updating URL and UrlManager
+
+ You can update the URL associated with the `SafeRequest` object using the `update_url` method, which also updates the underlying `UrlManager`:
+
+ ```python
+ safe_request.update_url('https://www.example2.com')
+ ```
+
+ You can also update the `UrlManager` directly:
+
+ ```python
+ from abstract_webtools import UrlManager
+
+ url_manager = UrlManager(url='https://www.example3.com')
+ safe_request.update_url_manager(url_manager)
+ ```
+
+ ### Making HTTP Requests
+
+ The `SafeRequest` class makes HTTP requests through its `try_request` method, which handles retries, timeouts, and rate limiting:
+
+ ```python
+ response = safe_request.try_request()
+ if response:
+     # Process the response here
+     print(response.status_code)
+ ```
+
+ ### Accessing Response Data
+
+ You can access the response data in various formats:
+
+ - `safe_request.source_code`: HTML source code as a string.
+ - `safe_request.source_code_bytes`: HTML source code as bytes.
+ - `safe_request.source_code_json`: JSON data from the response (if the content type is JSON).
+ - `safe_request.react_source_code`: JavaScript and JSX source code extracted from `<script>` tags.
+
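+ For example, after a request succeeds (property names come from the list above; the inline comments describe expected types, not guaranteed behavior):
+
+ ```python
+ safe_request = SafeRequest(url='https://www.example.com')
+
+ html_text = safe_request.source_code        # str
+ raw_bytes = safe_request.source_code_bytes  # bytes
+ json_data = safe_request.source_code_json   # parsed JSON, for JSON responses only
+ scripts   = safe_request.react_source_code  # list of <script> contents
+ ```
+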
+ ### Customizing Request Configuration
+
+ The `SafeRequest` class provides several options for customizing the request, such as headers, user-agent, proxies, SSL/TLS settings, and more. These can be set during initialization or updated later.
+
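+ As an illustration, a hedged sketch using constructor parameters visible in this release (the accepted value formats are assumptions and may vary):
+
+ ```python
+ safe_request = SafeRequest(
+     url='https://www.example.com',
+     headers={'Accept-Language': 'en-US,en;q=0.9'},
+     user_agent='Mozilla/5.0 (X11; Linux x86_64)',
+     proxies=None,  # supply proxy settings here if needed
+ )
+ ```
+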
+ ### Handling Rate Limiting
+
+ The class can handle rate-limited scenarios by enforcing a minimum wait between consecutive requests and retrying failed requests.
+
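+ The `max_retries` and `request_wait_limit` constructor parameters (which default to 3 and 1.5 in the singleton helper) control this behavior; a sketch, with the exact back-off strategy left internal to the class:
+
+ ```python
+ safe_request = SafeRequest(
+     url='https://www.example.com',
+     max_retries=3,           # retry a failed request up to three times
+     request_wait_limit=1.5,  # minimum seconds to wait between requests
+ )
+ ```
+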
+ ### Error Handling
+
+ The `SafeRequest` class handles various request-related exceptions and provides error messages for easier debugging.
+
+ ## Dependencies
+
+ The `SafeRequest` class relies on the `requests` library for making HTTP requests. Ensure you have the `requests` library installed in your Python environment:
+
+ ```bash
+ pip install requests
+ ```
+
+ ## Example
+
+ Here's a quick example of using the `SafeRequest` class:
+
+ ```python
+ from abstract_webtools import SafeRequest
+
+ safe_request = SafeRequest(url='https://www.example.com')
+ response = safe_request.try_request()
+ if response:
+     print(f"Response status code: {response.status_code}")
+     print(f"HTML source code: {safe_request.source_code}")
+ ```
+
+ # SoupManager
+
+ The `SoupManager` class is a Python utility designed to simplify web scraping by providing easy access to the BeautifulSoup library. It allows you to parse and manipulate HTML or XML source code from a URL or provided source code.
+
+ ## Usage
+
+ To use the `SoupManager` class, first import it into your Python script:
+
+ ```python
+ from abstract_webtools import SoupManager
+ ```
+
+ ### Initializing a SoupManager Object
+
+ You can create a `SoupManager` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
+
+ ```python
+ soup_manager = SoupManager(url='https://www.example.com')
+ ```
+
+ ### Updating URL and Request Manager
+
+ You can update the URL associated with the `SoupManager` object using the `update_url` method, which also updates the underlying `UrlManager` and `SafeRequest`:
+
+ ```python
+ soup_manager.update_url('https://www.example2.com')
+ ```
+
+ You can also update the source code directly:
+
+ ```python
+ source_code = '<html>...</html>'
+ soup_manager.update_source_code(source_code)
+ ```
+
+ ### Accessing and Parsing HTML
+
+ The `SoupManager` class exposes the underlying BeautifulSoup object, letting you search, extract, and manipulate HTML elements. Methods such as `find_all`, `get_class`, and `has_attributes` are available for working with the HTML content.
+
+ ```python
+ elements = soup_manager.find_all(tag='a')
+ ```
+
+ ### Extracting Links
+
+ The class also includes methods for extracting all website links from the HTML source code:
+
+ ```python
+ all_links = soup_manager.all_links
+ ```
+
+ ### Extracting Meta Tags
+
+ You can extract meta tags from the HTML source code using the `meta_tags` property:
+
+ ```python
+ meta_tags = soup_manager.meta_tags
+ ```
+
+ ### Customizing Parsing
+
+ You can customize the parsing behavior by specifying the parser type during initialization or updating it:
+
+ ```python
+ soup_manager.update_parse_type('lxml')
+ ```
+
+ ## Dependencies
+
+ The `SoupManager` class relies on the `BeautifulSoup` library for parsing HTML or XML. Ensure you have the `beautifulsoup4` library installed in your Python environment:
+
+ ```bash
+ pip install beautifulsoup4
+ ```
+
+ ## Example
+
+ Here's a quick example of using the `SoupManager` class:
+
+ ```python
+ from abstract_webtools import SoupManager
+
+ soup_manager = SoupManager(url='https://www.example.com')
+ all_links = soup_manager.all_links
+ print(f"All Links: {all_links}")
+ ```
+
+ # LinkManager
+
+ The `LinkManager` class is a Python utility designed to simplify the extraction and management of links (URLs) and associated data from HTML source code. It leverages other classes like `UrlManager`, `SafeRequest`, and `SoupManager` to facilitate link extraction and manipulation.
+
+ ## Usage
+
+ To use the `LinkManager` class, first import it into your Python script:
+
+ ```python
+ from abstract_webtools import LinkManager
+ ```
+
+ ### Initializing a LinkManager Object
+
+ You can create a `LinkManager` object with various configuration options. By default, it uses sensible default values, but you can customize it as needed:
+
+ ```python
+ link_manager = LinkManager(url='https://www.example.com')
+ ```
+
+ ### Updating URL and Request Manager
+
+ You can update the URL associated with the `LinkManager` object using the `update_url` method, which also updates the underlying `UrlManager`, `SafeRequest`, and `SoupManager`:
+
+ ```python
+ link_manager.update_url('https://www.example2.com')
+ ```
+
+ ### Accessing Extracted Links
+
+ The `LinkManager` class provides easy access to extracted links and associated data:
+
+ ```python
+ all_links = link_manager.all_desired_links
+ ```
+
+ ### Customizing Link Extraction
+
+ You can customize the link extraction behavior by specifying various parameters during initialization or updating them:
+
+ ```python
+ link_manager.update_desired(
+     img_attr_value_desired=['thumbnail', 'image'],
+     img_attr_value_undesired=['icon'],
+     link_attr_value_desired=['blog', 'article'],
+     link_attr_value_undesired=['archive'],
+     image_link_tags='img',
+     img_link_attrs='src',
+     link_tags='a',
+     link_attrs='href',
+     strict_order_tags=True,
+     associated_data_attr=['data-title', 'alt', 'title'],
+     get_img=['data-title', 'alt', 'title']
+ )
+ ```
+
+ ## Dependencies
+
+ The `LinkManager` class relies on other classes within the `abstract_webtools` module, such as `UrlManager`, `SafeRequest`, and `SoupManager`. Ensure you have these classes and their dependencies correctly set up in your Python environment.
+
+ ## Example
+
+ Here's a quick example of using the `LinkManager` class:
+
+ ```python
+ from abstract_webtools import LinkManager
+
+ link_manager = LinkManager(url='https://www.example.com')
+ all_links = link_manager.all_desired_links
+ print(f"All Links: {all_links}")
+ ```
+
+ ## Overall Use Cases
+
+ ```python
+ from abstract_webtools import UrlManager, SafeRequest, SoupManager, LinkManager, VideoDownloader
+
+ # --- UrlManager: Manages and manipulates URLs for web scraping/crawling ---
+ url = "example.com"
+ url_manager = UrlManager(url=url)
+
+ # --- SafeRequest: Safely handles HTTP requests by managing user-agent, SSL/TLS, proxies, headers, etc. ---
+ request_manager = SafeRequest(
+     url_manager=url_manager,
+     proxies={'8.219.195.47', '8.219.197.111'},
+     timeout=(3.05, 70)
+ )
+
+ # --- SoupManager: Simplifies web scraping with easy access to BeautifulSoup ---
+ soup_manager = SoupManager(
+     url_manager=url_manager,
+     request_manager=request_manager
+ )
+
+ # --- LinkManager: Extracts and manages links and associated data from HTML source code ---
+ link_manager = LinkManager(
+     url_manager=url_manager,
+     soup_manager=soup_manager,
+     link_attr_value_desired=['/view_video.php?viewkey='],
+     link_attr_value_undesired=['phantomjs']
+ )
+
+ # Download videos from provided links (list or string)
+ video_manager = VideoDownloader(link=link_manager.all_desired_links).download()
+
+ # Or use the managers individually, with default dependencies built from basic inputs:
+ standalone_soup = SoupManager(url=url).soup
+ standalone_links = LinkManager(url=url).all_desired_links
+
+ # Updating methods for manager classes
+ url_1 = 'thedailydialectics.com'
+ print(f"updating URL to {url_1}")
+ url_manager.update_url(url=url_1)
+ request_manager.update_url(url=url_1)
+ soup_manager.update_url(url=url_1)
+ link_manager.update_url(url=url_1)
+
+ # Updating URL manager references
+ request_manager.update_url_manager(url_manager=url_manager)
+ soup_manager.update_url_manager(url_manager=url_manager)
+ link_manager.update_url_manager(url_manager=url_manager)
+
+ # Updating source code for managers
+ source_code_bytes = request_manager.source_code_bytes
+ soup_manager.update_source_code(source_code=source_code_bytes)
+ link_manager.update_source_code(source_code=source_code_bytes)
+ ```
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ #### Module Information
+
+ - **Author**: putkoff
+ - **Author Email**: partners@abstractendeavors.com
+ - **Github**: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
+ - **PYPI**: https://pypi.org/project/abstract-webtools
+ - **Part of**: abstract_essentials
+ - **Date**: 10/10/2023
+ - **Version**: 0.1.4.56
+
+ ---
+
@@ -1,6 +1,6 @@
  abstract_webtools/__init__.py,sha256=2SWEfdPDHqqjUYsOQYlaOHF644ZYcO160nWKiAjga4w,34
  abstract_webtools/abstract_crawler.py,sha256=e8jVVv1_EB8poqlrdQaJ19z9Z0t8un5uc-DKnj1Ud5s,8002
- abstract_webtools/abstract_webtools.py,sha256=DLaVE-95qD6ItjNQ9-2E1h2qNcS_BGk2wCjVIGhso-o,76215
+ abstract_webtools/abstract_webtools.py,sha256=ob9IjHg4xVcgIQQCeDGBqVmu8OCSRhXUSR3ljORDgi0,76215
  abstract_webtools/abstract_webtools2.py,sha256=dlhhgmUTaN_NgkT6GcJMVBLuXjmW38gAOeCrKxYqytk,30685
  abstract_webtools/big_user_agent_list.py,sha256=2Jzlg8rzEiwqFO0yaC8yGmGXQGZ_RWaHr3Apm6Gn1Yw,113552
  abstract_webtools/dfgdsf.py,sha256=T1pj-ne_qVfaAdu1MIdtW3q3UZqNP78Kt0OMhz4Musk,1355
@@ -10,8 +10,8 @@ abstract_webtools/sou.py,sha256=8HjmcpXJFi_kC2O-SVGebUIFY5I5B9bPP9L8BAiWhfk,4526
  abstract_webtools/soup.py,sha256=i9Z2EL8dufbzZGP_mrJom1DCQPUOfgfVlSbiHiPnCGo,4793
  abstract_webtools/test_var.py,sha256=EnxfhiLqNaG9MIHPyIbobufRzo4z1p0UIm8ArZybeRU,72
  abstract_webtools/vido_test.py,sha256=wP0YGj-dWBh76g7xpvC88nOtqfeTp2hdQ-mp4ywjjXg,147
- abstract_webtools-0.1.4.54.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
- abstract_webtools-0.1.4.54.dist-info/METADATA,sha256=5fG4uH46i7pF9sCB7mFrCvs_TFTSrvYZuYHKOIlDpxA,8963
- abstract_webtools-0.1.4.54.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
- abstract_webtools-0.1.4.54.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
- abstract_webtools-0.1.4.54.dist-info/RECORD,,
+ abstract_webtools-0.1.4.56.dist-info/LICENSE,sha256=g3WEJFiVS27HyCGRTwKSsMLyciMaGFdWcZGOe1QalZk,3877
+ abstract_webtools-0.1.4.56.dist-info/METADATA,sha256=3R9juuedqawW4j8vITEc1ZXN_soHsp_q0wEEnF-Isqc,15908
+ abstract_webtools-0.1.4.56.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
+ abstract_webtools-0.1.4.56.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+ abstract_webtools-0.1.4.56.dist-info/RECORD,,
@@ -1,206 +0,0 @@
- Metadata-Version: 2.1
- Name: abstract-webtools
- Version: 0.1.4.54
- Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
- Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
- Author: putkoff
- Author-email: partners@abstractendeavors.com
- Classifier: Development Status :: 3 - Alpha
- Classifier: Intended Audience :: Developers
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.11
- Requires-Python: >=3.6
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: abstract-webtools (>=0.1.0)
- Requires-Dist: abstract-utilities (>=0.2.0.0)
- Requires-Dist: PySimpleGUI (>=4.60.5)
- Requires-Dist: urllib3 (>=2.0.4)
- Requires-Dist: requests (>=2.31.0)
-
- #Abstract Webtools for parsing web content.
-
-
- **Module:** `abstract_webtools`
- **Package:** `abstract_essentials`
- **GitHub Repository:** [abstract_essentials](https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools)
- **Contact Email:** [partners@abstractendeavors.com](mailto:partners@abstractendeavors.com)
- **Date:** 08/27/2023
- **Version:** 0.0.0.1
-
-
- ## Installation
-
- You can install the package via pip:
-
- ```bash
- pip install abstract_webtools
- ```
- ## Description
-
- The `abstract_webtools` module, part of the `abstract_essentials` package, provides various utilities and functionalities for web scraping and web request handling. This includes functions to retrieve and check the status of URLs, clean and format URLs, fetch source code with custom user agents, and more.
-
- ## Classes
-
- 1. **TLSAdapter(HTTPAdapter)**
- A custom HTTPAdapter class that configures TLS/SSL options and ciphers.
-
- ## Dependencies
-
- This module relies on the following external libraries:
-
- - `requests`
- - `ssl`
- - `urllib`
- - `BeautifulSoup`
-
- ## Usage
-
- ### Get Status Code
-
- The `get_status` function fetches the status code of the URL.
-
- ```python
- from abstract_webtools import clean_url
-
- urls = clean_url('https://example.com')
- print(urls) # Output: ['https://example.com', 'http://example.com']
- tps://example.com'
- ```
-
- ### Try Request
-
- The `try_request` function makes HTTP requests to a URL and returns the response if successful.
-
- ```python
- from abstract_webtools import try_request
-
- response = try_request('https://www.example.com')
- print(response) # Output: <Response [200]>
- ```
-
- ### Is Valid URL
-
- The `is_valid` function checks whether a given URL is valid.
-
- ```python
- from abstract_webtools import is_valid
-
- valid = is_valid('https://www.example.com')
- print(valid) # Output: True
- ```
-
- ### Get Source Code
-
- The `get_Source_code` function fetches the source code of a URL with a custom user-agent.
-
- ```python
- from abstract_webtools import get_Source_code
-
- source_code = get_Source_code('https://www.example.com')
- print(source_code) # Output: HTML source code of the URL
- ```
-
- ### Parse React Source
-
- The `parse_react_source` function fetches the source code of a URL and extracts JavaScript and JSX source code (React components).
-
- ```python
- from abstract_webtools import parse_react_source
-
- react_code = parse_react_source('https://www.example.com')
- print(react_code) # Output: List of JavaScript and JSX source code found in <script> tags
- ```
-
- ### Get All Website Links
-
- The `get_all_website_links` function returns all URLs found on a specified URL that belong to the same website.
-
- ```python
- from abstract_webtools import get_all_website_links
-
- links = get_all_website_links('https://www.example.com')
- print(links) # Output: List of URLs belonging to the same website as the specified URL
- ```
-
- ### Parse All
-
- The `parse_all` function fetches the source code of a URL and extracts information about HTML elements, attribute values, attribute names, and class names.
-
- ```python
- from abstract_webtools import parse_all
-
- HTML_components = parse_all('https://www.example.com')
- print(HTML_components["element_types"]) # Output: List of HTML element types
- print(HTML_components["attribute_values"]) # Output: List of attribute values
- print(HTML_components["attribute_names"]) # Output: List of attribute names
- print(HTML_components["class_names"]) # Output: List of class names
- ```
-
- ### Extract Elements
-
- The `extract_elements` function fetches the source code of a URL and extracts portions of the source code based on provided filters.
-
- ```python
- from abstract_webtools import extract_elements
-
- elements = extract_elements('https://www.example.com', element_type='div', attribute_name='class', class_name='container')
- print(elements) # Output: List of HTML elements that match the provided filters
- ```
-
- ## License
-
- This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-
- ```
-
- This project is licensed under the MIT License
-
- MIT License
-
- The MIT License was first developed at the Massachusetts Institute of Technology (MIT) in the late 1980s. The exact origins MIT license are bit of mystery. Like the Apache 2.0, and BSD family of licenses the MIT License is a permissive software license that places few restrictions of reuse. Users of software using an MIT License are permitted to use, copy, modify, merge publish, distribute, sublicense and sell copies of the software. Some notable projects use the MIT License including Ruby on Rails, and the X Windows System.
- MIT License Conditions
- The MIT License is relatively simple and short. Below is the text of the MIT License from the Open Software Initiative.
- Begin license text.
-
- Copyright <YEAR> <COPYRIGHT HOLDER>
-
- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- End license text.
- Using MIT Licensed Code
-
-
-
- The basic conditions of using the MIT License are:
-
- 1. The original copyright notice
-
- 2. A copy of the license itself
-
- are including in all copies or any substantial portions of the software.
- MIT License Compatibility
-
- The MIT License is highly compatible with other permissive licenses. Including the BSD family of licenses. It is generally compatible with GNU GPL group of licenses. However if you distribute the code that contains or is derivative of GNU GPL code the final project must of GPL compliant. In other words any source code must of publicly available.
- MIT License, Patents
-
- The MIT License was developed before patenting software was a common practice in the U.S. It therefore does not contain an express patent license. The broad nature of the license in general, is considered by some to encompass an implicit waiver of patent rights. If you are concerned about patent rights, the Apache 2.0 license contains an explicit contributor's patent license.
- MIT No Attribution License (MIT-0)
-
- The MIT No Attribution License is a Public Domain equivalent license it is similar to the BSD Free license.
-
-
-
- Copyright <YEAR><COPYRIGHT HOLDER>
-
- Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-
- ```
- For additional details on usage and documentation of functions, refer to their respective docstrings in the module.