abstract-webtools 0.1.6.3__tar.gz → 0.1.6.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {abstract_webtools-0.1.6.3/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.5}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/setup.py +1 -1
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/crawlManager.py +0 -15
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/seleniumManager.py +18 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/soupManager.py +11 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/LICENSE +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/README.md +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/pyproject.toml +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/setup.cfg +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/__init__.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/abstract_webtools.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/__init__.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/cipherManager.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/domainManager.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/linkManager.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/mySocketClient.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/networkManager.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/requestManager.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/sslManager.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/urlManager.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/userAgentManager.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/videoDownloader.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/url_grabber.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools.egg-info/requires.txt +0 -0
- {abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.6.3/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.5}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.6.3
|
3
|
+
Version: 0.1.6.5
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
4
4
|
long_description = fh.read()
|
5
5
|
setuptools.setup(
|
6
6
|
name='abstract_webtools',
|
7
|
-
version='0.1.6.3',
|
7
|
+
version='0.1.6.05',
|
8
8
|
author='putkoff',
|
9
9
|
author_email='partners@abstractendeavors.com',
|
10
10
|
description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
|
@@ -1,21 +1,6 @@
|
|
1
1
|
from .soupManager import *
|
2
2
|
|
3
|
-
def normalize_url(url, base_url):
|
4
|
-
"""
|
5
|
-
Normalize and resolve relative URLs, ensuring proper domain and format.
|
6
|
-
"""
|
7
|
-
# If URL starts with the base URL repeated, remove the extra part
|
8
|
-
if url.startswith(base_url):
|
9
|
-
url = url[len(base_url):]
|
10
3
|
|
11
|
-
# Resolve the URL against the base URL
|
12
|
-
normalized_url = urljoin(base_url, url.split('#')[0])
|
13
|
-
|
14
|
-
# Ensure only URLs belonging to the base domain are kept
|
15
|
-
if not normalized_url.startswith(base_url):
|
16
|
-
return None
|
17
|
-
|
18
|
-
return normalized_url
|
19
4
|
class crawlManager():
|
20
5
|
def __init__(self,url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type="html.parser"):
|
21
6
|
self.url=url
|
@@ -46,6 +46,7 @@ class seleniumManager(metaclass=SingletonMeta):
|
|
46
46
|
parsed_url = urlparse(url)
|
47
47
|
self.domain = parsed_url.netloc
|
48
48
|
self.scheme = parsed_url.scheme
|
49
|
+
self.base_url= f"{self.scheme}{self.domain}"
|
49
50
|
self.site_dir = os.path.join(os.getcwd(), self.domain)
|
50
51
|
os.makedirs(self.site_dir, exist_ok=True)
|
51
52
|
self.drivers = {}
|
@@ -83,7 +84,24 @@ class seleniumManager(metaclass=SingletonMeta):
|
|
83
84
|
self.drivers[url] = driver
|
84
85
|
driver.get(url)
|
85
86
|
return self.drivers[url]
|
87
|
+
def normalize_url(url, base_url=None):
|
88
|
+
"""
|
89
|
+
Normalize and resolve relative URLs, ensuring proper domain and format.
|
90
|
+
"""
|
91
|
+
# If URL starts with the base URL repeated, remove the extra part
|
92
|
+
manager = seleniumManager(url)
|
93
|
+
base_url = manager.base_url
|
94
|
+
if url.startswith(base_url):
|
95
|
+
url = url[len(base_url):]
|
86
96
|
|
97
|
+
# Resolve the URL against the base URL
|
98
|
+
normalized_url = urljoin(base_url, url.split('#')[0])
|
99
|
+
|
100
|
+
# Ensure only URLs belonging to the base domain are kept
|
101
|
+
if not normalized_url.startswith(base_url):
|
102
|
+
return None
|
103
|
+
|
104
|
+
return normalized_url
|
87
105
|
# Function to get Selenium page source
|
88
106
|
def get_selenium_source(url):
|
89
107
|
url_mgr = urlManager(url)
|
@@ -349,3 +349,14 @@ def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=No
|
|
349
349
|
def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None):
|
350
350
|
soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
|
351
351
|
return soup_mgr.get_all_attribute_values(tags_list=tags_list)
|
352
|
+
def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,parse_type="html.parser"):
|
353
|
+
if source_code or soup_mgr:
|
354
|
+
if soup_mgr:
|
355
|
+
return soup_mgr.soup
|
356
|
+
return BeautifulSoup(source_code, parse_type)
|
357
|
+
url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
|
358
|
+
url = get_url(url=url,url_mgr=url_mgr)
|
359
|
+
req_mgr = req_mgr or get_req_mgr(url_mgr=url_mgr,url=url,source_code=source)
|
360
|
+
source_code = req_mgr.source_code
|
361
|
+
soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
|
362
|
+
return soup_mgr.soup
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5/src/abstract_webtools.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.6.3
|
3
|
+
Version: 0.1.6.5
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/abstract_webtools.py
RENAMED
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/big_user_agent_list.py
RENAMED
File without changes
|
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/sslManager.py
RENAMED
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/tlsAdapter.py
RENAMED
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/managers/urlManager.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/url_grabber.py
RENAMED
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools/url_grabber_new.py
RENAMED
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools.egg-info/SOURCES.txt
RENAMED
File without changes
|
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools.egg-info/requires.txt
RENAMED
File without changes
|
{abstract_webtools-0.1.6.3 → abstract_webtools-0.1.6.5}/src/abstract_webtools.egg-info/top_level.txt
RENAMED
File without changes
|