abstract-webtools 0.1.6.138__tar.gz → 0.1.6.140__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/setup.py +1 -1
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/__init__.py +1 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/linkManager/linkManager.py +46 -24
- abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/__init__.py +1 -0
- abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/imports.py +18 -0
- abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/src/UnifiedWebManager.py +135 -0
- abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/src/__init__.py +2 -0
- abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/src/legacy_tools.py +8 -0
- abstract_webtools-0.1.6.140/src/abstract_webtools/managers/requestManager/requestManager.py +400 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/soupManager/soupManager.py +10 -10
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/urlManager/urlManager.py +20 -28
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/SOURCES.txt +5 -0
- abstract_webtools-0.1.6.138/src/abstract_webtools/managers/requestManager/requestManager.py +0 -356
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/README.md +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/pyproject.toml +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/setup.cfg +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/__init__.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/abstract_usurpit.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/abstract_webtools.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/domain_identifier.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/extention_list.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/find_dirs.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/k2s_downloader.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/allss//.py" +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/cipherManager.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/clownworld/__init__.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/crawlManager.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/curlMgr.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/domainManager.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/get_test.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/mySocketClient.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/networkManager.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/seleniumManager.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/sslManager.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/urlManager/urlManager (Copy).py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/userAgentManager.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/videoDownloader.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/url_grabber.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/requires.txt +0 -0
- {abstract_webtools-0.1.6.138 → abstract_webtools-0.1.6.140}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: abstract_webtools
|
3
|
-
Version: 0.1.6.138
|
3
|
+
Version: 0.1.6.140
|
4
4
|
Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
|
5
5
|
Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
|
6
6
|
Author: putkoff
|
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
|
|
4
4
|
long_description = fh.read()
|
5
5
|
setuptools.setup(
|
6
6
|
name='abstract_webtools',
|
7
|
-
version='0.1.6.138',
|
7
|
+
version='0.1.6.140',
|
8
8
|
author='putkoff',
|
9
9
|
author_email='partners@abstractendeavors.com',
|
10
10
|
description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
|
@@ -2,18 +2,7 @@ from ...abstract_webtools import *
|
|
2
2
|
from ..urlManager import *
|
3
3
|
from ..requestManager import *
|
4
4
|
from ..soupManager import *
|
5
|
-
def get_url(url=None,url_mgr=None):
|
6
|
-
if not url and not url_mgr:
|
7
|
-
return None
|
8
|
-
if url:
|
9
|
-
url_mgr = urlManager(url)
|
10
|
-
return url_mgr.url
|
11
|
-
def get_url_mgr(url=None,url_mgr=None):
|
12
|
-
if not url and not url_mgr:
|
13
|
-
return None
|
14
|
-
if url:
|
15
|
-
url_mgr = urlManager(url)
|
16
|
-
return url_mgr
|
5
|
+
from ..middleManager import *
|
17
6
|
class linkManager:
|
18
7
|
"""
|
19
8
|
LinkManager is a class for managing and extracting links and image links from a web page.
|
@@ -50,13 +39,40 @@ class linkManager:
|
|
50
39
|
- The LinkManager class helps manage and extract links and image links from web pages.
|
51
40
|
- The class provides flexibility in specifying criteria for link extraction.
|
52
41
|
"""
|
53
|
-
def __init__(self,
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
42
|
+
def __init__(self,
|
43
|
+
url=None,
|
44
|
+
source_code=None,
|
45
|
+
soup=None,
|
46
|
+
url_mgr=None,
|
47
|
+
req_mgr=None,
|
48
|
+
soup_mgr=None,
|
49
|
+
image_link_tags='img',
|
50
|
+
img_link_attrs='src',
|
51
|
+
link_tags='a',
|
52
|
+
link_attrs='href',
|
53
|
+
strict_order_tags=False,
|
54
|
+
img_attr_value_desired=None,
|
55
|
+
img_attr_value_undesired=None,
|
56
|
+
link_attr_value_desired=None,
|
57
|
+
link_attr_value_undesired=None,
|
58
|
+
associated_data_attr=["data-title",'alt','title'],
|
59
|
+
get_img=["data-title",'alt','title']
|
60
|
+
):
|
61
|
+
|
62
|
+
|
63
|
+
all_tools = get_soup_tools(
|
64
|
+
url=url,
|
65
|
+
url_mgr=url_mgr,
|
66
|
+
source_code=source_code,
|
67
|
+
req_mgr=req_mgr,
|
68
|
+
soup=soup,
|
69
|
+
soup_mgr=soup_mgr,
|
70
|
+
target_manager = self
|
71
|
+
)
|
59
72
|
|
73
|
+
|
74
|
+
|
75
|
+
|
60
76
|
self.strict_order_tags=strict_order_tags
|
61
77
|
self.image_link_tags=image_link_tags
|
62
78
|
self.img_link_attrs=img_link_attrs
|
@@ -171,19 +187,25 @@ class linkManager:
|
|
171
187
|
if url not in domains_ls and comp_domain == domain:
|
172
188
|
domains_ls.append(url)
|
173
189
|
return domains_ls
|
190
|
+
|
174
191
|
def find_all_desired_links(self,tag='img', attr='src',attr_value_desired=None,strict_order_tags=False,attr_value_undesired=None,associated_data_attr=None,all_desired=None,get_img=None):
|
175
192
|
all_desired = all_desired or self.find_all_desired(tag=tag,attr=attr,strict_order_tags=strict_order_tags,attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired,associated_data_attr=associated_data_attr,get_img=get_img)
|
176
193
|
assiciated_attrs = all_desired[-1]
|
177
194
|
valid_assiciated_attrs = []
|
178
195
|
desired_links=[]
|
179
196
|
for i,attr in enumerate(all_desired[:-1]):
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
197
|
+
|
198
|
+
self.url_mgr.domain = self.url_mgr.domain or ''
|
199
|
+
|
200
|
+
self.url_mgr.protocol = self.url_mgr.protocol or 'https'
|
201
|
+
|
202
|
+
if attr:
|
203
|
+
valid_attr=self.url_mgr.make_valid(attr,self.url_mgr.protocol+'://'+self.url_mgr.domain)
|
204
|
+
if valid_attr:
|
205
|
+
desired_links.append(valid_attr)
|
206
|
+
valid_assiciated_attrs.append(assiciated_attrs[i])
|
207
|
+
valid_assiciated_attrs[-1]["link"]=valid_attr
|
185
208
|
desired_links.append(valid_assiciated_attrs)
|
186
209
|
return desired_links
|
187
210
|
|
188
|
-
|
189
211
|
|
@@ -0,0 +1 @@
|
|
1
|
+
from .src import *
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import logging
|
2
|
+
from ..urlManager import (
|
3
|
+
urlManager,
|
4
|
+
get_url,
|
5
|
+
get_url_mgr
|
6
|
+
)
|
7
|
+
from ..requestManager import (
|
8
|
+
requestManager,
|
9
|
+
get_source,
|
10
|
+
get_req_mgr
|
11
|
+
)
|
12
|
+
from ..soupManager import (
|
13
|
+
soupManager,
|
14
|
+
get_soup,
|
15
|
+
get_soup_mgr
|
16
|
+
)
|
17
|
+
from bs4 import BeautifulSoup
|
18
|
+
logging.basicConfig(level=logging.INFO)
|
abstract_webtools-0.1.6.140/src/abstract_webtools/managers/middleManager/src/UnifiedWebManager.py
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
from ..imports import *
|
2
|
+
|
3
|
+
class UnifiedWebManager:
|
4
|
+
"""
|
5
|
+
Unified middleware that ties together URL, request, and soup managers.
|
6
|
+
Lazily initializes components based on provided inputs.
|
7
|
+
|
8
|
+
Args:
|
9
|
+
url (str or None): The base URL.
|
10
|
+
source_code (str or bytes or None): Pre-fetched source code.
|
11
|
+
url_mgr (urlManager or None): Existing URL manager.
|
12
|
+
req_mgr (requestManager or None): Existing request manager.
|
13
|
+
soup_mgr (soupManager or None): Existing soup manager.
|
14
|
+
parse_type (str): Parser type for BeautifulSoup (default: "html.parser").
|
15
|
+
"""
|
16
|
+
def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_mgr=None,soup=None, parse_type="html.parser"):
|
17
|
+
self._url = url
|
18
|
+
self._source_code = source_code
|
19
|
+
self._url_mgr = url_mgr
|
20
|
+
self._req_mgr = req_mgr
|
21
|
+
self._soup_mgr = soup_mgr
|
22
|
+
self._parse_type = parse_type
|
23
|
+
self._soup = None # Lazy
|
24
|
+
|
25
|
+
@property
|
26
|
+
def url_mgr(self):
|
27
|
+
if self._url_mgr is None:
|
28
|
+
if self._url is None:
|
29
|
+
logging.warning("No URL provided; URL manager cannot be created.")
|
30
|
+
return None
|
31
|
+
self._url_mgr = urlManager(url=self._url)
|
32
|
+
return self._url_mgr
|
33
|
+
|
34
|
+
@property
|
35
|
+
def url(self):
|
36
|
+
if self._url is None and self.url_mgr:
|
37
|
+
self._url = self.url_mgr.url
|
38
|
+
return self._url
|
39
|
+
|
40
|
+
@property
|
41
|
+
def req_mgr(self):
|
42
|
+
if self._req_mgr is None:
|
43
|
+
self._req_mgr = requestManager(
|
44
|
+
url=self.url,
|
45
|
+
url_mgr=self.url_mgr,
|
46
|
+
source_code=self._source_code
|
47
|
+
)
|
48
|
+
return self._req_mgr
|
49
|
+
|
50
|
+
@property
|
51
|
+
def source_code(self):
|
52
|
+
if self._source_code is None and self.req_mgr:
|
53
|
+
self._source_code = self.req_mgr.source_code
|
54
|
+
return self._source_code
|
55
|
+
|
56
|
+
@property
|
57
|
+
def soup_mgr(self):
|
58
|
+
if self._soup_mgr is None:
|
59
|
+
self._soup_mgr = soupManager(
|
60
|
+
url=self.url,
|
61
|
+
url_mgr=self.url_mgr,
|
62
|
+
req_mgr=self.req_mgr,
|
63
|
+
source_code=self.source_code
|
64
|
+
)
|
65
|
+
return self._soup_mgr
|
66
|
+
|
67
|
+
@property
|
68
|
+
def soup(self):
|
69
|
+
if self._soup is None:
|
70
|
+
source = self.source_code
|
71
|
+
if source is None:
|
72
|
+
logging.warning("No source code available; Soup cannot be created.")
|
73
|
+
return None
|
74
|
+
if isinstance(source, bytes):
|
75
|
+
source = source.decode('utf-8', errors='ignore')
|
76
|
+
self._soup = BeautifulSoup(source, self._parse_type)
|
77
|
+
return self._soup
|
78
|
+
|
79
|
+
def update_url(self, url):
|
80
|
+
"""Update the URL and reset dependent managers."""
|
81
|
+
self._url = url
|
82
|
+
self._url_mgr = None
|
83
|
+
self._req_mgr = None
|
84
|
+
self._soup_mgr = None
|
85
|
+
self._source_code = None
|
86
|
+
self._soup = None
|
87
|
+
|
88
|
+
def update_source_code(self, source_code):
|
89
|
+
"""Update the source code and reset dependent managers."""
|
90
|
+
self._source_code = source_code
|
91
|
+
self._req_mgr = None
|
92
|
+
self._soup_mgr = None
|
93
|
+
self._soup = None
|
94
|
+
|
95
|
+
# Convenience methods for direct access
|
96
|
+
def get_all_tools(self):
|
97
|
+
"""Return a dict with all components (similar to original getters)."""
|
98
|
+
return {
|
99
|
+
'url': self.url,
|
100
|
+
'url_mgr': self.url_mgr,
|
101
|
+
'source_code': self.source_code,
|
102
|
+
'req_mgr': self.req_mgr,
|
103
|
+
'soup': self.soup,
|
104
|
+
'soup_mgr': self.soup_mgr
|
105
|
+
}
|
106
|
+
def endow_to_manager(self, target_manager, all_tools=None):
|
107
|
+
"""
|
108
|
+
Endow (assign) the attributes from all_tools to the target manager instance.
|
109
|
+
|
110
|
+
Args:
|
111
|
+
target_manager: The instance (e.g., another manager class) to endow attributes to.
|
112
|
+
all_tools (dict or None): Optional dict of tools/attributes. If None, uses self.get_all_tools().
|
113
|
+
"""
|
114
|
+
if all_tools is None:
|
115
|
+
all_tools = self.get_all_tools()
|
116
|
+
for key, value in all_tools.items():
|
117
|
+
setattr(target_manager, key, value)
|
118
|
+
# Wrapper functions for backward compatibility
|
119
|
+
def get_url_tools(url=None, url_mgr=None):
|
120
|
+
mgr = UnifiedWebManager(url=url, url_mgr=url_mgr)
|
121
|
+
return {'url': mgr.url, 'url_mgr': mgr.url_mgr}
|
122
|
+
|
123
|
+
def get_req_tools(url=None, url_mgr=None, source_code=None, req_mgr=None):
|
124
|
+
mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr)
|
125
|
+
return {'url': mgr.url, 'url_mgr': mgr.url_mgr, 'source_code': mgr.source_code, 'req_mgr': mgr.req_mgr}
|
126
|
+
|
127
|
+
def get_soup_tools(url=None, url_mgr=None, source_code=None, req_mgr=None, soup=None, soup_mgr=None,target_manager=None):
|
128
|
+
mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr, soup_mgr=soup_mgr)
|
129
|
+
if soup is not None:
|
130
|
+
mgr._soup = soup # Allow overriding
|
131
|
+
if target_manager:
|
132
|
+
mgr.endow_to_manager(target_manager, all_tools=None)
|
133
|
+
return mgr.get_all_tools()
|
134
|
+
|
135
|
+
|