abstract-webtools 0.1.6.140__py3-none-any.whl → 0.1.6.142__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/managers/linkManager/linkManager.py +10 -14
- abstract_webtools/managers/middleManager/src/UnifiedWebManage3r.py +136 -0
- abstract_webtools/managers/middleManager/src/UnifiedWebManager.py +21 -26
- abstract_webtools/managers/requestManager/requestManager.py +83 -84
- abstract_webtools/managers/soupManager/soupManager.py +11 -8
- {abstract_webtools-0.1.6.140.dist-info → abstract_webtools-0.1.6.142.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.140.dist-info → abstract_webtools-0.1.6.142.dist-info}/RECORD +9 -8
- {abstract_webtools-0.1.6.140.dist-info → abstract_webtools-0.1.6.142.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.140.dist-info → abstract_webtools-0.1.6.142.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/linkManager/linkManager.py

@@ -2,7 +2,7 @@ from ...abstract_webtools import *
 from ..urlManager import *
 from ..requestManager import *
 from ..soupManager import *
-
+
 class linkManager:
     """
     LinkManager is a class for managing and extracting links and image links from a web page.
@@ -46,6 +46,7 @@ class linkManager:
                  url_mgr=None,
                  req_mgr=None,
                  soup_mgr=None,
+                 parse_type=None,
                  image_link_tags='img',
                  img_link_attrs='src',
                  link_tags='a',
@@ -58,21 +59,15 @@ class linkManager:
                  associated_data_attr=["data-title",'alt','title'],
                  get_img=["data-title",'alt','title']
                  ):
-
-
-        all_tools = get_soup_tools(
-            url=url,
-            url_mgr=url_mgr,
-            source_code=source_code,
-            req_mgr=req_mgr,
-            soup=soup,
-            soup_mgr=soup_mgr,
-            target_manager = self
-            )
+        self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
+        self.url = get_url(url=url,url_mgr=self.url_mgr)
+        self.req_mgr = get_req_mgr(url=self.url,url_mgr=self.url_mgr,source_code=source_code,req_mgr=req_mgr)
+        self.source_code = get_source(url=self.url,url_mgr=self.url_mgr,source_code=source_code,req_mgr=self.req_mgr)
+        self.soup_mgr = get_soup_mgr(url=self.url,url_mgr=self.url_mgr,source_code=self.source_code,req_mgr=self.req_mgr,soup_mgr=soup_mgr,soup=soup,parse_type=parse_type)
 
-
-
+        self.soup = get_soup(url=self.url,url_mgr=self.url_mgr,req_mgr=self.req_mgr,source_code=self.source_code,soup_mgr=self.soup_mgr)
+
         self.strict_order_tags=strict_order_tags
         self.image_link_tags=image_link_tags
         self.img_link_attrs=img_link_attrs
@@ -94,6 +89,7 @@ class linkManager:
             attr_value_undesired=self.link_attr_value_undesired,
             associated_data_attr=self.associated_data_attr,
             get_img=get_img)
+
     def re_initialize(self):
         self.all_desired_image_links=self.find_all_desired_links(tag=self.image_link_tags,attr=self.img_link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.img_attr_value_desired,attr_value_undesired=self.img_attr_value_undesired)
         self.all_desired_links=self.find_all_desired_links(tag=self.link_tags,attr=self.link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.link_attr_value_desired,attr_value_undesired=self.link_attr_value_undesired,associated_data_attr=self.associated_data_attr,get_img=self.get_img)
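The constructor rework above means linkManager now resolves its own url, request, and soup managers instead of delegating to get_soup_tools, and it forwards a new parse_type argument to the soup manager. A minimal usage sketch, assuming linkManager is importable from the module path listed in RECORD; the URL and HTML are placeholders, and the attributes read at the end are the ones re_initialize() populates:

from abstract_webtools.managers.linkManager.linkManager import linkManager

# Build from a URL alone; url/request/soup managers are resolved internally.
link_mgr = linkManager(url="https://example.com", parse_type="html.parser")

# Or reuse pre-fetched HTML and skip the network round trip.
html = "<html><body><a href='/about'>About</a><img src='/logo.png'/></body></html>"
offline_mgr = linkManager(url="https://example.com", source_code=html)

offline_mgr.re_initialize()
print(offline_mgr.all_desired_links)        # extracted <a> targets
print(offline_mgr.all_desired_image_links)  # extracted <img> sources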
abstract_webtools/managers/middleManager/src/UnifiedWebManage3r.py (new file)

@@ -0,0 +1,136 @@
+from ..imports import *
+
+class UnifiedWebManager:
+    """
+    Unified middleware that ties together URL, request, and soup managers.
+    Lazily initializes components based on provided inputs.
+
+    Args:
+        url (str or None): The base URL.
+        source_code (str or bytes or None): Pre-fetched source code.
+        url_mgr (urlManager or None): Existing URL manager.
+        req_mgr (requestManager or None): Existing request manager.
+        soup_mgr (soupManager or None): Existing soup manager.
+        parse_type (str): Parser type for BeautifulSoup (default: "html.parser").
+    """
+    def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_mgr=None,soup=None, parse_type="html.parser"):
+        self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
+        self.url = get_url(url=url,url_mgr=self.url_mgr)
+        self.req_mgr = get_source(url=self.url,url_mgr=self.url_mgr,source_code=source_code,req_mgr=req_mgr)
+        self.source_code = get_source(url=self.url,url_mgr=self.url_mgr,source_code=source_code,req_mgr=self.req_mgr)
+        self.soup_mgr = get_soup_mgr(url=self.url,url_mgr=self.url_mgr,source_code=self.source_code,req_mgr=self.req_mgr,soup_mgr=soup_mgr,soup=soup,parse_type=parse_type)
+        self.soup = get_soup(url=self.url,url_mgr=self.url_mgr,req_mgr=self.req_mgr,source_code=self.source_code,soup_mgr=self.soup_mgr)
+
+    @property
+    def url_mgr(self):
+        if self.url_mgr is None:
+            if self.url is None:
+                logging.warning("No URL provided; URL manager cannot be created.")
+                return None
+            self.url_mgr = urlManager(url=self.url)
+        return self.url_mgr
+
+    @property
+    def url(self):
+        if self.url is None and self.url_mgr:
+            self.url = self.url_mgr.url
+        return self.url
+
+    @property
+    def req_mgr(self):
+        if self.req_mgr is None:
+            self.req_mgr = requestManager(
+                url=self.url,
+                url_mgr=self.url_mgr,
+                source_code=self.source_code
+            )
+        return self.req_mgr
+
+    @property
+    def source_code(self):
+        if self.source_code is None and self.req_mgr:
+            self.source_code = self.req_mgr.source_code
+        return self.source_code
+
+    @property
+    def soup_mgr(self):
+        if self.soup_mgr is None:
+            self.soup_mgr = soupManager(
+                url=self.url,
+                url_mgr=self.url_mgr,
+                req_mgr=self.req_mgr,
+                source_code=self.source_code
+            )
+        return self.soup_mgr
+
+    @property
+    def soup(self):
+        if self.soup is None:
+            source = self.source_code
+            if source is None:
+                logging.warning("No source code available; Soup cannot be created.")
+                return None
+            if isinstance(source, bytes):
+                source = source.decode('utf-8', errors='ignore')
+            self.soup = BeautifulSoup(source, self.parse_type)
+        return self.soup
+
+    def update_url(self, url):
+        """Update the URL and reset dependent managers."""
+        self.url = url
+        self.url_mgr = None
+        self.req_mgr = None
+        self.soup_mgr = None
+        self.source_code = None
+        self.soup = None
+
+    def update_source_code(self, source_code):
+        """Update the source code and reset dependent managers."""
+        self.source_code = source_code
+        self.req_mgr = None
+        self.soup_mgr = None
+        self.soup = None
+
+    # Convenience methods for direct access
+    def get_all_tools(self):
+        """Return a dict with all components (similar to original getters)."""
+        return {
+            'url': self.url,
+            'url_mgr': self.url_mgr,
+            'source_code': self.source_code,
+            'req_mgr': self.req_mgr,
+            'soup': self.soup,
+            'soup_mgr': self.soup_mgr
+        }
+    def endow_to_manager(self, target_manager, all_tools=None):
+        """
+        Endow (assign) the attributes from all_tools to the target manager instance.
+
+        Args:
+            target_manager: The instance (e.g., another manager class) to endow attributes to.
+            all_tools (dict or None): Optional dict of tools/attributes. If None, uses self.get_all_tools().
+        """
+        if all_tools is None:
+            all_tools = self.get_all_tools()
+        for key, value in all_tools.items():
+            setattr(target_manager, key, value)
+        return target_manager
+# Wrapper functions for backward compatibility
+def get_url_tools(url=None, url_mgr=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr)
+    return {'url': mgr.url, 'url_mgr': mgr.url_mgr}
+
+def get_req_tools(url=None, url_mgr=None, source_code=None, req_mgr=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr)
+    return {'url': mgr.url, 'url_mgr': mgr.url_mgr, 'source_code': mgr.source_code, 'req_mgr': mgr.req_mgr}
+
+def get_soup_tools(url=None, url_mgr=None, source_code=None, req_mgr=None, soup=None, soup_mgr=None,target_manager=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr, soup_mgr=soup_mgr)
+    if soup is not None:
+        mgr.soup = soup # Allow overriding
+    if target_manager:
+        mgr.endow_to_manager(target_manager, all_tools=None)
+        return target_manager
+    return mgr.get_all_tools()
+
+
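The new module is built around lazy initialization: components are constructed only when first needed, and updating the URL or source code invalidates everything derived from them. That pattern can be shown in isolation with a self-contained sketch; this is illustrative only and not the package's own code:

import logging
import requests
from bs4 import BeautifulSoup

class LazyPage:
    """Illustrative stand-in: fetch lazily, invalidate downstream state on update."""
    def __init__(self, url=None, source_code=None, parse_type="html.parser"):
        self._url = url
        self._source_code = source_code
        self._soup = None
        self._parse_type = parse_type

    @property
    def source_code(self):
        # Built on first access; reuses a pre-supplied value if one was given.
        if self._source_code is None:
            if self._url is None:
                logging.warning("No URL provided; nothing to fetch.")
                return None
            self._source_code = requests.get(self._url, timeout=10).text
        return self._source_code

    @property
    def soup(self):
        if self._soup is None and self.source_code is not None:
            self._soup = BeautifulSoup(self.source_code, self._parse_type)
        return self._soup

    def update_url(self, url):
        # Changing the URL invalidates everything derived from it.
        self._url, self._source_code, self._soup = url, None, None

page = LazyPage(source_code="<p>cached</p>")
print(page.soup.p.text)  # 'cached', parsed lazily on first access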
abstract_webtools/managers/middleManager/src/UnifiedWebManager.py

@@ -1,5 +1,9 @@
+import logging
+from bs4 import BeautifulSoup
 from ..imports import *
 
+logging.basicConfig(level=logging.INFO)
+
 class UnifiedWebManager:
     """
     Unified middleware that ties together URL, request, and soup managers.
@@ -11,16 +15,17 @@ class UnifiedWebManager:
         url_mgr (urlManager or None): Existing URL manager.
         req_mgr (requestManager or None): Existing request manager.
         soup_mgr (soupManager or None): Existing soup manager.
+        soup (BeautifulSoup or None): Pre-parsed soup object.
         parse_type (str): Parser type for BeautifulSoup (default: "html.parser").
     """
-    def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_mgr=None,soup=None, parse_type="html.parser"):
+    def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_mgr=None, soup=None, parse_type="html.parser"):
         self._url = url
         self._source_code = source_code
         self._url_mgr = url_mgr
         self._req_mgr = req_mgr
         self._soup_mgr = soup_mgr
+        self._soup = soup
         self._parse_type = parse_type
-        self._soup = None  # Lazy
 
     @property
     def url_mgr(self):
@@ -28,40 +33,31 @@ class UnifiedWebManager:
             if self._url is None:
                 logging.warning("No URL provided; URL manager cannot be created.")
                 return None
-            self._url_mgr =
+            self._url_mgr = get_url_mgr(url=self._url)
         return self._url_mgr
 
     @property
     def url(self):
-        if self._url is None and self.url_mgr:
-            self._url = self.url_mgr
+        if self._url is None and self.url_mgr is not None:
+            self._url = get_url(url_mgr=self.url_mgr)
         return self._url
 
     @property
     def req_mgr(self):
         if self._req_mgr is None:
-            self._req_mgr =
-                url=self.url,
-                url_mgr=self.url_mgr,
-                source_code=self._source_code
-            )
+            self._req_mgr = get_req_mgr(url=self.url, url_mgr=self.url_mgr, source_code=self._source_code)
         return self._req_mgr
 
     @property
     def source_code(self):
-        if self._source_code is None and self.req_mgr:
-            self._source_code = self.req_mgr
+        if self._source_code is None and self.req_mgr is not None:
+            self._source_code = get_source(req_mgr=self.req_mgr)
         return self._source_code
 
     @property
     def soup_mgr(self):
         if self._soup_mgr is None:
-            self._soup_mgr =
-                url=self.url,
-                url_mgr=self.url_mgr,
-                req_mgr=self.req_mgr,
-                source_code=self.source_code
-            )
+            self._soup_mgr = get_soup_mgr(url=self.url, url_mgr=self.url_mgr, source_code=self.source_code, req_mgr=self.req_mgr)
         return self._soup_mgr
 
     @property
@@ -73,7 +69,7 @@ class UnifiedWebManager:
                 return None
             if isinstance(source, bytes):
                 source = source.decode('utf-8', errors='ignore')
-            self._soup =
+            self._soup = get_soup(source_code=source, parse_type=self._parse_type)
         return self._soup
 
     def update_url(self, url):
@@ -103,6 +99,7 @@ class UnifiedWebManager:
             'soup': self.soup,
             'soup_mgr': self.soup_mgr
         }
+
     def endow_to_manager(self, target_manager, all_tools=None):
         """
         Endow (assign) the attributes from all_tools to the target manager instance.
@@ -115,6 +112,8 @@ class UnifiedWebManager:
             all_tools = self.get_all_tools()
         for key, value in all_tools.items():
             setattr(target_manager, key, value)
+        return target_manager
+
 # Wrapper functions for backward compatibility
 def get_url_tools(url=None, url_mgr=None):
     mgr = UnifiedWebManager(url=url, url_mgr=url_mgr)
@@ -124,12 +123,8 @@ def get_req_tools(url=None, url_mgr=None, source_code=None, req_mgr=None):
     mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr)
     return {'url': mgr.url, 'url_mgr': mgr.url_mgr, 'source_code': mgr.source_code, 'req_mgr': mgr.req_mgr}
 
-def get_soup_tools(url=None, url_mgr=None, source_code=None, req_mgr=None, soup=None, soup_mgr=None,target_manager=None):
-    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr, soup_mgr=soup_mgr)
-    if soup is not None:
-        mgr._soup = soup # Allow overriding
+def get_soup_tools(url=None, url_mgr=None, source_code=None, req_mgr=None, soup=None, soup_mgr=None, target_manager=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr, soup_mgr=soup_mgr, soup=soup)
     if target_manager:
-        mgr.endow_to_manager(target_manager
+        return mgr.endow_to_manager(target_manager)
     return mgr.get_all_tools()
-
-
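With the property bodies filled in and endow_to_manager now returning its target, get_soup_tools can either hand back a dict of tools or populate another object in place. A usage sketch, assuming the names are importable from the module path listed in RECORD; the consuming class and URL are hypothetical:

from abstract_webtools.managers.middleManager.src.UnifiedWebManager import get_soup_tools

# Dictionary form: build everything from a URL (or from cached source_code).
tools = get_soup_tools(url="https://example.com")
print(sorted(tools))  # ['req_mgr', 'soup', 'soup_mgr', 'source_code', 'url', 'url_mgr']

# Endowment form: copy the same attributes onto an arbitrary consumer object.
class PageConsumer:  # hypothetical target class
    pass

consumer = get_soup_tools(url="https://example.com", target_manager=PageConsumer())
print(consumer.soup is not None)  # the endowed BeautifulSoup object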
abstract_webtools/managers/requestManager/requestManager.py

@@ -59,90 +59,89 @@ class requestManager:
     - The SafeRequest class is designed for making HTTP requests with error handling and retries.
     - It provides methods for authentication, response handling, and error management.
     """
-    def __init__(self,
-        (82 lines of the old signature and body were not captured in the extracted diff)
-        self.re_initialize()
+    def __init__(self,url=None,
+                 source_code=None,
+                 url_mgr=None,
+                 network_manager=None,
+                 user_agent_manager=None,
+                 ssl_manager=None,
+                 ssl_options=None,
+                 tls_adapter=None,
+                 user_agent=None,
+                 proxies=None,
+                 headers=None,
+                 cookies=None,
+                 session=None,
+                 adapter=None,
+                 protocol=None,
+                 ciphers=None,
+                 spec_login=False,
+                 login_referer=None,
+                 login_user_agent=None,
+                 auth=None,
+                 login_url=None,
+                 email=None,
+                 password=None,
+                 checkbox=None,
+                 dropdown=None,
+                 certification=None,
+                 stream=False,
+                 timeout=None,
+                 last_request_time=None,
+                 max_retries=None,
+                 request_wait_limit=None):
+        self.url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
+        self.url = get_url(url=url, url_mgr=self.url_mgr)
+        self._url_mgr = self.url_mgr
+        self._url = self.url
+        self.user_agent = user_agent
+        self.user_agent_manager = user_agent_manager or UserAgentManager(user_agent=self.user_agent)
+        self.headers = headers or self.user_agent_manager.header or {'Accept': '*/*'}
+        self.user_agent = self.user_agent_manager.user_agent
+        self.ciphers = ciphers or CipherManager().ciphers_string
+        self.certification = certification
+        self.ssl_options = ssl_options
+        self.ssl_manager = ssl_manager or SSLManager(ciphers=self.ciphers, ssl_options=self.ssl_options, certification=self.certification)
+        self.tls_adapter = tls_adapter or TLSAdapter(ssl_manager=self.ssl_manager, certification=self.certification, ssl_options=self.ssl_manager.ssl_options)
+        self.network_manager = network_manager or NetworkManager(user_agent_manager=self.user_agent_manager, ssl_manager=self.ssl_manager, tls_adapter=self.tls_adapter, user_agent=user_agent, proxies=proxies, cookies=cookies, ciphers=ciphers, certification=certification, ssl_options=ssl_options)
+        self.stream = stream
+        self.tls_adapter = self.network_manager.tls_adapter
+        self.ciphers = self.network_manager.ciphers
+        self.certification = self.network_manager.certification
+        self.ssl_options = self.network_manager.ssl_options
+        self.proxies = self.network_manager.proxies
+        self.timeout = timeout
+        self.cookies = self.network_manager.cookies
+        self.session = session or requests.Session()
+        self.auth = auth
+        self.spec_login = spec_login
+        self.password = password
+        self.email = email
+        self.checkbox = checkbox
+        self.dropdown = dropdown
+        self.login_url = login_url
+        self.login_user_agent = login_user_agent
+        self.login_referer = login_referer
+        self.protocol = protocol or 'https://'
+        self.stream = stream if isinstance(stream, bool) else False
+        self.initialize_session()
+        self.last_request_time = last_request_time
+        self.max_retries = max_retries or 3
+        self.request_wait_limit = request_wait_limit or 1.5
+        self._response = None
+        self.status_code = None
+        self.source_code = None
+        self.source_code_bytes = None
+        self.source_code_json = {}
+        self.react_source_code = []
+        self.extracted_urls = []
+        self.php_blocks = []
+        self._response_data = None
+        if source_code is not None:
+            self._response = source_code
+            self.process_response_data()
+        else:
+            self.re_initialize()
 
     def update_url_mgr(self, url_mgr):
         self.url_mgr = url_mgr
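The rewritten __init__ adds a second construction path: when source_code is supplied it is stored as the response and processed directly, so no HTTP request is issued; otherwise re_initialize() fetches the URL as before. A sketch of both paths, assuming requestManager is importable from the module path in RECORD; the URL and HTML are placeholders, and the attributes read in the comments are expectations rather than guarantees of this diff:

from abstract_webtools.managers.requestManager.requestManager import requestManager

# Path 1: fetch over the network (the else-branch calls re_initialize()).
req = requestManager(url="https://example.com", max_retries=3, request_wait_limit=1.5)
print(req.status_code)  # populated by the response processing, None if the fetch failed

# Path 2: wrap pre-fetched source; __init__ stores it as the response and
# calls process_response_data() instead of issuing an HTTP request.
req_offline = requestManager(url="https://example.com",
                             source_code="<html><body>cached</body></html>")
print(req_offline.source_code)  # derived from the supplied source, not a fetch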
abstract_webtools/managers/soupManager/soupManager.py

@@ -39,7 +39,7 @@ class soupManager:
     - The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
     - It provides various methods to extract data and discover elements within the source code.
     """
-    def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None, parse_type="html.parser"):
+    def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None,soup=None, parse_type="html.parser"):
         self.soup=[]
         url = get_url(url=url,url_mgr=url_mgr)
         self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
@@ -50,7 +50,7 @@ class soupManager:
         if source_code:
             source_code = str(source_code)
         self.source_code = source_code or ''
-        self.soup= BeautifulSoup(self.source_code, self.parse_type)
+        self.soup= soup or BeautifulSoup(self.source_code, self.parse_type)
         self.all_tags_and_attribute_names = self.get_all_tags_and_attribute_names()
         self.all_tags = self.all_tags_and_attribute_names.get('tags')
         self.all_attribute_names = self.all_tags_and_attribute_names.get('attributes')
@@ -340,7 +340,8 @@ class SoupManagerSingleton():
         elif parse_type != SoupManagerSingleton._instance.parse_type or source_code != SoupManagerSingleton._instance.source_code:
             SoupManagerSingleton._instance = SoupManager(url_mgr,requestManager,parse_type=parse_type,source_code=source_code)
         return SoupManagerSingleton._instance
-def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,parse_type=
+def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,soup=None,parse_type=None):
+    parse_type = parse_type or "html.parser"
     if source_code or soup_mgr:
         if soup_mgr:
             return soup_mgr.soup
@@ -349,14 +350,16 @@ def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,
     url = get_url(url=url,url_mgr=url_mgr)
     req_mgr = req_mgr or get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
     source_code = req_mgr.source_code
-    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
+    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr,soup=soup)
     return soup_mgr.soup
-def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,parse_type=
+def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,soup=None,parse_type=None):
+    parse_type = parse_type or "html.parser"
     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
     url = get_url(url=url,url_mgr=url_mgr)
     req_mgr = get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
-    soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code)
+    soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code,soup=soup)
     return soup_mgr
-def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None,parse_type=
-
+def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,soup=None,tags_list = None,parse_type=None):
+    parse_type = parse_type or "html.parser"
+    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr,soup=soup)
     return soup_mgr.get_all_attribute_values(tags_list=tags_list)
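These changes thread an optional pre-parsed soup through soupManager and its module-level helpers, and let parse_type default to "html.parser" when omitted. A short sketch, assuming the names are importable from the module path in RECORD and using placeholder markup:

from bs4 import BeautifulSoup
from abstract_webtools.managers.soupManager.soupManager import soupManager, get_soup_mgr

html = "<html><body><a id='home' href='/'>Home</a></body></html>"
prebuilt = BeautifulSoup(html, "html.parser")

# soupManager now accepts a pre-parsed soup and skips re-parsing the source.
mgr = soupManager(url="https://example.com", source_code=html, soup=prebuilt)
print(mgr.soup is prebuilt)  # expected True: self.soup = soup or BeautifulSoup(...)

# The module-level helper threads the same soup through to the manager it builds,
# with parse_type=None falling back to "html.parser".
mgr2 = get_soup_mgr(url="https://example.com", source_code=html, soup=prebuilt)
print(mgr2.all_tags)  # tag inventory collected in __init__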
{abstract_webtools-0.1.6.140.dist-info → abstract_webtools-0.1.6.142.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.140
+Version: 0.1.6.142
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.140.dist-info → abstract_webtools-0.1.6.142.dist-info}/RECORD

@@ -30,21 +30,22 @@
 abstract_webtools/managers/clownworld/__init__.py,sha256=eq25euhRbFqHLm1ibi_7FGz_oNWs-kkyAkETzK3r4_Q,35
 abstract_webtools/managers/clownworld/get_bolshevid_video.py,sha256=dNZdOxhXSA13DWFjdSOmvYrI3HybkrrvTBaMDbJfhfo,10140
 abstract_webtools/managers/linkManager/__init__.py,sha256=NpfWNzvTLSfsIWSeLYIxPzeLHADk_grSx5rfgCeWERw,27
-abstract_webtools/managers/linkManager/linkManager.py,sha256=
+abstract_webtools/managers/linkManager/linkManager.py,sha256=QrAJq-Zt907jnsm2P9si8SQ5O5QrXor7Jn5W5rge4xU,12662
 abstract_webtools/managers/middleManager/__init__.py,sha256=RLLS1CxPpixIiV50P6tFaJcQ9C2O3lz19I4EDMc_4rE,19
 abstract_webtools/managers/middleManager/imports.py,sha256=T0cdlABayG64RI4PnDRf7gwLvcQ5owobD0EdaD0Fcuc,334
-abstract_webtools/managers/middleManager/src/
+abstract_webtools/managers/middleManager/src/UnifiedWebManage3r.py,sha256=j_EBd2QkGFTLBKUen9k-mRWHfT6NwtFfoFCI-AagKtA,5442
+abstract_webtools/managers/middleManager/src/UnifiedWebManager.py,sha256=qYCvfjUbyXrJEvOEqX7SkW7qyoaXP641DUno0N2ivN8,5022
 abstract_webtools/managers/middleManager/src/__init__.py,sha256=YaSAh7AG1EvFWFZBIe4pGvzmfr60rpR9ZDWoQKqAMd0,61
 abstract_webtools/managers/middleManager/src/legacy_tools.py,sha256=2cCnRaq8UO7HdtffNtAOsZFJm_mpZbpvBuX0pIIWGaM,125
 abstract_webtools/managers/requestManager/__init__.py,sha256=z2qGtweEoO_OKr959LGxVXEMu1hu7PIkmh89BEh5TI8,30
-abstract_webtools/managers/requestManager/requestManager.py,sha256=
+abstract_webtools/managers/requestManager/requestManager.py,sha256=26BdfGrkWq2ouDaf0P8HTVK46PtPZJHUO46lIZgd8D8,19768
 abstract_webtools/managers/soupManager/__init__.py,sha256=mqfXfqM9sWlYpOkoXUqtBoVvk2KQx1862NnmRVJwGtY,27
 abstract_webtools/managers/soupManager/asoueces.py,sha256=OaXqolZl0dI7b09NYwJ3Wnhuxf89ahZ1GjsOqy0GXfk,3506
-abstract_webtools/managers/soupManager/soupManager.py,sha256=
+abstract_webtools/managers/soupManager/soupManager.py,sha256=75gwqVXIRwgVqzATBC-DiJF2AT_AdE6FSBWy3DbW5ZA,17393
 abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
 abstract_webtools/managers/urlManager/urlManager (Copy).py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
 abstract_webtools/managers/urlManager/urlManager.py,sha256=vY4KQXtcrlC2YtlultxQpVe581l5kAuT5VGA0WrI16g,8945
-abstract_webtools-0.1.6.140.dist-info/METADATA,sha256=
-abstract_webtools-0.1.6.140.dist-info/WHEEL,sha256=
-abstract_webtools-0.1.6.140.dist-info/top_level.txt,sha256=
-abstract_webtools-0.1.6.140.dist-info/RECORD,,
+abstract_webtools-0.1.6.142.dist-info/METADATA,sha256=FXWOBxUiBcDXee0Dbf_JGQFsIZfI6caniOuWkEEB9P0,7289
+abstract_webtools-0.1.6.142.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+abstract_webtools-0.1.6.142.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.142.dist-info/RECORD,,
{abstract_webtools-0.1.6.140.dist-info → abstract_webtools-0.1.6.142.dist-info}/WHEEL
RENAMED: file without changes

{abstract_webtools-0.1.6.140.dist-info → abstract_webtools-0.1.6.142.dist-info}/top_level.txt
RENAMED: file without changes