abstract-webtools 0.1.6.139__tar.gz → 0.1.6.141__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/setup.py +1 -1
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/__init__.py +1 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/linkManager/linkManager.py +41 -16
- abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/__init__.py +1 -0
- abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/imports.py +18 -0
- abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/src/UnifiedWebManage3r.py +136 -0
- abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/src/UnifiedWebManager.py +130 -0
- abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/src/__init__.py +2 -0
- abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/src/legacy_tools.py +8 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/requestManager/requestManager.py +187 -144
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/soupManager/soupManager.py +17 -14
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/urlManager/urlManager.py +20 -28
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools.egg-info/SOURCES.txt +6 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/README.md +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/pyproject.toml +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/setup.cfg +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/__init__.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/abstract_usurpit.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/abstract_webtools.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/domain_identifier.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/extention_list.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/find_dirs.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/k2s_downloader.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/allss//.py" +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/cipherManager.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/clownworld/__init__.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/clownworld/get_bolshevid_video.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/crawlManager.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/curlMgr.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/domainManager.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/get_test.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/mySocketClient.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/networkManager.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/seleniumManager.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/sslManager.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/urlManager/urlManager (Copy).py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/userAgentManager.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/videoDownloader.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/url_grabber.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools.egg-info/requires.txt +0 -0
- {abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.139
+Version: 0.1.6.141
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/setup.py
RENAMED
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.139',
+    version='0.1.6.141',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
{abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/linkManager/linkManager.py
RENAMED
@@ -2,11 +2,7 @@ from ...abstract_webtools import *
 from ..urlManager import *
 from ..requestManager import *
 from ..soupManager import *
-
-    url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
-    return url_mgr.url
-def get_url_mgr(url=None,url_mgr=None):
-    return url_mgr or urlManager(url)
+
 class linkManager:
     """
     LinkManager is a class for managing and extracting links and image links from a web page.
@@ -43,12 +39,34 @@ class linkManager:
     - The LinkManager class helps manage and extract links and image links from web pages.
     - The class provides flexibility in specifying criteria for link extraction.
     """
-    def __init__(self,
+    def __init__(self,
+                 url=None,
+                 source_code=None,
+                 soup=None,
+                 url_mgr=None,
+                 req_mgr=None,
+                 soup_mgr=None,
+                 parse_type=None,
+                 image_link_tags='img',
+                 img_link_attrs='src',
+                 link_tags='a',
+                 link_attrs='href',
+                 strict_order_tags=False,
+                 img_attr_value_desired=None,
+                 img_attr_value_undesired=None,
+                 link_attr_value_desired=None,
+                 link_attr_value_undesired=None,
+                 associated_data_attr=["data-title",'alt','title'],
+                 get_img=["data-title",'alt','title']
+                 ):
+
         self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
-        self.url=self.url_mgr
-        self.req_mgr = get_req_mgr(
-        self.source_code =
-        self.soup_mgr = get_soup_mgr(
+        self.url = get_url(url=url,url_mgr=self.url_mgr)
+        self.req_mgr = get_req_mgr(url=self.url,url_mgr=self.url_mgr,source_code=source_code,req_mgr=req_mgr)
+        self.source_code = get_source(url=self.url,url_mgr=self.url_mgr,source_code=source_code,req_mgr=self.req_mgr)
+        self.soup_mgr = get_soup_mgr(url=self.url,url_mgr=self.url_mgr,source_code=self.source_code,req_mgr=self.req_mgr,soup_mgr=soup_mgr,soup=soup,parse_type=parse_type)
+
+        self.soup = get_soup(url=self.url,url_mgr=self.url_mgr,req_mgr=self.req_mgr,source_code=self.source_code,soup_mgr=self.soup_mgr)
 
         self.strict_order_tags=strict_order_tags
         self.image_link_tags=image_link_tags
@@ -71,6 +89,7 @@ class linkManager:
             attr_value_undesired=self.link_attr_value_undesired,
             associated_data_attr=self.associated_data_attr,
             get_img=get_img)
+
     def re_initialize(self):
        self.all_desired_image_links=self.find_all_desired_links(tag=self.image_link_tags,attr=self.img_link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.img_attr_value_desired,attr_value_undesired=self.img_attr_value_undesired)
        self.all_desired_links=self.find_all_desired_links(tag=self.link_tags,attr=self.link_attrs,strict_order_tags=self.strict_order_tags,attr_value_desired=self.link_attr_value_desired,attr_value_undesired=self.link_attr_value_undesired,associated_data_attr=self.associated_data_attr,get_img=self.get_img)
@@ -164,19 +183,25 @@ class linkManager:
             if url not in domains_ls and comp_domain == domain:
                 domains_ls.append(url)
         return domains_ls
+
     def find_all_desired_links(self,tag='img', attr='src',attr_value_desired=None,strict_order_tags=False,attr_value_undesired=None,associated_data_attr=None,all_desired=None,get_img=None):
         all_desired = all_desired or self.find_all_desired(tag=tag,attr=attr,strict_order_tags=strict_order_tags,attr_value_desired=attr_value_desired,attr_value_undesired=attr_value_undesired,associated_data_attr=associated_data_attr,get_img=get_img)
         assiciated_attrs = all_desired[-1]
         valid_assiciated_attrs = []
         desired_links=[]
         for i,attr in enumerate(all_desired[:-1]):
-
-
-
-
-
+
+            self.url_mgr.domain = self.url_mgr.domain or ''
+
+            self.url_mgr.protocol = self.url_mgr.protocol or 'https'
+
+            if attr:
+                valid_attr=self.url_mgr.make_valid(attr,self.url_mgr.protocol+'://'+self.url_mgr.domain)
+                if valid_attr:
+                    desired_links.append(valid_attr)
+                    valid_assiciated_attrs.append(assiciated_attrs[i])
+                    valid_assiciated_attrs[-1]["link"]=valid_attr
         desired_links.append(valid_assiciated_attrs)
         return desired_links
 
-
 
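Taken together, the linkManager changes replace the one-line constructor with an explicit keyword signature and route construction through the shared get_url/get_req_mgr/get_source/get_soup_mgr/get_soup helpers, so the class can be driven either by a URL or by pre-fetched HTML. A minimal usage sketch under that reading; the URL, HTML, and import path below are illustrative assumptions, not part of the diff:

```python
# Sketch only: assumes abstract_webtools 0.1.6.141 and the sdist layout shown above.
from abstract_webtools.managers.linkManager.linkManager import linkManager

# Constructed from pre-fetched HTML, so no fetch of the page itself should be
# needed; the url still anchors relative links via url_mgr.make_valid(...).
html = '<a href="/about">About</a><img src="/logo.png" alt="Logo">'
mgr = linkManager(url="https://example.com", source_code=html)

# Attribute names as they appear in re_initialize() above.
print(mgr.all_desired_links)        # href values resolved against the domain
print(mgr.all_desired_image_links)  # img src values, filtered the same way
```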
abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/__init__.py
ADDED
@@ -0,0 +1 @@
+from .src import *
abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/imports.py
ADDED
@@ -0,0 +1,18 @@
+import logging
+from ..urlManager import (
+    urlManager,
+    get_url,
+    get_url_mgr
+)
+from ..requestManager import (
+    requestManager,
+    get_source,
+    get_req_mgr
+)
+from ..soupManager import (
+    soupManager,
+    get_soup,
+    get_soup_mgr
+)
+from bs4 import BeautifulSoup
+logging.basicConfig(level=logging.INFO)
abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/src/UnifiedWebManage3r.py
ADDED
@@ -0,0 +1,136 @@
+from ..imports import *
+
+class UnifiedWebManager:
+    """
+    Unified middleware that ties together URL, request, and soup managers.
+    Lazily initializes components based on provided inputs.
+
+    Args:
+        url (str or None): The base URL.
+        source_code (str or bytes or None): Pre-fetched source code.
+        url_mgr (urlManager or None): Existing URL manager.
+        req_mgr (requestManager or None): Existing request manager.
+        soup_mgr (soupManager or None): Existing soup manager.
+        parse_type (str): Parser type for BeautifulSoup (default: "html.parser").
+    """
+    def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_mgr=None,soup=None, parse_type="html.parser"):
+        self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
+        self.url = get_url(url=url,url_mgr=self.url_mgr)
+        self.req_mgr = get_source(url=self.url,url_mgr=self.url_mgr,source_code=source_code,req_mgr=req_mgr)
+        self.source_code = get_source(url=self.url,url_mgr=self.url_mgr,source_code=source_code,req_mgr=self.req_mgr)
+        self.soup_mgr = get_soup_mgr(url=self.url,url_mgr=self.url_mgr,source_code=self.source_code,req_mgr=self.req_mgr,soup_mgr=soup_mgr,soup=soup,parse_type=parse_type)
+        self.soup = get_soup(url=self.url,url_mgr=self.url_mgr,req_mgr=self.req_mgr,source_code=self.source_code,soup_mgr=self.soup_mgr)
+
+    @property
+    def url_mgr(self):
+        if self.url_mgr is None:
+            if self.url is None:
+                logging.warning("No URL provided; URL manager cannot be created.")
+                return None
+            self.url_mgr = urlManager(url=self.url)
+        return self.url_mgr
+
+    @property
+    def url(self):
+        if self.url is None and self.url_mgr:
+            self.url = self.url_mgr.url
+        return self.url
+
+    @property
+    def req_mgr(self):
+        if self.req_mgr is None:
+            self.req_mgr = requestManager(
+                url=self.url,
+                url_mgr=self.url_mgr,
+                source_code=self.source_code
+            )
+        return self.req_mgr
+
+    @property
+    def source_code(self):
+        if self.source_code is None and self.req_mgr:
+            self.source_code = self.req_mgr.source_code
+        return self.source_code
+
+    @property
+    def soup_mgr(self):
+        if self.soup_mgr is None:
+            self.soup_mgr = soupManager(
+                url=self.url,
+                url_mgr=self.url_mgr,
+                req_mgr=self.req_mgr,
+                source_code=self.source_code
+            )
+        return self.soup_mgr
+
+    @property
+    def soup(self):
+        if self.soup is None:
+            source = self.source_code
+            if source is None:
+                logging.warning("No source code available; Soup cannot be created.")
+                return None
+            if isinstance(source, bytes):
+                source = source.decode('utf-8', errors='ignore')
+            self.soup = BeautifulSoup(source, self.parse_type)
+        return self.soup
+
+    def update_url(self, url):
+        """Update the URL and reset dependent managers."""
+        self.url = url
+        self.url_mgr = None
+        self.req_mgr = None
+        self.soup_mgr = None
+        self.source_code = None
+        self.soup = None
+
+    def update_source_code(self, source_code):
+        """Update the source code and reset dependent managers."""
+        self.source_code = source_code
+        self.req_mgr = None
+        self.soup_mgr = None
+        self.soup = None
+
+    # Convenience methods for direct access
+    def get_all_tools(self):
+        """Return a dict with all components (similar to original getters)."""
+        return {
+            'url': self.url,
+            'url_mgr': self.url_mgr,
+            'source_code': self.source_code,
+            'req_mgr': self.req_mgr,
+            'soup': self.soup,
+            'soup_mgr': self.soup_mgr
+        }
+    def endow_to_manager(self, target_manager, all_tools=None):
+        """
+        Endow (assign) the attributes from all_tools to the target manager instance.
+
+        Args:
+            target_manager: The instance (e.g., another manager class) to endow attributes to.
+            all_tools (dict or None): Optional dict of tools/attributes. If None, uses self.get_all_tools().
+        """
+        if all_tools is None:
+            all_tools = self.get_all_tools()
+        for key, value in all_tools.items():
+            setattr(target_manager, key, value)
+        return target_manager
+# Wrapper functions for backward compatibility
+def get_url_tools(url=None, url_mgr=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr)
+    return {'url': mgr.url, 'url_mgr': mgr.url_mgr}
+
+def get_req_tools(url=None, url_mgr=None, source_code=None, req_mgr=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr)
+    return {'url': mgr.url, 'url_mgr': mgr.url_mgr, 'source_code': mgr.source_code, 'req_mgr': mgr.req_mgr}
+
+def get_soup_tools(url=None, url_mgr=None, source_code=None, req_mgr=None, soup=None, soup_mgr=None,target_manager=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr, soup_mgr=soup_mgr)
+    if soup is not None:
+        mgr.soup = soup  # Allow overriding
+    if target_manager:
+        mgr.endow_to_manager(target_manager, all_tools=None)
+        return target_manager
+    return mgr.get_all_tools()
+
+
abstract_webtools-0.1.6.141/src/abstract_webtools/managers/middleManager/src/UnifiedWebManager.py
ADDED
@@ -0,0 +1,130 @@
+import logging
+from bs4 import BeautifulSoup
+from ..imports import *
+
+logging.basicConfig(level=logging.INFO)
+
+class UnifiedWebManager:
+    """
+    Unified middleware that ties together URL, request, and soup managers.
+    Lazily initializes components based on provided inputs.
+
+    Args:
+        url (str or None): The base URL.
+        source_code (str or bytes or None): Pre-fetched source code.
+        url_mgr (urlManager or None): Existing URL manager.
+        req_mgr (requestManager or None): Existing request manager.
+        soup_mgr (soupManager or None): Existing soup manager.
+        soup (BeautifulSoup or None): Pre-parsed soup object.
+        parse_type (str): Parser type for BeautifulSoup (default: "html.parser").
+    """
+    def __init__(self, url=None, source_code=None, url_mgr=None, req_mgr=None, soup_mgr=None, soup=None, parse_type="html.parser"):
+        self._url = url
+        self._source_code = source_code
+        self._url_mgr = url_mgr
+        self._req_mgr = req_mgr
+        self._soup_mgr = soup_mgr
+        self._soup = soup
+        self._parse_type = parse_type
+
+    @property
+    def url_mgr(self):
+        if self._url_mgr is None:
+            if self._url is None:
+                logging.warning("No URL provided; URL manager cannot be created.")
+                return None
+            self._url_mgr = get_url_mgr(url=self._url)
+        return self._url_mgr
+
+    @property
+    def url(self):
+        if self._url is None and self.url_mgr is not None:
+            self._url = get_url(url_mgr=self.url_mgr)
+        return self._url
+
+    @property
+    def req_mgr(self):
+        if self._req_mgr is None:
+            self._req_mgr = get_req_mgr(url=self.url, url_mgr=self.url_mgr, source_code=self._source_code)
+        return self._req_mgr
+
+    @property
+    def source_code(self):
+        if self._source_code is None and self.req_mgr is not None:
+            self._source_code = get_source(req_mgr=self.req_mgr)
+        return self._source_code
+
+    @property
+    def soup_mgr(self):
+        if self._soup_mgr is None:
+            self._soup_mgr = get_soup_mgr(url=self.url, url_mgr=self.url_mgr, source_code=self.source_code, req_mgr=self.req_mgr)
+        return self._soup_mgr
+
+    @property
+    def soup(self):
+        if self._soup is None:
+            source = self.source_code
+            if source is None:
+                logging.warning("No source code available; Soup cannot be created.")
+                return None
+            if isinstance(source, bytes):
+                source = source.decode('utf-8', errors='ignore')
+            self._soup = get_soup(source_code=source, parse_type=self._parse_type)
+        return self._soup
+
+    def update_url(self, url):
+        """Update the URL and reset dependent managers."""
+        self._url = url
+        self._url_mgr = None
+        self._req_mgr = None
+        self._soup_mgr = None
+        self._source_code = None
+        self._soup = None
+
+    def update_source_code(self, source_code):
+        """Update the source code and reset dependent managers."""
+        self._source_code = source_code
+        self._req_mgr = None
+        self._soup_mgr = None
+        self._soup = None
+
+    # Convenience methods for direct access
+    def get_all_tools(self):
+        """Return a dict with all components (similar to original getters)."""
+        return {
+            'url': self.url,
+            'url_mgr': self.url_mgr,
+            'source_code': self.source_code,
+            'req_mgr': self.req_mgr,
+            'soup': self.soup,
+            'soup_mgr': self.soup_mgr
+        }
+
+    def endow_to_manager(self, target_manager, all_tools=None):
+        """
+        Endow (assign) the attributes from all_tools to the target manager instance.
+
+        Args:
+            target_manager: The instance (e.g., another manager class) to endow attributes to.
+            all_tools (dict or None): Optional dict of tools/attributes. If None, uses self.get_all_tools().
+        """
+        if all_tools is None:
+            all_tools = self.get_all_tools()
+        for key, value in all_tools.items():
+            setattr(target_manager, key, value)
+        return target_manager
+
+# Wrapper functions for backward compatibility
+def get_url_tools(url=None, url_mgr=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr)
+    return {'url': mgr.url, 'url_mgr': mgr.url_mgr}
+
+def get_req_tools(url=None, url_mgr=None, source_code=None, req_mgr=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr)
+    return {'url': mgr.url, 'url_mgr': mgr.url_mgr, 'source_code': mgr.source_code, 'req_mgr': mgr.req_mgr}
+
+def get_soup_tools(url=None, url_mgr=None, source_code=None, req_mgr=None, soup=None, soup_mgr=None, target_manager=None):
+    mgr = UnifiedWebManager(url=url, url_mgr=url_mgr, source_code=source_code, req_mgr=req_mgr, soup_mgr=soup_mgr, soup=soup)
+    if target_manager:
+        return mgr.endow_to_manager(target_manager)
+    return mgr.get_all_tools()
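The two middleManager variants expose the same surface; the one directly above is the lazy one, backing each component with a private attribute that is only materialized on first property access. A minimal usage sketch; the import path and URL are assumptions for illustration:

```python
# Sketch only: assumes the sdist layout shown in this diff.
from abstract_webtools.managers.middleManager.src.UnifiedWebManager import (
    UnifiedWebManager, get_soup_tools,
)

mgr = UnifiedWebManager(url="https://example.com")  # nothing fetched yet
soup = mgr.soup        # first access builds url_mgr -> req_mgr -> source -> soup
tools = mgr.get_all_tools()
print(sorted(tools))   # ['req_mgr', 'soup', 'soup_mgr', 'source_code', 'url', 'url_mgr']

# endow_to_manager copies those same components onto any other object:
class MyScraper:
    pass

scraper = get_soup_tools(url="https://example.com", target_manager=MyScraper())
print(scraper.url)     # same url, now an attribute of MyScraper
```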
{abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/requestManager/requestManager.py
RENAMED
@@ -1,5 +1,4 @@
 from ...abstract_webtools import *
-
 from ..userAgentManager import *
 from ..cipherManager import *
 from ..sslManager import *
@@ -7,10 +6,14 @@ from ..tlsAdapter import *
 from ..networkManager import *
 from ..seleniumManager import *
 from ..urlManager import *
+logging.basicConfig(level=logging.INFO)
+
 class requestManager:
     """
-
-
+    requestManager is a class for making HTTP requests with error handling and retries.
+    It supports initializing with a provided source_code without requiring a URL.
+    If source_code is provided, it uses that as the response content and skips fetching.
+    Enhanced to parse source_code for URLs, PHP blocks, and React/JS data even if not HTML.
     Args:
         url (str or None): The URL to make requests to (default is None).
         url_mgr (urlManager or None): An instance of urlManager (default is None).
@@ -56,8 +59,7 @@ class requestManager:
     - The SafeRequest class is designed for making HTTP requests with error handling and retries.
     - It provides methods for authentication, response handling, and error management.
     """
-    def __init__(self,
-                 url=None,
+    def __init__(self,url=None,
                  source_code=None,
                  url_mgr=None,
                  network_manager=None,
@@ -78,257 +80,299 @@ class requestManager:
                  login_user_agent=None,
                  auth=None,
                  login_url=None,
-                 email
+                 email=None,
                  password=None,
                  checkbox=None,
                  dropdown=None,
                  certification=None,
                  stream=False,
-                 timeout
+                 timeout=None,
                  last_request_time=None,
                  max_retries=None,
-                 request_wait_limit=
-
-        self.
-        self.url=get_url(url=url,url_mgr=self.url_mgr)
+                 request_wait_limit=None):
+        self.url_mgr = get_url_mgr(url=url, url_mgr=url_mgr)
+        self.url = get_url(url=url, url_mgr=self.url_mgr)
         self._url_mgr = self.url_mgr
-        self._url=self.url
+        self._url = self.url
         self.user_agent = user_agent
         self.user_agent_manager = user_agent_manager or UserAgentManager(user_agent=self.user_agent)
-        self.headers= headers or self.user_agent_manager.header or {'Accept': '*/*'}
-        self.user_agent= self.user_agent_manager.user_agent
-        self.ciphers=ciphers or CipherManager().ciphers_string
-        self.certification=certification
-        self.ssl_options=ssl_options
+        self.headers = headers or self.user_agent_manager.header or {'Accept': '*/*'}
+        self.user_agent = self.user_agent_manager.user_agent
+        self.ciphers = ciphers or CipherManager().ciphers_string
+        self.certification = certification
+        self.ssl_options = ssl_options
         self.ssl_manager = ssl_manager or SSLManager(ciphers=self.ciphers, ssl_options=self.ssl_options, certification=self.certification)
-        self.tls_adapter=tls_adapter or
-        self.network_manager= network_manager or NetworkManager(user_agent_manager=self.user_agent_manager,ssl_manager=self.ssl_manager, tls_adapter=self.tls_adapter,user_agent=user_agent,proxies=proxies,cookies=cookies,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
-        self.stream=stream
-        self.tls_adapter=self.network_manager.tls_adapter
-        self.ciphers=self.network_manager.ciphers
-        self.certification=self.network_manager.certification
-        self.ssl_options=self.network_manager.ssl_options
-        self.proxies=self.network_manager.proxies
-        self.timeout=timeout
-        self.cookies=self.network_manager.cookies
-        self.session = session or requests.
+        self.tls_adapter = tls_adapter or TLSAdapter(ssl_manager=self.ssl_manager, certification=self.certification, ssl_options=self.ssl_manager.ssl_options)
+        self.network_manager = network_manager or NetworkManager(user_agent_manager=self.user_agent_manager, ssl_manager=self.ssl_manager, tls_adapter=self.tls_adapter, user_agent=user_agent, proxies=proxies, cookies=cookies, ciphers=ciphers, certification=certification, ssl_options=ssl_options)
+        self.stream = stream
+        self.tls_adapter = self.network_manager.tls_adapter
+        self.ciphers = self.network_manager.ciphers
+        self.certification = self.network_manager.certification
+        self.ssl_options = self.network_manager.ssl_options
+        self.proxies = self.network_manager.proxies
+        self.timeout = timeout
+        self.cookies = self.network_manager.cookies
+        self.session = session or requests.Session()
         self.auth = auth
-        self.spec_login=spec_login
-        self.password=password
+        self.spec_login = spec_login
+        self.password = password
         self.email = email
-        self.checkbox=checkbox
-        self.dropdown=dropdown
-        self.login_url=login_url
-        self.login_user_agent=login_user_agent
-        self.login_referer=login_referer
-        self.protocol=protocol or 'https://'
-
-        self.stream=stream if isinstance(stream,bool) else False
+        self.checkbox = checkbox
+        self.dropdown = dropdown
+        self.login_url = login_url
+        self.login_user_agent = login_user_agent
+        self.login_referer = login_referer
+        self.protocol = protocol or 'https://'
+        self.stream = stream if isinstance(stream, bool) else False
         self.initialize_session()
-        self.last_request_time=last_request_time
+        self.last_request_time = last_request_time
         self.max_retries = max_retries or 3
         self.request_wait_limit = request_wait_limit or 1.5
-        self._response=None
-        self.status_code=None
-        self.source_code =
-        self.source_code_bytes=None
+        self._response = None
+        self.status_code = None
+        self.source_code = None
+        self.source_code_bytes = None
         self.source_code_json = {}
-        self.react_source_code=[]
+        self.react_source_code = []
+        self.extracted_urls = []
+        self.php_blocks = []
         self._response_data = None
-
-
-
+        if source_code is not None:
+            self._response = source_code
+            self.process_response_data()
+        else:
+            self.re_initialize()
+
+    def update_url_mgr(self, url_mgr):
+        self.url_mgr = url_mgr
         self.re_initialize()
-
+
+    def update_url(self, url):
         self.url_mgr.update_url(url=url)
         self.re_initialize()
+
     def re_initialize(self):
-        self._response=None
-        self.
+        self._response = None
+        if self.url_mgr.url is not None:
+            self.make_request()
         self.source_code = None
-        self.source_code_bytes=None
+        self.source_code_bytes = None
         self.source_code_json = {}
-        self.react_source_code=[]
+        self.react_source_code = []
+        self.extracted_urls = []
+        self.php_blocks = []
         self._response_data = None
         self.process_response_data()
+
     @property
     def response(self):
         """Lazy-loading of response."""
-        if self._response is None:
+        if self._response is None and self.url_mgr.url is not None:
             self._response = self.fetch_response()
-
-
         return self._response
-
-
+
+    def authenticate(self, session, login_url=None, email=None, password=None, checkbox=None, dropdown=None):
+        login_urls = login_url or [self.url_mgr.url, self.url_mgr.domain, self.url_mgr.url_join(url=self.url_mgr.domain, path='login'), self.url_mgr.url_join(url=self.url_mgr.domain, path='auth')]
         s = session
-        if not isinstance(login_urls,list):
-            login_urls=[login_urls]
+        if not isinstance(login_urls, list):
+            login_urls = [login_urls]
         for login_url in login_urls:
             login_url_mgr = urlManager(login_url)
             login_url = login_url_mgr.url
-
             r = s.get(login_url)
             soup = BeautifulSoup(r.content, "html.parser")
             # Find the token or any CSRF protection token
             token = soup.find('input', {'name': 'token'}).get('value') if soup.find('input', {'name': 'token'}) else None
-            if token
+            if token is not None:
                 break
         login_data = {}
-        if email
-            login_data['email']=email
-        if password
+        if email is not None:
+            login_data['email'] = email
+        if password is not None:
             login_data['password'] = password
-        if checkbox
+        if checkbox is not None:
             login_data['checkbox'] = checkbox
-        if dropdown
-            login_data['dropdown']=dropdown
-        if token
+        if dropdown is not None:
+            login_data['dropdown'] = dropdown
+        if token is not None:
             login_data['token'] = token
         s.post(login_url, data=login_data)
         return s
 
-    def fetch_response(self) ->
+    def fetch_response(self) -> requests.Response | None | str | bytes:
         """Actually fetches the response from the server."""
-        # You can further adapt this method to use retries or other logic you had
-        # in your original code, but the main goal here is to fetch and return the response
         return self.try_request()
+
     def spec_auth(self, session=None, email=None, password=None, login_url=None, login_referer=None, login_user_agent=None):
-        s = session or requests.
-
-        domain = self.url_mgr.url_join(self.url_mgr.get_correct_url(self.url_mgr.domain),'login') if login_url is None else login_url
+        s = session or requests.Session()
+        domain = self.url_mgr.url_join(self.url_mgr.get_correct_url(self.url_mgr.domain), 'login') if login_url is None else login_url
         login_url = self.url_mgr.get_correct_url(url=domain)
-
         login_referer = login_referer or self.url_mgr.url_join(url=login_url, path='?role=fast&to=&s=1&m=1&email=YOUR_EMAIL')
         login_user_agent = login_user_agent or 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:50.0) Gecko/20100101 Firefox/50.0'
-
         headers = {"Referer": login_referer, 'User-Agent': login_user_agent}
         payload = {'email': email, 'pass': password}
-
         page = s.get(login_url)
         soup = BeautifulSoup(page.content, 'lxml')
         action_url = soup.find('form')['action']
         s.post(action_url, data=payload, headers=headers)
         return s
+
     def initialize_session(self):
-        s = self.session
+        s = self.session
         if self.auth:
-            s= self.auth
+            s = self.auth
         elif self.spec_login:
-            s=self.spec_auth(session=s,email=self.email, password=self.password, login_url=self.login_url, login_referer=self.login_referer, login_user_agent=self.login_user_agent)
+            s = self.spec_auth(session=s, email=self.email, password=self.password, login_url=self.login_url, login_referer=self.login_referer, login_user_agent=self.login_user_agent)
         elif any([self.password, self.email, self.login_url, self.checkbox, self.dropdown]):
-            s=self.authenticate(session=s, login_url=self.login_url, email=self.email, password=self.password, checkbox=self.checkbox, dropdown=self.dropdown)
+            s = self.authenticate(session=s, login_url=self.login_url, email=self.email, password=self.password, checkbox=self.checkbox, dropdown=self.dropdown)
         s.proxies = self.proxies
         s.cookies["cf_clearance"] = self.network_manager.cookies
         s.headers.update(self.headers)
         s.mount(self.protocol, self.network_manager.tls_adapter)
         return s
+
     def process_response_data(self):
         """Processes the fetched response data."""
         if not self.response:
             return  # No data to process
-        if
-
+        if isinstance(self.response, (str, bytes)):
+            if isinstance(self.response, str):
+                self.source_code = self.response
+                self.source_code_bytes = self.response.encode('utf-8')  # Assume UTF-8
+            else:
+                self.source_code_bytes = self.response
+                try:
+                    self.source_code = self.response.decode('utf-8')
+                except UnicodeDecodeError:
+                    self.source_code = self.response.decode('latin-1')  # Fallback
+            # Check if it's JSON
+            try:
+                data = json.loads(self.source_code)
+                self.source_code_json = data.get("response", data)
+            except json.JSONDecodeError:
+                pass
         else:
             self.source_code = self.response.text
             self.source_code_bytes = self.response.content
-            if self.response.headers.get('content-type'
-
-
+            if self.response.headers.get('content-type', '').startswith('application/json'):
+                try:
+                    data = json.loads(self.source_code)
                     self.source_code_json = data.get("response", data)
-
-
-
-
-
+                except json.JSONDecodeError:
+                    pass
+        self.extract_urls()
+        self.extract_php_blocks()
+        self.get_react_source_code()
+
+    def extract_urls(self):
+        """Extract URLs from source_code using regex."""
+        if not self.source_code:
+            return
+        url_pattern = r'https?://[^\s<>"\']+'
+        self.extracted_urls = re.findall(url_pattern, self.source_code)
 
-
-
+    def extract_php_blocks(self):
+        """Extract PHP blocks from source_code if present."""
+        if not self.source_code:
+            return
+        php_pattern = r'<\?php(.*?)?\?>'
+        self.php_blocks = re.findall(php_pattern, self.source_code, re.DOTALL)
 
-
-        list: A list of strings containing JavaScript and JSX source code found in <script> tags.
+    def get_react_source_code(self) -> list:
         """
-        if
+        Extracts JavaScript and JSX source code from <script> tags if HTML-like.
+        If not HTML and looks like JS/React code, appends the whole source_code.
+        """
+        if not self.source_code:
             return []
-
-
+        # Check if likely JS code (e.g., contains 'import', 'function', 'React')
+        is_js_like = any(keyword in self.source_code.lower() for keyword in ['import ', 'function ', 'react', 'export ', 'const ', 'let ', 'var '])
+        # Check if HTML-like
+        is_html_like = self.source_code.strip().startswith('<') or '<html' in self.source_code.lower() or '<!doctype' in self.source_code.lower()
+        if not is_html_like and is_js_like:
+            self.react_source_code.append(self.source_code)
+            return self.react_source_code
+        content = self.source_code_bytes or self.source_code.encode('utf-8')
+        soup = BeautifulSoup(content, "html.parser")
+        script_tags = soup.find_all('script', type=lambda t: t and ('javascript' in t.lower() or 'jsx' in t.lower()))
         for script_tag in script_tags:
-
+            if script_tag.string:
+                self.react_source_code.append(script_tag.string)
+        # If no scripts found but JS-like, append whole
+        if not script_tags and is_js_like:
+            self.react_source_code.append(self.source_code)
+        return self.react_source_code
 
-
-    def get_status(url:str=None) -> int:
+    def get_status(self, url: str = None) -> int | None:
         """
         Gets the HTTP status code of the given URL.
-
-        Args:
-            url (str): The URL to check the status of.
-
-        Returns:
-            int: The HTTP status code of the URL, or None if the request fails.
         """
-
-
+        url = url or self.url_mgr.url
+        if url is None:
+            return None
+        try:
+            response = requests.head(url, timeout=5)
+            return response.status_code
+        except requests.RequestException:
+            return None
+
     def wait_between_requests(self):
         """
         Wait between requests based on the request_wait_limit.
         """
         if self.last_request_time:
-            sleep_time = self.request_wait_limit - (
+            sleep_time = self.request_wait_limit - (time.time() - self.last_request_time)
             if sleep_time > 0:
                 logging.info(f"Sleeping for {sleep_time:.2f} seconds.")
-
+                time.sleep(sleep_time)
 
     def make_request(self):
         """
         Make a request and handle potential errors.
         """
-
-
+        if self.url_mgr.url is None:
+            return None
         self.wait_between_requests()
         for _ in range(self.max_retries):
             try:
-                self.try_request()
-                if self.
-                    self.
-
-                self.
-
-
-
+                self._response = self.try_request()
+                if self._response:
+                    if not isinstance(self._response, (str, bytes)):
+                        self.status_code = self._response.status_code
+                        if self._response.status_code == 200:
+                            self.last_request_time = time.time()
+                            return self._response
+                        elif self._response.status_code == 429:
+                            logging.warning(f"Rate limited by {self.url_mgr.url}. Retrying...")
+                            time.sleep(5)
             except requests.Timeout as e:
-                logging.error(f"Request to {
+                logging.error(f"Request to {self.url_mgr.url} timed out: {e}")
             except requests.ConnectionError:
                 logging.error(f"Connection error for URL {self.url_mgr.url}.")
-            except requests.Timeout:
-                logging.error(f"Request timeout for URL {self.url_mgr.url}.")
             except requests.RequestException as e:
                 logging.error(f"Request exception for URL {self.url_mgr.url}: {e}")
         try:
             response = get_selenium_source(self.url_mgr.url)
             if response:
-                self.
-
-
-
+                self._response = response
+                self.status_code = 200  # Assume success
+                return self._response
+        except Exception as e:
+            logging.error(f"Failed to retrieve content from {self.url_mgr.url} after {self.max_retries} retries: {e}")
         return None
-
+
+    def try_request(self) -> requests.Response | str | bytes | None:
         """
         Tries to make an HTTP request to the given URL using the provided session.
-
-        Args:
-            timeout (int): Timeout for the request.
-
-        Returns:
-            requests.Response or None: The response object if the request is successful, or None if the request fails.
         """
+        if self.url_mgr.url is None:
+            return None
         try:
-            return get_selenium_source(self.url_mgr.url)#self.session.get(
-        except requests.
-
+            return get_selenium_source(self.url_mgr.url)  # or self.session.get(self.url_mgr.url, timeout=self.timeout, stream=self.stream)
+        except requests.RequestException as e:
+            logging.error(f"Request failed: {e}")
             return None
 
-
     @property
     def url(self):
         return self.url_mgr.url
@@ -345,12 +389,11 @@ class SafeRequestSingleton:
         elif SafeRequestSingleton._instance.url != url or SafeRequestSingleton._instance.headers != headers or SafeRequestSingleton._instance.max_retries != max_retries or SafeRequestSingleton._instance.request_wait_limit != request_wait_limit:
             SafeRequestSingleton._instance = SafeRequest(url,url_mgr=urlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
         return SafeRequestSingleton._instance
+def get_source(url=None,url_mgr=None,source_code=None,req_mgr=None):
+    req_mgr = get_req_mgr(req_mgr=req_mgr,url=url,url_mgr=url_mgr,source_code=source_code)
+    return req_mgr.source_code
 def get_req_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None):
     url = get_url(url=url,url_mgr=url_mgr)
     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr )
     req_mgr = req_mgr or requestManager(url_mgr=url_mgr,url=url,source_code=source_code)
     return req_mgr
-def get_source(url=None,url_mgr=None,source_code=None,req_mgr=None):
-    # Placeholder for actual implementation.
-    req_mgr = get_req_mgr(req_mgr=req_mgr,url=url,url_mgr=url_mgr,source_code=source_code)
-    return req_mgr.source_code
{abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/soupManager/soupManager.py
RENAMED
@@ -39,7 +39,7 @@ class soupManager:
     - The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
     - It provides various methods to extract data and discover elements within the source code.
     """
-    def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None, parse_type="html.parser"):
+    def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None,soup=None, parse_type="html.parser"):
         self.soup=[]
         url = get_url(url=url,url_mgr=url_mgr)
         self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
@@ -49,8 +49,8 @@ class soupManager:
         source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
         if source_code:
             source_code = str(source_code)
-        self.source_code = source_code
-        self.soup= BeautifulSoup(self.source_code, self.parse_type)
+        self.source_code = source_code or ''
+        self.soup= soup or BeautifulSoup(self.source_code, self.parse_type)
         self.all_tags_and_attribute_names = self.get_all_tags_and_attribute_names()
         self.all_tags = self.all_tags_and_attribute_names.get('tags')
         self.all_attribute_names = self.all_tags_and_attribute_names.get('attributes')
@@ -340,16 +340,8 @@ class SoupManagerSingleton():
         elif parse_type != SoupManagerSingleton._instance.parse_type or source_code != SoupManagerSingleton._instance.source_code:
             SoupManagerSingleton._instance = SoupManager(url_mgr,requestManager,parse_type=parse_type,source_code=source_code)
         return SoupManagerSingleton._instance
-def
-
-    url = get_url(url=url,url_mgr=url_mgr)
-    req_mgr = get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
-    soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code)
-    return soup_mgr
-def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,tags_list = None,parse_type="html.parser"):
-    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
-    return soup_mgr.get_all_attribute_values(tags_list=tags_list)
-def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,parse_type="html.parser"):
+def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,soup=None,parse_type=None):
+    parse_type = parse_type or "html.parser"
     if source_code or soup_mgr:
         if soup_mgr:
             return soup_mgr.soup
@@ -358,5 +350,16 @@ def get_soup(url=None,url_mgr=None,req_mgr=None,source_code=None,soup_mgr=None,p
     url = get_url(url=url,url_mgr=url_mgr)
     req_mgr = req_mgr or get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
     source_code = req_mgr.source_code
-    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr)
+    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr,soup=soup)
     return soup_mgr.soup
+def get_soup_mgr(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,soup=None,parse_type=None):
+    parse_type = parse_type or "html.parser"
+    url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
+    url = get_url(url=url,url_mgr=url_mgr)
+    req_mgr = get_req_mgr(url_mgr=url_mgr,url=url,source_code=source_code)
+    soup_mgr = soup_mgr or soupManager(url_mgr=url_mgr,req_mgr=req_mgr,url=url,source_code=source_code,soup=soup)
+    return soup_mgr
+def get_all_attribute_values(url=None,url_mgr=None,source_code=None,req_mgr=None,soup_mgr=None,soup=None,tags_list = None,parse_type=None):
+    parse_type = parse_type or "html.parser"
+    soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,soup_mgr=soup_mgr,soup=soup)
+    return soup_mgr.get_all_attribute_values(tags_list=tags_list)
{abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools/managers/urlManager/urlManager.py
RENAMED
@@ -17,21 +17,21 @@ class urlManager:
     Now handles url=None gracefully: sets internals to None/empty and methods return None or empty values without errors.
     """
     def __init__(self, url=None, session=None):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self._url = url  # Allow None
+        self.session = session or requests.Session()
+        if self._url is None:
+            self.clean_urls = []
+            self.url = None
+            self.protocol = None
+            self.domain = None
+            self.path = ""
+            self.query = ""
+            self.all_urls = []
+        else:
+            self.clean_urls = self.clean_url()
+            self.url = self.get_correct_url() or self._url
+            self.protocol, self.domain, self.path, self.query = self.url_to_pieces(self.url)
+            self.all_urls = []
 
     def url_to_pieces(self, url):
         """
@@ -233,17 +233,9 @@ class urlManagerSingleton:
         elif urlManagerSingleton._instance.session != session or urlManagerSingleton._instance.url != url:
             urlManagerSingleton._instance = urlManager(url, session=session)
         return urlManagerSingleton._instance
+def get_url(url=None,url_mgr=None):
+    url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
+    return url_mgr.url
+def get_url_mgr(url=None,url_mgr=None):
+    return url_mgr or urlManager(url)
 
-def get_url(url=None, url_mgr=None):
-    if not url and not url_mgr:
-        return None
-    if url_mgr is None and url is not None:
-        url_mgr = urlManager(url)
-    return url_mgr.url if url_mgr else None
-
-def get_url_mgr(url=None, url_mgr=None):
-    if url_mgr is None:
-        url_mgr = urlManager(url=url)  # Always create instance, even if url=None
-    if url_mgr and url is None:
-        url = url_mgr.url
-    return url_mgr
{abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools.egg-info/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.139
+Version: 0.1.6.141
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.139 → abstract_webtools-0.1.6.141}/src/abstract_webtools.egg-info/SOURCES.txt
RENAMED
@@ -40,6 +40,12 @@ src/abstract_webtools/managers/clownworld/__init__.py
 src/abstract_webtools/managers/clownworld/get_bolshevid_video.py
 src/abstract_webtools/managers/linkManager/__init__.py
 src/abstract_webtools/managers/linkManager/linkManager.py
+src/abstract_webtools/managers/middleManager/__init__.py
+src/abstract_webtools/managers/middleManager/imports.py
+src/abstract_webtools/managers/middleManager/src/UnifiedWebManage3r.py
+src/abstract_webtools/managers/middleManager/src/UnifiedWebManager.py
+src/abstract_webtools/managers/middleManager/src/__init__.py
+src/abstract_webtools/managers/middleManager/src/legacy_tools.py
 src/abstract_webtools/managers/requestManager/__init__.py
 src/abstract_webtools/managers/requestManager/requestManager.py
 src/abstract_webtools/managers/soupManager/__init__.py
All remaining files listed above are unchanged between 0.1.6.139 and 0.1.6.141.