abstract-webtools 0.1.6.145__py3-none-any.whl → 0.1.6.146__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- abstract_webtools/managers/cipherManager.py +12 -13
- abstract_webtools/managers/crawlManager.py +35 -26
- abstract_webtools/managers/curlMgr.py +25 -47
- abstract_webtools/managers/meta_dump.py +27 -0
- abstract_webtools/managers/networkManager.py +48 -13
- abstract_webtools/managers/soupManager/soupManager.py +46 -19
- abstract_webtools/managers/sslManager.py +11 -2
- abstract_webtools/managers/userAgentManager.py +31 -40
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.146.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.146.dist-info}/RECORD +12 -12
- abstract_webtools/managers/seleniumManager.py +0 -119
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.146.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.146.dist-info}/top_level.txt +0 -0
abstract_webtools/managers/cipherManager.py
@@ -1,21 +1,20 @@
 class CipherManager:
     @staticmethod
-    def
+    def get_default_ciphers() -> list:
         return [
-            "ECDHE-RSA-AES256-GCM-SHA384",
-            "ECDHE-RSA-AES256-SHA384",
-            "ECDHE-RSA-AES256-SHA",
-            "ECDHE-RSA-AES128-GCM-SHA256",
-            "ECDHE-ECDSA-AES128-GCM-SHA256",
-            "AES256-SHA",
+            "ECDHE-RSA-AES256-GCM-SHA384","ECDHE-ECDSA-AES256-GCM-SHA384",
+            "ECDHE-RSA-AES256-SHA384","ECDHE-ECDSA-AES256-SHA384",
+            "ECDHE-RSA-AES256-SHA","ECDHE-ECDSA-AES256-SHA",
+            "ECDHE-RSA-AES128-GCM-SHA256","ECDHE-RSA-AES128-SHA256",
+            "ECDHE-ECDSA-AES128-GCM-SHA256","ECDHE-ECDSA-AES128-SHA256",
+            "AES256-SHA","AES128-SHA"
         ]
+    def __init__(self, cipher_list=None):
+        self.cipher_list = cipher_list or self.get_default_ciphers()
+        if isinstance(self.cipher_list, str):
+            self.cipher_list = [c.strip() for c in self.cipher_list.split(',') if c.strip()]
+        self.ciphers_string = ','.join(self.cipher_list) if self.cipher_list else ''
 
-    def __init__(self,cipher_list=None):
-        if cipher_list == None:
-            cipher_list=self.get_default_ciphers()
-        self.cipher_list = cipher_list
-        self.create_list()
-        self.ciphers_string = self.add_string_list()
     def add_string_list(self):
         if len(self.cipher_list)==0:
             return ''
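The rewritten constructor folds list handling and string joining into __init__ and now accepts either a list or a comma-separated string of cipher names. A minimal usage sketch (the import path follows the RECORD entry below; printed values are illustrative):

# Usage sketch for the new CipherManager constructor.
from abstract_webtools.managers.cipherManager import CipherManager

default_mgr = CipherManager()                          # falls back to get_default_ciphers()
custom_mgr  = CipherManager("AES256-SHA, AES128-SHA")  # strings are split on ',' and stripped

print(default_mgr.ciphers_string)   # "ECDHE-RSA-AES256-GCM-SHA384,ECDHE-ECDSA-AES256-GCM-SHA384,..."
print(custom_mgr.cipher_list)       # ['AES256-SHA', 'AES128-SHA']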
abstract_webtools/managers/crawlManager.py
@@ -119,14 +119,42 @@ if __name__ == "__main__":
     generator.run()
 
 class crawlManager:
-    def __init__(self, url
+    def __init__(self, url, req_mgr, url_mgr, source_code=None, parse_type="html.parser"):
+        self.url_mgr = url_mgr
+        self.req_mgr = req_mgr
         self.url = url
-        self.source_code = source_code
         self.parse_type = parse_type
-        self.
-        self.
-        self.
-
+        self.source_code = source_code or req_mgr.source_code
+        self.soup = BeautifulSoup(self.source_code or "", parse_type)
+        self.base_netloc = urlparse(self.url).netloc
+
+    def is_internal(self, link):
+        u = urlparse(link)
+        return (not u.netloc) or (u.netloc == self.base_netloc)
+
+    def links_on_page(self):
+        out = set()
+        for a in self.soup.find_all("a", href=True):
+            out.add(urljoin(self.url, a["href"]))
+        return out
+
+    def crawl(self, start=None, max_depth=2, _depth=0, visited=None, session=None):
+        start = start or self.url
+        visited = visited or set()
+        if _depth > max_depth or start in visited:
+            return visited
+        visited.add(start)
+
+        # fetch
+        r = self.req_mgr.session.get(start, timeout=30)
+        r.raise_for_status()
+        soup = BeautifulSoup(r.text, self.parse_type)
+
+        for a in soup.find_all("a", href=True):
+            link = urljoin(start, a["href"])
+            if self.is_internal(link) and link not in visited:
+                self.crawl(link, max_depth=max_depth, _depth=_depth+1, visited=visited)
+        return visited
     def get_new_source_and_url(self, url=None):
         """Fetches new source code and response for a given URL."""
         url = url
@@ -194,26 +222,7 @@ class crawlManager:
         return ('yearly', '0.3')
         return ('weekly', '1.0')
 
-
-        """Recursively crawls the site up to max_depth and returns valid internal links."""
-        visited = visited or set()
-        if depth > max_depth or url in visited:
-            return []
-
-        visited.add(url)
-        try:
-            soup = get_soup(url)
-            links = []
-            for tag in soup.find_all('a', href=True):
-                link = urljoin(url, tag['href'])
-                if urlparse(link).netloc == urlparse(url).netloc and link not in visited:
-                    links.append(link)
-                    self.crawl(link, max_depth, depth + 1, visited)
-            return links
-        except Exception as e:
-            print(f"Error crawling {url}: {e}")
-            return []
-
+
     def get_meta_info(self, url=None):
         """Fetches metadata, including title and meta tags, from the page."""
         url = url or self.url
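The new crawl() recurses over internal links through the shared requests session on req_mgr and returns the visited set, replacing the orphaned crawl body removed in the second hunk. A self-contained sketch; the stub request manager below is an assumption made only to keep the example runnable, since crawlManager touches nothing but req_mgr.session and req_mgr.source_code:

# Sketch: driving the new crawlManager with a minimal stand-in request manager.
import requests
from abstract_webtools.managers.crawlManager import crawlManager

class _StubReqMgr:                      # hypothetical helper, not part of the package
    def __init__(self, url):
        self.session = requests.Session()
        self.source_code = self.session.get(url, timeout=30).text

url = "https://example.com"             # placeholder URL
crawler = crawlManager(url=url, req_mgr=_StubReqMgr(url), url_mgr=None)

print(sorted(crawler.links_on_page()))  # absolute links found on the start page
print(crawler.crawl(max_depth=1))       # set of internal URLs visited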
abstract_webtools/managers/curlMgr.py
@@ -1,48 +1,26 @@
-import os
-    ... (removed lines 2-21 are not shown in this diff view)
-            f'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 '
-            f'Safari/537.36" -H "Accept: */*" "{website}"'
-        )
-
-def download_site(website, destination_dir, filename):
-    os.makedirs(destination_dir, exist_ok=True)
-    os.chmod(destination_dir, 0o755) # set directory permissions if needed
-
-    destination_path = os.path.join(destination_dir, filename)
-
-    # GET the resource
-    response = requests.get(website, headers={
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                      "AppleWebKit/537.36 (KHTML, like Gecko) "
-                      "Chrome/91.0.4472.124 Safari/537.36",
-        "Accept": "*/*"
-    }, allow_redirects=True)
-
-    # Raise an exception if the download fails
-    response.raise_for_status()
-
-    # Write content to file
+import os, subprocess, requests
+
+def curl_download(website, destination_path, user_agent=None):
+    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+    ua = user_agent or ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                        "AppleWebKit/537.36 (KHTML, like Gecko) "
+                        "Chrome/91.0.4472.124 Safari/537.36")
+    subprocess.run([
+        "curl","-L","--output", destination_path,
+        "-H", f"User-Agent: {ua}",
+        "-H", "Accept: */*",
+        website
+    ], check=True)
+
+def requests_download(website, destination_path, headers=None):
+    os.makedirs(os.path.dirname(destination_path), exist_ok=True)
+    hdr = {"User-Agent": ("Mozilla/5.0 ... Chrome/91.0 Safari/537.36"),
+           "Accept": "*/*"}
+    if headers: hdr.update(headers)
+    r = requests.get(website, headers=hdr, allow_redirects=True, timeout=30)
+    r.raise_for_status()
     with open(destination_path, "wb") as f:
-        f.write(
-
-
-
+        f.write(r.content)
+
+if __name__ == "__main__":
+    pass # no side effects
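The rewrite replaces download_site with two importable helpers, curl_download (shells out to the curl binary) and requests_download (pure requests), and adds an explicit no-op __main__ guard. An illustrative sketch; the URLs and output paths are placeholders, and curl_download assumes a curl executable on PATH:

# Usage sketch for the rewritten curlMgr helpers.
from abstract_webtools.managers.curlMgr import curl_download, requests_download

# Follows redirects and sends a browser-like User-Agent via curl.
curl_download("https://example.com/", "/tmp/example/index.html")

# Same idea with requests; extra headers are merged over the defaults.
requests_download("https://example.com/robots.txt", "/tmp/example/robots.txt",
                  headers={"Accept": "text/plain"})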
abstract_webtools/managers/meta_dump.py
@@ -0,0 +1,27 @@
+# meta_dump.py
+from abstract_webtools.managers.networkManager import NetworkManager
+from abstract_webtools.managers.userAgentManager import UserAgentManager
+from abstract_webtools.managers.soupManager.soupManager import soupManager
+import json, sys
+
+def dump_all_meta(url: str):
+    ua = UserAgentManager(browser="Chrome", operating_system="Windows")
+    net = NetworkManager(user_agent_manager=ua)
+
+    r = net.session.get(url, timeout=30)
+    r.raise_for_status()
+
+    sm = soupManager(url=url, source_code=r.text, req_mgr=net)
+    out = {
+        "url": url,
+        "title": sm.soup.title.string.strip() if sm.soup.title and sm.soup.title.string else None,
+        "meta": sm.all_meta(),
+        "citation": sm.citation_dict(),
+        "links": sm.all_links(),
+        "json_ld": sm.all_jsonld(),
+    }
+    print(json.dumps(out, indent=2, ensure_ascii=False))
+
+if __name__ == "__main__":
+    url = sys.argv[1]
+    dump_all_meta(url)
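Since the argv handling sits under the __main__ guard, dump_all_meta can be imported and called directly as well as run as a script; a minimal sketch with a placeholder URL:

# meta_dump as a library call rather than a script.
from abstract_webtools.managers.meta_dump import dump_all_meta

dump_all_meta("https://example.com")   # prints the collected metadata as JSON to stdout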
abstract_webtools/managers/networkManager.py
@@ -1,15 +1,50 @@
+from typing import Optional, List
+import requests
 from ..abstract_webtools import *
-from
+from .sslManager import SSLManager
+from .cipherManager import CipherManager
+
+class TLSAdapter(HTTPAdapter):
+    def __init__(self, ssl_manager: SSLManager=None):
+        ssl_manager = ssl_manager or SSLManager()
+        self.ssl_context = ssl_manager.ssl_context
+        super().__init__()
+    def init_poolmanager(self, *args, **kwargs):
+        kwargs['ssl_context'] = self.ssl_context
+        return super().init_poolmanager(*args, **kwargs)
+
 class NetworkManager:
-    def __init__(self, user_agent_manager=None,ssl_manager=None,
-
-
-        self.
-
-
-
-
-
-        self.
-        self.
-
+    def __init__(self, user_agent_manager=None, ssl_manager=None, proxies=None, cookies=None,
+                 ciphers=None, certification: Optional[str]=None, ssl_options: Optional[List[str]]=None):
+        self.ua_mgr = user_agent_manager or UserAgentManager()
+        self.ssl_mgr = ssl_manager or SSLManager(
+            ciphers=ciphers or CipherManager().ciphers_string,
+            ssl_options=ssl_options,
+            certification=certification
+        )
+
+        self.session = requests.Session()
+        self.session.headers.update({
+            "User-Agent": self.ua_mgr.user_agent,
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.9",
+            "Connection": "keep-alive"
+        })
+        adapter = TLSAdapter(self.ssl_mgr)
+        self.session.mount("https://", adapter)
+        self.session.mount("http://", HTTPAdapter())
+
+        if proxies:
+            self.session.proxies = proxies
+        if cookies:
+            if isinstance(cookies, requests.cookies.RequestsCookieJar):
+                self.session.cookies = cookies
+            elif isinstance(cookies, dict):
+                jar = requests.cookies.RequestsCookieJar()
+                for k,v in cookies.items(): jar.set(k,v)
+                self.session.cookies = jar
+            # if string: up to you—parse or ignore
+
+        # retries (optional)
+        from requests.adapters import Retry
+        self.session.adapters['https://'].max_retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
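NetworkManager now hands back a fully configured requests.Session: browser-like default headers, the TLSAdapter mounted for HTTPS, optional proxies and cookies, and a Retry policy on the HTTPS adapter. A usage sketch built from the constructor arguments shown in this diff (all values are placeholders):

# Sketch: building a session through the new NetworkManager.
from abstract_webtools.managers.networkManager import NetworkManager
from abstract_webtools.managers.userAgentManager import UserAgentManager

net = NetworkManager(
    user_agent_manager=UserAgentManager(browser="Chrome", operating_system="Windows"),
    cookies={"session": "abc123"},            # dicts are converted to a RequestsCookieJar
    # proxies={"https": "http://127.0.0.1:8080"},
)
resp = net.session.get("https://example.com", timeout=30)
resp.raise_for_status()
print(resp.status_code, resp.headers.get("content-type"))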
abstract_webtools/managers/soupManager/soupManager.py
@@ -1,6 +1,9 @@
 from ...abstract_webtools import *
 from ..urlManager import *
 from ..requestManager import *
+from bs4 import BeautifulSoup
+import re, json
+
 class soupManager:
     """
     SoupManager is a class for managing and parsing HTML source code using BeautifulSoup.
@@ -39,25 +42,49 @@ class soupManager:
     - The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
     - It provides various methods to extract data and discover elements within the source code.
     """
-
-
-        url =
-        self.
-        self.
-        self.
-    ... (removed lines 48-60 are not shown in this diff view)
+
+    def __init__(self, url=None, source_code=None, req_mgr=None, parse_type="html.parser"):
+        self.url = url
+        self.req_mgr = req_mgr
+        self.source_code = (source_code or (req_mgr.source_code if req_mgr else "")) or ""
+        self.soup = BeautifulSoup(self.source_code, parse_type)
+
+    def all_meta(self):
+        out = []
+        for m in self.soup.find_all("meta"):
+            row = {}
+            for k in ("name","property","http-equiv","itemprop","charset","content"):
+                v = m.get(k)
+                if v: row[k] = v
+            if row: out.append(row)
+        return out
+
+    def citation_dict(self):
+        out = {}
+        for m in self.soup.find_all("meta"):
+            k = (m.get("name") or m.get("property") or "").lower()
+            if k.startswith("citation_") and m.get("content"):
+                out.setdefault(k, []).append(m["content"])
+        return out
+
+    def all_links(self):
+        res = []
+        for l in self.soup.find_all("link"):
+            rel = l.get("rel")
+            if isinstance(rel, list): rel = " ".join(rel)
+            res.append({
+                "rel": rel, "href": l.get("href"),
+                "type": l.get("type"), "title": l.get("title"), "hreflang": l.get("hreflang")
+            })
+        return res
+
+    def all_jsonld(self):
+        blocks = []
+        for s in self.soup.find_all("script", type=re.compile("application/ld\\+json", re.I)):
+            txt = s.get_text(strip=True)
+            try: blocks.append(json.loads(txt))
+            except Exception: blocks.append({"raw": txt})
+        return blocks
     def re_initialize(self):
         self.soup= BeautifulSoup(self.source_code, self.parse_type)
         self._all_links_data = None
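The new extraction helpers operate on whatever HTML the manager holds, so they can be exercised without a request manager or network access. A self-contained sketch with a tiny inline document (expected output shown as comments, trimmed):

# Sketch: running the new soupManager helpers on an inline HTML string.
from abstract_webtools.managers.soupManager.soupManager import soupManager

html = (
    '<html><head><title>Demo</title>'
    '<meta name="citation_title" content="A Paper">'
    '<link rel="canonical" href="https://example.com/">'
    '<script type="application/ld+json">{"@type": "Article"}</script>'
    '</head><body></body></html>'
)

sm = soupManager(url="https://example.com", source_code=html)
print(sm.all_meta())       # [{'name': 'citation_title', 'content': 'A Paper'}]
print(sm.citation_dict())  # {'citation_title': ['A Paper']}
print(sm.all_links())      # [{'rel': 'canonical', 'href': 'https://example.com/', ...}]
print(sm.all_jsonld())     # [{'@type': 'Article'}]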
abstract_webtools/managers/sslManager.py
@@ -1,12 +1,21 @@
 from ..abstract_webtools import *
+# sslManager.py
+from ..abstract_webtools import *   # must expose ssl, ssl_
+from .cipherManager import CipherManager  # be explicit, safer
+
 class SSLManager:
     def __init__(self, ciphers=None, ssl_options=None, certification=None):
         self.ciphers = ciphers or CipherManager().ciphers_string
         self.ssl_options = ssl_options or self.get_default_ssl_settings()
         self.certification = certification or ssl.CERT_REQUIRED
         self.ssl_context = self.get_context()
+
     def get_default_ssl_settings(self):
         return ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1 | ssl.OP_NO_COMPRESSION
-    def get_context(self):
-        return ssl_.create_urllib3_context(ciphers=self.ciphers, cert_reqs=self.certification, options=self.ssl_options)
 
+    def get_context(self):
+        return ssl_.create_urllib3_context(
+            ciphers=self.ciphers,
+            cert_reqs=self.certification,
+            options=self.ssl_options
+        )
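The reflowed get_context() is behaviour-preserving: it still builds a urllib3 SSL context from the cipher string, certificate requirements, and options, and that context is what the TLSAdapter in networkManager.py mounts onto a session. A small sketch combining the two (placeholder URL, network access assumed):

# Sketch: using SSLManager's context through the TLSAdapter from networkManager.py.
import requests
from abstract_webtools.managers.sslManager import SSLManager
from abstract_webtools.managers.networkManager import TLSAdapter

ssl_mgr = SSLManager()                    # default ciphers; TLSv1/1.1 and compression disabled
session = requests.Session()
session.mount("https://", TLSAdapter(ssl_mgr))
print(session.get("https://example.com", timeout=30).status_code)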
abstract_webtools/managers/userAgentManager.py
@@ -1,29 +1,27 @@
+# userAgentManager.py
 from ..abstract_webtools import *
 import random
+
 operating_systems = ['Macintosh','Windows','Linux']
 browsers = ['Firefox','Chrome','IceDragon','Waterfox','Gecko','Safari','MetaSr']
-
-
-
-    if
-    ... (removed lines 9-14 are not shown in this diff view)
-    return itter_list[0]
-def get_browser(browser=None):
-    return get_itter(browser,browsers)
-def get_operating_system(operating_system=None):
-    return get_itter(operating_system,operating_systems)
+
+def _pick(val, options):
+    if not val: return options[0]
+    if val in options: return val
+    l = val.lower()
+    for o in options:
+        if l in o.lower():
+            return o
+    return options[0]
+
 class UserAgentManager:
-    def __init__(self, operating_system=None, browser=None, version=None,user_agent=None):
-        self.operating_system =
-        self.browser =
+    def __init__(self, operating_system=None, browser=None, version=None, user_agent=None):
+        self.operating_system = _pick(operating_system, operating_systems)
+        self.browser = _pick(browser, browsers)
         self.version = version or '42.0'
         self.user_agent = user_agent or self.get_user_agent()
-        self.header = self.
+        self.header = {"user-agent": self.user_agent}
+
     @staticmethod
     def user_agent_db():
         from ..big_user_agent_list import big_user_agent_dict
@@ -31,30 +29,23 @@ class UserAgentManager:
 
     def get_user_agent(self):
         ua_db = self.user_agent_db()
+        os_db = ua_db.get(self.operating_system) or random.choice(list(ua_db.values()))
+        br_db = os_db.get(self.browser) or random.choice(list(os_db.values()))
+        if self.version in br_db:
+            return br_db[self.version]
+        return random.choice(list(br_db.values()))
 
-        if self.operating_system and self.operating_system in ua_db:
-            operating_system_db = ua_db[self.operating_system]
-        else:
-            operating_system_db = random.choice(list(ua_db.values()))
-
-        if self.browser and self.browser in operating_system_db:
-            browser_db = operating_system_db[self.browser]
-        else:
-            browser_db = random.choice(list(operating_system_db.values()))
-
-        if self.version and self.version in browser_db:
-            return browser_db[self.version]
-        else:
-            return random.choice(list(browser_db.values()))
-
-    def user_agent_header(self):
-        return {"user-agent": self.user_agent}
 class UserAgentManagerSingleton:
     _instance = None
+
     @staticmethod
-    def get_instance(
+    def get_instance(**kwargs):
+        ua = kwargs.get("user_agent")
         if UserAgentManagerSingleton._instance is None:
-            UserAgentManagerSingleton._instance = UserAgentManager(
-
-
+            UserAgentManagerSingleton._instance = UserAgentManager(**kwargs)
+        else:
+            # rebuild if user_agent explicitly changed
+            inst = UserAgentManagerSingleton._instance
+            if ua and ua != inst.user_agent:
+                UserAgentManagerSingleton._instance = UserAgentManager(**kwargs)
         return UserAgentManagerSingleton._instance
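_pick makes the OS and browser arguments tolerant of partial, lower-case names, and the header dict is now built in __init__ (the old user_agent_header() helper is gone). A quick sketch; the resolved user-agent string comes from big_user_agent_dict, so it is shown as a placeholder:

# Sketch: fuzzy OS/browser resolution in the reworked UserAgentManager.
from abstract_webtools.managers.userAgentManager import UserAgentManager

ua = UserAgentManager(operating_system="win", browser="chrome")
print(ua.operating_system)  # 'Windows'  (matched case-insensitively by _pick)
print(ua.browser)           # 'Chrome'
print(ua.header)            # {'user-agent': '<string selected from big_user_agent_dict>'}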
{abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.146.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.145
+Version: 0.1.6.146
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.146.dist-info}/RECORD
@@ -12,20 +12,20 @@ abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE7
 abstract_webtools/url_grabber_new.py,sha256=xb23qo4anOY0Ax3CAfaHJ8s5VEz61Sinh-XpEDFW7Is,3621
 abstract_webtools/managers/__init__.py,sha256=RXQAK5z9nYlocM91P2OC4jR352-MiqT5bAi4xZl7_FU,470
 abstract_webtools/managers/allss\.py,sha256=IBhlyRQHfK-BtwUnSEbIPqlI1MtZ8-XsdaHv0b91HQ0,269
-abstract_webtools/managers/cipherManager.py,sha256=
-abstract_webtools/managers/crawlManager.py,sha256=
+abstract_webtools/managers/cipherManager.py,sha256=trmjY6AoBDKnh4oprtJaGhGV-jyNmyUoPnw44s0C4PA,1707
+abstract_webtools/managers/crawlManager.py,sha256=mytGHr4bQDboq0q9XhqtPApJt48sL1KlMFsUV1MxvxM,13141
 abstract_webtools/managers/crawlmgr2.py,sha256=PvHas-FSlp98osc-2so9zw-2c7amUMdwIj6tmc6Rl00,1910
-abstract_webtools/managers/curlMgr.py,sha256=
+abstract_webtools/managers/curlMgr.py,sha256=gM_TzsnYIQGiK__YXarHt5XpRLdE-RgaJqRYKzsXm34,1025
 abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
 abstract_webtools/managers/dynamicRateLimiter.py,sha256=ycn5VQEPnmxjNMew4IVh-t5t43jhM39uypoOK9bJDDg,7662
 abstract_webtools/managers/get_test.py,sha256=nISrhUGdyvRv18wTGoifGhizBFoHeK0N3FymMASloFw,825
+abstract_webtools/managers/meta_dump.py,sha256=3U-P-CRF5YfjtJuIoGlt6XMOu8Xdt1ijZNxfT9lmH1w,937
 abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
-abstract_webtools/managers/networkManager.py,sha256=
+abstract_webtools/managers/networkManager.py,sha256=eVnQACciE0r3E-xYqLjqNc26c_VCUZsiajusx2q4Pu4,2164
 abstract_webtools/managers/seleneumManager.py,sha256=wyo4SpocgRz3W50b33GW3po32_uxYwmdE1TFZ_0k07s,9539
-abstract_webtools/managers/
-abstract_webtools/managers/sslManager.py,sha256=I9YUqJo8_KwLOwfBTAoSfzKSfR4Vtjw1HQXsXRnCV-g,641
+abstract_webtools/managers/sslManager.py,sha256=jvWFnZ80Quyb-kD8C41xWGC1dEzR4vvTS_QUIXgmPQQ,827
 abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
-abstract_webtools/managers/userAgentManager.py,sha256=
+abstract_webtools/managers/userAgentManager.py,sha256=Lmpa0cvTkzXJ51Lmfcb_TuPPSNETcJbHY38pyQadaIU,1885
 abstract_webtools/managers/videoDownloader.py,sha256=mKXhKYNnJwPaiqAsHvFTBGLdXFgR3wdV0G1OIimiKbE,15424
 abstract_webtools/managers/videoDownloader2.py,sha256=v3H6akdhvVWGrB-r35m3cp_-aKkNWadpfCiMylOnv6w,12748
 abstract_webtools/managers/clownworld/__init__.py,sha256=eq25euhRbFqHLm1ibi_7FGz_oNWs-kkyAkETzK3r4_Q,35
@@ -42,11 +42,11 @@ abstract_webtools/managers/requestManager/__init__.py,sha256=z2qGtweEoO_OKr959LG
 abstract_webtools/managers/requestManager/requestManager.py,sha256=0d1Z5dFIjOg8KyJakzOilJiiq6SR3iKUr5vfnssWDu8,20048
 abstract_webtools/managers/soupManager/__init__.py,sha256=mqfXfqM9sWlYpOkoXUqtBoVvk2KQx1862NnmRVJwGtY,27
 abstract_webtools/managers/soupManager/asoueces.py,sha256=OaXqolZl0dI7b09NYwJ3Wnhuxf89ahZ1GjsOqy0GXfk,3506
-abstract_webtools/managers/soupManager/soupManager.py,sha256=
+abstract_webtools/managers/soupManager/soupManager.py,sha256=6vWYnZGuimStbNiuH_V4UMPRk1W0SZo_fZkh8f7fVmM,17938
 abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
 abstract_webtools/managers/urlManager/urlManager (Copy).py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
 abstract_webtools/managers/urlManager/urlManager.py,sha256=vY4KQXtcrlC2YtlultxQpVe581l5kAuT5VGA0WrI16g,8945
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools-0.1.6.146.dist-info/METADATA,sha256=TirJooA5jA2sXf7KpS0Hy24m4LpvMK2TmtFMbcTaH_E,7289
+abstract_webtools-0.1.6.146.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+abstract_webtools-0.1.6.146.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.146.dist-info/RECORD,,
abstract_webtools/managers/seleniumManager.py
@@ -1,119 +0,0 @@
-import os
-from ..abstract_webtools import *
-from .urlManager import *
-from urllib.parse import urlparse
-from abstract_utilities import *
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-import logging
-import urllib3
-
-# Suppress urllib3 warnings and debug logs
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-logging.getLogger("urllib3").setLevel(logging.WARNING)
-
-# Suppress Selenium logs
-logging.getLogger("selenium").setLevel(logging.WARNING)
-
-import os
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-
-# Setup Chrome options
-chrome_options = Options()
-#chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
-chrome_options.add_argument("--headless") # Run in headless mode
-chrome_options.add_argument("--no-sandbox")
-chrome_options.add_argument("--disable-dev-shm-usage")
-chrome_options.add_argument("--disable-gpu")
-chrome_options.add_argument("--disable-software-rasterizer")
-chrome_options.add_argument("--disable-extensions")
-chrome_options.add_argument("--remote-debugging-port=9222")
-
-
-class SingletonMeta(type):
-    _instances = {}
-    def __call__(cls, *args, **kwargs):
-        if cls not in cls._instances:
-            instance = super().__call__(*args, **kwargs)
-            cls._instances[cls] = instance
-        return cls._instances[cls]
-
-class seleniumManager(metaclass=SingletonMeta):
-    def __init__(self, url):
-        if not hasattr(self, 'initialized'): # Prevent reinitialization
-            self.initialized = True
-            parsed_url = urlparse(url)
-            self.domain = parsed_url.netloc
-            self.scheme = parsed_url.scheme
-            self.base_url= f"{self.scheme}{self.domain}"
-            self.site_dir = os.path.join(os.getcwd(), self.domain)
-            os.makedirs(self.site_dir, exist_ok=True)
-            self.drivers = {}
-            self.page_type = []
-
-    def get_url_to_path(self, url):
-        url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
-        parsed_url = urlparse(url)
-        if parsed_url.netloc == self.domain:
-            paths = parsed_url.path.split('/')
-            dir_path = self.site_dir
-            for path in paths[:-1]:
-                dir_path = os.path.join(dir_path, path)
-                os.makedirs(dir_path, exist_ok=True)
-            self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
-
-            dir_path = os.path.join(dir_path, paths[-1])
-            return dir_path
-
-    def saved_url_check(self, url):
-        path = self.get_url_to_path(url)
-        return path
-
-    def get_with_netloc(self, url):
-        parsed_url = urlparse(url)
-        if parsed_url.netloc == '':
-            url = f"{self.scheme}://{self.domain}/{url.strip()}"
-        return url
-
-    def get_driver(self, url):
-        if url and url not in self.drivers:
-            chrome_options = Options()
-            chrome_options.add_argument("--headless")
-            driver = webdriver.Chrome(options=chrome_options)
-            self.drivers[url] = driver
-            driver.get(url)
-        return self.drivers[url]
-def normalize_url(url, base_url=None):
-    """
-    Normalize and resolve relative URLs, ensuring proper domain and format.
-    """
-    # If URL starts with the base URL repeated, remove the extra part
-    manager = seleniumManager(url)
-    base_url = manager.base_url
-    if url.startswith(base_url):
-        url = url[len(base_url):]
-
-    # Resolve the URL against the base URL
-    normalized_url = urljoin(base_url, url.split('#')[0])
-
-    # Ensure only URLs belonging to the base domain are kept
-    if not normalized_url.startswith(base_url):
-        return None
-
-    return normalized_url
-# Function to get Selenium page source
-def get_selenium_source(url):
-    url_mgr = urlManager(url)
-    if url_mgr.url:
-        url = str(url_mgr.url)
-    manager = seleniumManager(url)
-    driver = manager.get_driver(url)
-    try:
-        # Get page source
-        page_source = driver.page_source
-        return page_source
-    finally:
-        # Don't quit the driver unless you're done with all interactions
-        pass
-
{abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.146.dist-info}/WHEEL
RENAMED
File without changes

{abstract_webtools-0.1.6.145.dist-info → abstract_webtools-0.1.6.146.dist-info}/top_level.txt
RENAMED
File without changes