abstract-webtools 0.1.6.145__py3-none-any.whl → 0.1.6.146__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,21 +1,20 @@
1
1
  class CipherManager:
2
2
  @staticmethod
3
- def get_default_ciphers()-> list:
3
+ def get_default_ciphers() -> list:
4
4
  return [
5
- "ECDHE-RSA-AES256-GCM-SHA384", "ECDHE-ECDSA-AES256-GCM-SHA384",
6
- "ECDHE-RSA-AES256-SHA384", "ECDHE-ECDSA-AES256-SHA384",
7
- "ECDHE-RSA-AES256-SHA", "ECDHE-ECDSA-AES256-SHA",
8
- "ECDHE-RSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-SHA256",
9
- "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-ECDSA-AES128-SHA256",
10
- "AES256-SHA", "AES128-SHA"
5
+ "ECDHE-RSA-AES256-GCM-SHA384","ECDHE-ECDSA-AES256-GCM-SHA384",
6
+ "ECDHE-RSA-AES256-SHA384","ECDHE-ECDSA-AES256-SHA384",
7
+ "ECDHE-RSA-AES256-SHA","ECDHE-ECDSA-AES256-SHA",
8
+ "ECDHE-RSA-AES128-GCM-SHA256","ECDHE-RSA-AES128-SHA256",
9
+ "ECDHE-ECDSA-AES128-GCM-SHA256","ECDHE-ECDSA-AES128-SHA256",
10
+ "AES256-SHA","AES128-SHA"
11
11
  ]
12
+ def __init__(self, cipher_list=None):
13
+ self.cipher_list = cipher_list or self.get_default_ciphers()
14
+ if isinstance(self.cipher_list, str):
15
+ self.cipher_list = [c.strip() for c in self.cipher_list.split(',') if c.strip()]
16
+ self.ciphers_string = ','.join(self.cipher_list) if self.cipher_list else ''
12
17
 
13
- def __init__(self,cipher_list=None):
14
- if cipher_list == None:
15
- cipher_list=self.get_default_ciphers()
16
- self.cipher_list = cipher_list
17
- self.create_list()
18
- self.ciphers_string = self.add_string_list()
19
18
  def add_string_list(self):
20
19
  if len(self.cipher_list)==0:
21
20
  return ''
@@ -119,14 +119,42 @@ if __name__ == "__main__":
119
119
  generator.run()
120
120
 
121
121
  class crawlManager:
122
- def __init__(self, url=None, req_mgr=None, url_mgr=None, source_code=None, parse_type="html.parser"):
122
+ def __init__(self, url, req_mgr, url_mgr, source_code=None, parse_type="html.parser"):
123
+ self.url_mgr = url_mgr
124
+ self.req_mgr = req_mgr
123
125
  self.url = url
124
- self.source_code = source_code
125
126
  self.parse_type = parse_type
126
- self.url_mgr = url_mgr or urlManager(url=self.url)
127
- self.req_mgr = req_mgr or requestManager(url_mgr=self.url_mgr)
128
- self.get_new_source_and_url(url)
129
- self.sime_map_mgr = SitemapGenerator(self.url_mgr.domain)
127
+ self.source_code = source_code or req_mgr.source_code
128
+ self.soup = BeautifulSoup(self.source_code or "", parse_type)
129
+ self.base_netloc = urlparse(self.url).netloc
130
+
131
+ def is_internal(self, link):
132
+ u = urlparse(link)
133
+ return (not u.netloc) or (u.netloc == self.base_netloc)
134
+
135
+ def links_on_page(self):
136
+ out = set()
137
+ for a in self.soup.find_all("a", href=True):
138
+ out.add(urljoin(self.url, a["href"]))
139
+ return out
140
+
141
+ def crawl(self, start=None, max_depth=2, _depth=0, visited=None, session=None):
142
+ start = start or self.url
143
+ visited = visited or set()
144
+ if _depth > max_depth or start in visited:
145
+ return visited
146
+ visited.add(start)
147
+
148
+ # fetch
149
+ r = self.req_mgr.session.get(start, timeout=30)
150
+ r.raise_for_status()
151
+ soup = BeautifulSoup(r.text, self.parse_type)
152
+
153
+ for a in soup.find_all("a", href=True):
154
+ link = urljoin(start, a["href"])
155
+ if self.is_internal(link) and link not in visited:
156
+ self.crawl(link, max_depth=max_depth, _depth=_depth+1, visited=visited)
157
+ return visited
130
158
  def get_new_source_and_url(self, url=None):
131
159
  """Fetches new source code and response for a given URL."""
132
160
  url = url
@@ -194,26 +222,7 @@ class crawlManager:
194
222
  return ('yearly', '0.3')
195
223
  return ('weekly', '1.0')
196
224
 
197
- def crawl(self, url, max_depth=3, depth=1, visited=None):
198
- """Recursively crawls the site up to max_depth and returns valid internal links."""
199
- visited = visited or set()
200
- if depth > max_depth or url in visited:
201
- return []
202
-
203
- visited.add(url)
204
- try:
205
- soup = get_soup(url)
206
- links = []
207
- for tag in soup.find_all('a', href=True):
208
- link = urljoin(url, tag['href'])
209
- if urlparse(link).netloc == urlparse(url).netloc and link not in visited:
210
- links.append(link)
211
- self.crawl(link, max_depth, depth + 1, visited)
212
- return links
213
- except Exception as e:
214
- print(f"Error crawling {url}: {e}")
215
- return []
216
-
225
+
217
226
  def get_meta_info(self, url=None):
218
227
  """Fetches metadata, including title and meta tags, from the page."""
219
228
  url = url or self.url
@@ -1,48 +1,26 @@
1
- import os
2
- import requests
3
- import os
4
- import subprocess
5
- import stat
6
-
7
- def get_site(website, destination_dir, filename):
8
- # Ensure the directory exists
9
- os.makedirs(destination_dir, exist_ok=True)
10
-
11
- # Adjust directory permissions if needed (e.g. rwxr-xr-x -> 0o755)
12
- os.chmod(destination_dir, 0o755)
13
-
14
- # Construct the complete file path
15
- destination_path = os.path.join(destination_dir, filename)
16
-
17
- # Use curl to download the site
18
- # The example user-agent is arbitrary; you can change it to your needs
19
- os.system(
20
- f'curl -L --output "{destination_path}" '
21
- f'-H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
22
- f'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 '
23
- f'Safari/537.36" -H "Accept: */*" "{website}"'
24
- )
25
-
26
- def download_site(website, destination_dir, filename):
27
- os.makedirs(destination_dir, exist_ok=True)
28
- os.chmod(destination_dir, 0o755) # set directory permissions if needed
29
-
30
- destination_path = os.path.join(destination_dir, filename)
31
-
32
- # GET the resource
33
- response = requests.get(website, headers={
34
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
35
- "AppleWebKit/537.36 (KHTML, like Gecko) "
36
- "Chrome/91.0.4472.124 Safari/537.36",
37
- "Accept": "*/*"
38
- }, allow_redirects=True)
39
-
40
- # Raise an exception if the download fails
41
- response.raise_for_status()
42
-
43
- # Write content to file
1
+ import os, subprocess, requests
2
+
3
+ def curl_download(website, destination_path, user_agent=None):
4
+ os.makedirs(os.path.dirname(destination_path), exist_ok=True)
5
+ ua = user_agent or ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
6
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
7
+ "Chrome/91.0.4472.124 Safari/537.36")
8
+ subprocess.run([
9
+ "curl","-L","--output", destination_path,
10
+ "-H", f"User-Agent: {ua}",
11
+ "-H", "Accept: */*",
12
+ website
13
+ ], check=True)
14
+
15
+ def requests_download(website, destination_path, headers=None):
16
+ os.makedirs(os.path.dirname(destination_path), exist_ok=True)
17
+ hdr = {"User-Agent": ("Mozilla/5.0 ... Chrome/91.0 Safari/537.36"),
18
+ "Accept": "*/*"}
19
+ if headers: hdr.update(headers)
20
+ r = requests.get(website, headers=hdr, allow_redirects=True, timeout=30)
21
+ r.raise_for_status()
44
22
  with open(destination_path, "wb") as f:
45
- f.write(response.content)
46
- website = 'https://www.pornhub.com'
47
- destination = '/home/computron/Documents/doge'
48
- get_site(website,destination,'doge')
23
+ f.write(r.content)
24
+
25
+ if __name__ == "__main__":
26
+ pass # no side effects
@@ -0,0 +1,27 @@
1
+ # meta_dump.py
2
+ from abstract_webtools.managers.networkManager import NetworkManager
3
+ from abstract_webtools.managers.userAgentManager import UserAgentManager
4
+ from abstract_webtools.managers.soupManager.soupManager import soupManager
5
+ import json, sys
6
+
7
+ def dump_all_meta(url: str):
8
+ ua = UserAgentManager(browser="Chrome", operating_system="Windows")
9
+ net = NetworkManager(user_agent_manager=ua)
10
+
11
+ r = net.session.get(url, timeout=30)
12
+ r.raise_for_status()
13
+
14
+ sm = soupManager(url=url, source_code=r.text, req_mgr=net)
15
+ out = {
16
+ "url": url,
17
+ "title": sm.soup.title.string.strip() if sm.soup.title and sm.soup.title.string else None,
18
+ "meta": sm.all_meta(),
19
+ "citation": sm.citation_dict(),
20
+ "links": sm.all_links(),
21
+ "json_ld": sm.all_jsonld(),
22
+ }
23
+ print(json.dumps(out, indent=2, ensure_ascii=False))
24
+
25
+ if __name__ == "__main__":
26
+ url = sys.argv[1]
27
+ dump_all_meta(url)
@@ -1,15 +1,50 @@
1
+ from typing import Optional, List
2
+ import requests
1
3
  from ..abstract_webtools import *
2
- from ..big_user_agent_list import *
4
+ from .sslManager import SSLManager
5
+ from .cipherManager import CipherManager
6
+
7
+ class TLSAdapter(HTTPAdapter):
8
+ def __init__(self, ssl_manager: SSLManager=None):
9
+ ssl_manager = ssl_manager or SSLManager()
10
+ self.ssl_context = ssl_manager.ssl_context
11
+ super().__init__()
12
+ def init_poolmanager(self, *args, **kwargs):
13
+ kwargs['ssl_context'] = self.ssl_context
14
+ return super().init_poolmanager(*args, **kwargs)
15
+
3
16
  class NetworkManager:
4
- def __init__(self, user_agent_manager=None,ssl_manager=None, tls_adapter=None,user_agent=None,proxies=None,cookies=None,ciphers=None, certification: Optional[str] = None, ssl_options: Optional[List[str]] = None):
5
- if ssl_manager == None:
6
- ssl_manager = SSLManager(ciphers=ciphers, ssl_options=ssl_options, certification=certification)
7
- self.ssl_manager=ssl_manager
8
- if tls_adapter == None:
9
- tls_adapter=TLSAdapter(ssl_manager=ssl_manager,ciphers=ciphers, certification=certification, ssl_options=ssl_options)
10
- self.tls_adapter=tls_adapter
11
- self.ciphers=tls_adapter.ciphers
12
- self.certification=tls_adapter.certification
13
- self.ssl_options=tls_adapter.ssl_options
14
- self.proxies=None or {}
15
- self.cookies=cookies or "cb4c883efc59d0e990caf7508902591f4569e7bf-1617321078-0-150"
17
+ def __init__(self, user_agent_manager=None, ssl_manager=None, proxies=None, cookies=None,
18
+ ciphers=None, certification: Optional[str]=None, ssl_options: Optional[List[str]]=None):
19
+ self.ua_mgr = user_agent_manager or UserAgentManager()
20
+ self.ssl_mgr = ssl_manager or SSLManager(
21
+ ciphers=ciphers or CipherManager().ciphers_string,
22
+ ssl_options=ssl_options,
23
+ certification=certification
24
+ )
25
+
26
+ self.session = requests.Session()
27
+ self.session.headers.update({
28
+ "User-Agent": self.ua_mgr.user_agent,
29
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
30
+ "Accept-Language": "en-US,en;q=0.9",
31
+ "Connection": "keep-alive"
32
+ })
33
+ adapter = TLSAdapter(self.ssl_mgr)
34
+ self.session.mount("https://", adapter)
35
+ self.session.mount("http://", HTTPAdapter())
36
+
37
+ if proxies:
38
+ self.session.proxies = proxies
39
+ if cookies:
40
+ if isinstance(cookies, requests.cookies.RequestsCookieJar):
41
+ self.session.cookies = cookies
42
+ elif isinstance(cookies, dict):
43
+ jar = requests.cookies.RequestsCookieJar()
44
+ for k,v in cookies.items(): jar.set(k,v)
45
+ self.session.cookies = jar
46
+ # if string: up to you—parse or ignore
47
+
48
+ # retries (optional)
49
+ from requests.adapters import Retry
50
+ self.session.adapters['https://'].max_retries = Retry(total=5, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
@@ -1,6 +1,9 @@
1
1
  from ...abstract_webtools import *
2
2
  from ..urlManager import *
3
3
  from ..requestManager import *
4
+ from bs4 import BeautifulSoup
5
+ import re, json
6
+
4
7
  class soupManager:
5
8
  """
6
9
  SoupManager is a class for managing and parsing HTML source code using BeautifulSoup.
@@ -39,25 +42,49 @@ class soupManager:
39
42
  - The SoupManager class is designed for parsing HTML source code using BeautifulSoup.
40
43
  - It provides various methods to extract data and discover elements within the source code.
41
44
  """
42
- def __init__(self,url=None,source_code=None,url_mgr=None,req_mgr=None,soup=None, parse_type="html.parser"):
43
- self.soup=[]
44
- url = get_url(url=url,url_mgr=url_mgr)
45
- self.url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
46
- self.url=self.url_mgr.url
47
- self.req_mgr = get_req_mgr(req_mgr=req_mgr,url=self.url,url_mgr=self.url_mgr,source_code=source_code)
48
- self.parse_type = parse_type
49
- source_code = source_code or self.req_mgr.source_code or self.req_mgr.source_code_bytes
50
- if source_code:
51
- source_code = str(source_code)
52
- self.source_code = source_code or ''
53
- self.soup= soup or BeautifulSoup(self.source_code, self.parse_type)
54
- self.all_tags_and_attribute_names = self.get_all_tags_and_attribute_names()
55
- self.all_tags = self.all_tags_and_attribute_names.get('tags')
56
- self.all_attribute_names = self.all_tags_and_attribute_names.get('attributes')
57
- self.all_tags_and_attributes = self.all_tags + self.all_attribute_names
58
-
59
- self._all_links_data = None
60
- self._meta_tags_data = None
45
+
46
+ def __init__(self, url=None, source_code=None, req_mgr=None, parse_type="html.parser"):
47
+ self.url = url
48
+ self.req_mgr = req_mgr
49
+ self.source_code = (source_code or (req_mgr.source_code if req_mgr else "")) or ""
50
+ self.soup = BeautifulSoup(self.source_code, parse_type)
51
+
52
+ def all_meta(self):
53
+ out = []
54
+ for m in self.soup.find_all("meta"):
55
+ row = {}
56
+ for k in ("name","property","http-equiv","itemprop","charset","content"):
57
+ v = m.get(k)
58
+ if v: row[k] = v
59
+ if row: out.append(row)
60
+ return out
61
+
62
+ def citation_dict(self):
63
+ out = {}
64
+ for m in self.soup.find_all("meta"):
65
+ k = (m.get("name") or m.get("property") or "").lower()
66
+ if k.startswith("citation_") and m.get("content"):
67
+ out.setdefault(k, []).append(m["content"])
68
+ return out
69
+
70
+ def all_links(self):
71
+ res = []
72
+ for l in self.soup.find_all("link"):
73
+ rel = l.get("rel")
74
+ if isinstance(rel, list): rel = " ".join(rel)
75
+ res.append({
76
+ "rel": rel, "href": l.get("href"),
77
+ "type": l.get("type"), "title": l.get("title"), "hreflang": l.get("hreflang")
78
+ })
79
+ return res
80
+
81
+ def all_jsonld(self):
82
+ blocks = []
83
+ for s in self.soup.find_all("script", type=re.compile("application/ld\\+json", re.I)):
84
+ txt = s.get_text(strip=True)
85
+ try: blocks.append(json.loads(txt))
86
+ except Exception: blocks.append({"raw": txt})
87
+ return blocks
61
88
  def re_initialize(self):
62
89
  self.soup= BeautifulSoup(self.source_code, self.parse_type)
63
90
  self._all_links_data = None
@@ -1,12 +1,21 @@
1
1
  from ..abstract_webtools import *
2
+ # sslManager.py
3
+ from ..abstract_webtools import * # must expose ssl, ssl_
4
+ from .cipherManager import CipherManager # be explicit, safer
5
+
2
6
  class SSLManager:
3
7
  def __init__(self, ciphers=None, ssl_options=None, certification=None):
4
8
  self.ciphers = ciphers or CipherManager().ciphers_string
5
9
  self.ssl_options = ssl_options or self.get_default_ssl_settings()
6
10
  self.certification = certification or ssl.CERT_REQUIRED
7
11
  self.ssl_context = self.get_context()
12
+
8
13
  def get_default_ssl_settings(self):
9
14
  return ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1 | ssl.OP_NO_COMPRESSION
10
- def get_context(self):
11
- return ssl_.create_urllib3_context(ciphers=self.ciphers, cert_reqs=self.certification, options=self.ssl_options)
12
15
 
16
+ def get_context(self):
17
+ return ssl_.create_urllib3_context(
18
+ ciphers=self.ciphers,
19
+ cert_reqs=self.certification,
20
+ options=self.ssl_options
21
+ )
@@ -1,29 +1,27 @@
1
+ # userAgentManager.py
1
2
  from ..abstract_webtools import *
2
3
  import random
4
+
3
5
  operating_systems = ['Macintosh','Windows','Linux']
4
6
  browsers = ['Firefox','Chrome','IceDragon','Waterfox','Gecko','Safari','MetaSr']
5
- def get_itter(iter_input,itter_list):
6
- if not iter_input:
7
- return itter_list[0]
8
- if iter_input in itter_list:
9
- return iter_input
10
- iter_input_lower = iter_input.lower()
11
- for itter in itter_list:
12
- itter_lower = itter.lower()
13
- if iter_input_lower in itter_lower:
14
- return itter
15
- return itter_list[0]
16
- def get_browser(browser=None):
17
- return get_itter(browser,browsers)
18
- def get_operating_system(operating_system=None):
19
- return get_itter(operating_system,operating_systems)
7
+
8
+ def _pick(val, options):
9
+ if not val: return options[0]
10
+ if val in options: return val
11
+ l = val.lower()
12
+ for o in options:
13
+ if l in o.lower():
14
+ return o
15
+ return options[0]
16
+
20
17
  class UserAgentManager:
21
- def __init__(self, operating_system=None, browser=None, version=None,user_agent=None):
22
- self.operating_system = get_operating_system(operating_system=operating_system)
23
- self.browser = get_browser(browser=browser)
18
+ def __init__(self, operating_system=None, browser=None, version=None, user_agent=None):
19
+ self.operating_system = _pick(operating_system, operating_systems)
20
+ self.browser = _pick(browser, browsers)
24
21
  self.version = version or '42.0'
25
22
  self.user_agent = user_agent or self.get_user_agent()
26
- self.header = self.user_agent_header()
23
+ self.header = {"user-agent": self.user_agent}
24
+
27
25
  @staticmethod
28
26
  def user_agent_db():
29
27
  from ..big_user_agent_list import big_user_agent_dict
@@ -31,30 +29,23 @@ class UserAgentManager:
31
29
 
32
30
  def get_user_agent(self):
33
31
  ua_db = self.user_agent_db()
32
+ os_db = ua_db.get(self.operating_system) or random.choice(list(ua_db.values()))
33
+ br_db = os_db.get(self.browser) or random.choice(list(os_db.values()))
34
+ if self.version in br_db:
35
+ return br_db[self.version]
36
+ return random.choice(list(br_db.values()))
34
37
 
35
- if self.operating_system and self.operating_system in ua_db:
36
- operating_system_db = ua_db[self.operating_system]
37
- else:
38
- operating_system_db = random.choice(list(ua_db.values()))
39
-
40
- if self.browser and self.browser in operating_system_db:
41
- browser_db = operating_system_db[self.browser]
42
- else:
43
- browser_db = random.choice(list(operating_system_db.values()))
44
-
45
- if self.version and self.version in browser_db:
46
- return browser_db[self.version]
47
- else:
48
- return random.choice(list(browser_db.values()))
49
-
50
- def user_agent_header(self):
51
- return {"user-agent": self.user_agent}
52
38
  class UserAgentManagerSingleton:
53
39
  _instance = None
40
+
54
41
  @staticmethod
55
- def get_instance(user_agent=UserAgentManager().get_user_agent()[0]):
42
+ def get_instance(**kwargs):
43
+ ua = kwargs.get("user_agent")
56
44
  if UserAgentManagerSingleton._instance is None:
57
- UserAgentManagerSingleton._instance = UserAgentManager(user_agent=user_agent)
58
- elif UserAgentManagerSingleton._instance.user_agent != user_agent:
59
- UserAgentManagerSingleton._instance = UserAgentManager(user_agent=user_agent)
45
+ UserAgentManagerSingleton._instance = UserAgentManager(**kwargs)
46
+ else:
47
+ # rebuild if user_agent explicitly changed
48
+ inst = UserAgentManagerSingleton._instance
49
+ if ua and ua != inst.user_agent:
50
+ UserAgentManagerSingleton._instance = UserAgentManager(**kwargs)
60
51
  return UserAgentManagerSingleton._instance
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: abstract_webtools
3
- Version: 0.1.6.145
3
+ Version: 0.1.6.146
4
4
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
5
5
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
6
6
  Author: putkoff
@@ -12,20 +12,20 @@ abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE7
12
12
  abstract_webtools/url_grabber_new.py,sha256=xb23qo4anOY0Ax3CAfaHJ8s5VEz61Sinh-XpEDFW7Is,3621
13
13
  abstract_webtools/managers/__init__.py,sha256=RXQAK5z9nYlocM91P2OC4jR352-MiqT5bAi4xZl7_FU,470
14
14
  abstract_webtools/managers/allss\.py,sha256=IBhlyRQHfK-BtwUnSEbIPqlI1MtZ8-XsdaHv0b91HQ0,269
15
- abstract_webtools/managers/cipherManager.py,sha256=NHQGdR11eNSm-1H-GezD5dyQgsPTJwY5kczt8Sher2s,1621
16
- abstract_webtools/managers/crawlManager.py,sha256=62Ej6AQC6-qXX_EWOmcJ2szNvEjmebFGugMz65HF1qI,12983
15
+ abstract_webtools/managers/cipherManager.py,sha256=trmjY6AoBDKnh4oprtJaGhGV-jyNmyUoPnw44s0C4PA,1707
16
+ abstract_webtools/managers/crawlManager.py,sha256=mytGHr4bQDboq0q9XhqtPApJt48sL1KlMFsUV1MxvxM,13141
17
17
  abstract_webtools/managers/crawlmgr2.py,sha256=PvHas-FSlp98osc-2so9zw-2c7amUMdwIj6tmc6Rl00,1910
18
- abstract_webtools/managers/curlMgr.py,sha256=ghi0QsSAxjZu3HALFST5Kv_262XhHSAPGlQLvmguxPY,1657
18
+ abstract_webtools/managers/curlMgr.py,sha256=gM_TzsnYIQGiK__YXarHt5XpRLdE-RgaJqRYKzsXm34,1025
19
19
  abstract_webtools/managers/domainManager.py,sha256=95znOBv05W77mW_fbZAfl4RmlENDlYqhEOMkL02L220,3610
20
20
  abstract_webtools/managers/dynamicRateLimiter.py,sha256=ycn5VQEPnmxjNMew4IVh-t5t43jhM39uypoOK9bJDDg,7662
21
21
  abstract_webtools/managers/get_test.py,sha256=nISrhUGdyvRv18wTGoifGhizBFoHeK0N3FymMASloFw,825
22
+ abstract_webtools/managers/meta_dump.py,sha256=3U-P-CRF5YfjtJuIoGlt6XMOu8Xdt1ijZNxfT9lmH1w,937
22
23
  abstract_webtools/managers/mySocketClient.py,sha256=-j1Q8Ds9RCSbjZdx3ZF9mVpgwxaO0BBssanUcpYVQoY,2045
23
- abstract_webtools/managers/networkManager.py,sha256=Op2QDXrP-gmm0tCToe-Ryt9xuOtMppcN2KLKP1WZiu0,952
24
+ abstract_webtools/managers/networkManager.py,sha256=eVnQACciE0r3E-xYqLjqNc26c_VCUZsiajusx2q4Pu4,2164
24
25
  abstract_webtools/managers/seleneumManager.py,sha256=wyo4SpocgRz3W50b33GW3po32_uxYwmdE1TFZ_0k07s,9539
25
- abstract_webtools/managers/seleniumManager.py,sha256=RRpA1_oOnZuzzQ4S6VX7tDFcI31E_mOou2CZOOZH6yI,4274
26
- abstract_webtools/managers/sslManager.py,sha256=I9YUqJo8_KwLOwfBTAoSfzKSfR4Vtjw1HQXsXRnCV-g,641
26
+ abstract_webtools/managers/sslManager.py,sha256=jvWFnZ80Quyb-kD8C41xWGC1dEzR4vvTS_QUIXgmPQQ,827
27
27
  abstract_webtools/managers/tlsAdapter.py,sha256=XZSMZz9EUOhv-h3_Waf6mjV1dA3oN_M_oWuoo4VZ_HE,1454
28
- abstract_webtools/managers/userAgentManager.py,sha256=cUaOlcCTzftVBCp9ZHwMXR9IB1wAE-03YSVwUBaIFLM,2514
28
+ abstract_webtools/managers/userAgentManager.py,sha256=Lmpa0cvTkzXJ51Lmfcb_TuPPSNETcJbHY38pyQadaIU,1885
29
29
  abstract_webtools/managers/videoDownloader.py,sha256=mKXhKYNnJwPaiqAsHvFTBGLdXFgR3wdV0G1OIimiKbE,15424
30
30
  abstract_webtools/managers/videoDownloader2.py,sha256=v3H6akdhvVWGrB-r35m3cp_-aKkNWadpfCiMylOnv6w,12748
31
31
  abstract_webtools/managers/clownworld/__init__.py,sha256=eq25euhRbFqHLm1ibi_7FGz_oNWs-kkyAkETzK3r4_Q,35
@@ -42,11 +42,11 @@ abstract_webtools/managers/requestManager/__init__.py,sha256=z2qGtweEoO_OKr959LG
42
42
  abstract_webtools/managers/requestManager/requestManager.py,sha256=0d1Z5dFIjOg8KyJakzOilJiiq6SR3iKUr5vfnssWDu8,20048
43
43
  abstract_webtools/managers/soupManager/__init__.py,sha256=mqfXfqM9sWlYpOkoXUqtBoVvk2KQx1862NnmRVJwGtY,27
44
44
  abstract_webtools/managers/soupManager/asoueces.py,sha256=OaXqolZl0dI7b09NYwJ3Wnhuxf89ahZ1GjsOqy0GXfk,3506
45
- abstract_webtools/managers/soupManager/soupManager.py,sha256=75gwqVXIRwgVqzATBC-DiJF2AT_AdE6FSBWy3DbW5ZA,17393
45
+ abstract_webtools/managers/soupManager/soupManager.py,sha256=6vWYnZGuimStbNiuH_V4UMPRk1W0SZo_fZkh8f7fVmM,17938
46
46
  abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
47
47
  abstract_webtools/managers/urlManager/urlManager (Copy).py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
48
48
  abstract_webtools/managers/urlManager/urlManager.py,sha256=vY4KQXtcrlC2YtlultxQpVe581l5kAuT5VGA0WrI16g,8945
49
- abstract_webtools-0.1.6.145.dist-info/METADATA,sha256=7eU_thbiawnNyvNUcQOBHclY44_tH3DikGbdnllhtXE,7289
50
- abstract_webtools-0.1.6.145.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
51
- abstract_webtools-0.1.6.145.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
52
- abstract_webtools-0.1.6.145.dist-info/RECORD,,
49
+ abstract_webtools-0.1.6.146.dist-info/METADATA,sha256=TirJooA5jA2sXf7KpS0Hy24m4LpvMK2TmtFMbcTaH_E,7289
50
+ abstract_webtools-0.1.6.146.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
51
+ abstract_webtools-0.1.6.146.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
52
+ abstract_webtools-0.1.6.146.dist-info/RECORD,,
@@ -1,119 +0,0 @@
1
- import os
2
- from ..abstract_webtools import *
3
- from .urlManager import *
4
- from urllib.parse import urlparse
5
- from abstract_utilities import *
6
- from selenium import webdriver
7
- from selenium.webdriver.chrome.options import Options
8
- import logging
9
- import urllib3
10
-
11
- # Suppress urllib3 warnings and debug logs
12
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
13
- logging.getLogger("urllib3").setLevel(logging.WARNING)
14
-
15
- # Suppress Selenium logs
16
- logging.getLogger("selenium").setLevel(logging.WARNING)
17
-
18
- import os
19
- from selenium import webdriver
20
- from selenium.webdriver.chrome.options import Options
21
-
22
- # Setup Chrome options
23
- chrome_options = Options()
24
- #chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
25
- chrome_options.add_argument("--headless") # Run in headless mode
26
- chrome_options.add_argument("--no-sandbox")
27
- chrome_options.add_argument("--disable-dev-shm-usage")
28
- chrome_options.add_argument("--disable-gpu")
29
- chrome_options.add_argument("--disable-software-rasterizer")
30
- chrome_options.add_argument("--disable-extensions")
31
- chrome_options.add_argument("--remote-debugging-port=9222")
32
-
33
-
34
- class SingletonMeta(type):
35
- _instances = {}
36
- def __call__(cls, *args, **kwargs):
37
- if cls not in cls._instances:
38
- instance = super().__call__(*args, **kwargs)
39
- cls._instances[cls] = instance
40
- return cls._instances[cls]
41
-
42
- class seleniumManager(metaclass=SingletonMeta):
43
- def __init__(self, url):
44
- if not hasattr(self, 'initialized'): # Prevent reinitialization
45
- self.initialized = True
46
- parsed_url = urlparse(url)
47
- self.domain = parsed_url.netloc
48
- self.scheme = parsed_url.scheme
49
- self.base_url= f"{self.scheme}{self.domain}"
50
- self.site_dir = os.path.join(os.getcwd(), self.domain)
51
- os.makedirs(self.site_dir, exist_ok=True)
52
- self.drivers = {}
53
- self.page_type = []
54
-
55
- def get_url_to_path(self, url):
56
- url = eatAll(str(url), ['',' ','\n','\t','\\','/'])
57
- parsed_url = urlparse(url)
58
- if parsed_url.netloc == self.domain:
59
- paths = parsed_url.path.split('/')
60
- dir_path = self.site_dir
61
- for path in paths[:-1]:
62
- dir_path = os.path.join(dir_path, path)
63
- os.makedirs(dir_path, exist_ok=True)
64
- self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
65
-
66
- dir_path = os.path.join(dir_path, paths[-1])
67
- return dir_path
68
-
69
- def saved_url_check(self, url):
70
- path = self.get_url_to_path(url)
71
- return path
72
-
73
- def get_with_netloc(self, url):
74
- parsed_url = urlparse(url)
75
- if parsed_url.netloc == '':
76
- url = f"{self.scheme}://{self.domain}/{url.strip()}"
77
- return url
78
-
79
- def get_driver(self, url):
80
- if url and url not in self.drivers:
81
- chrome_options = Options()
82
- chrome_options.add_argument("--headless")
83
- driver = webdriver.Chrome(options=chrome_options)
84
- self.drivers[url] = driver
85
- driver.get(url)
86
- return self.drivers[url]
87
- def normalize_url(url, base_url=None):
88
- """
89
- Normalize and resolve relative URLs, ensuring proper domain and format.
90
- """
91
- # If URL starts with the base URL repeated, remove the extra part
92
- manager = seleniumManager(url)
93
- base_url = manager.base_url
94
- if url.startswith(base_url):
95
- url = url[len(base_url):]
96
-
97
- # Resolve the URL against the base URL
98
- normalized_url = urljoin(base_url, url.split('#')[0])
99
-
100
- # Ensure only URLs belonging to the base domain are kept
101
- if not normalized_url.startswith(base_url):
102
- return None
103
-
104
- return normalized_url
105
- # Function to get Selenium page source
106
- def get_selenium_source(url):
107
- url_mgr = urlManager(url)
108
- if url_mgr.url:
109
- url = str(url_mgr.url)
110
- manager = seleniumManager(url)
111
- driver = manager.get_driver(url)
112
- try:
113
- # Get page source
114
- page_source = driver.page_source
115
- return page_source
116
- finally:
117
- # Don't quit the driver unless you're done with all interactions
118
- pass
119
-