abstract-webtools 0.1.5.90__tar.gz → 0.1.5.92__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {abstract_webtools-0.1.5.90/src/abstract_webtools.egg-info → abstract_webtools-0.1.5.92}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/setup.py +1 -1
  3. abstract_webtools-0.1.5.92/src/abstract_webtools/managers/domainManager.py +100 -0
  4. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/requestManager.py +5 -6
  5. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/seleniumManager.py +4 -1
  6. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
  7. abstract_webtools-0.1.5.90/src/abstract_webtools/managers/domainManager.py +0 -48
  8. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/LICENSE +0 -0
  9. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/README.md +0 -0
  10. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/pyproject.toml +0 -0
  11. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/setup.cfg +0 -0
  12. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/__init__.py +0 -0
  13. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/abstract_webtools.py +0 -0
  14. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/big_user_agent_list.py +0 -0
  15. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/main.py +0 -0
  16. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/__init__.py +0 -0
  17. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/cipherManager.py +0 -0
  18. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/crawlManager.py +0 -0
  19. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  20. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/linkManager.py +0 -0
  21. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  22. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/networkManager.py +0 -0
  23. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/soupManager.py +0 -0
  24. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/sslManager.py +0 -0
  25. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  26. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/urlManager.py +0 -0
  27. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  28. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  29. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/soup_gui.py +0 -0
  30. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/url_grabber.py +0 -0
  31. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools/url_grabber_new.py +0 -0
  32. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
  33. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  34. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools.egg-info/requires.txt +0 -0
  35. {abstract_webtools-0.1.5.90 → abstract_webtools-0.1.5.92}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: abstract_webtools
- Version: 0.1.5.90
+ Version: 0.1.5.92
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
      long_description = fh.read()
  setuptools.setup(
      name='abstract_webtools',
-     version='0.1.5.90',
+     version='0.1.5.92',
      author='putkoff',
      author_email='partners@abstractendeavors.com',
      description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
@@ -0,0 +1,100 @@
+ from abstract_webtools import *
+ import os
+ from abstract_webtools import *
+ #from .urlManager import *
+ from urllib.parse import urlparse
+ from abstract_utilities import *
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ import logging
+ import urllib3
+
+ # Suppress urllib3 warnings and debug logs
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+ # Suppress Selenium logs
+ logging.getLogger("selenium").setLevel(logging.WARNING)
+
+ import os
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+
+ # Setup Chrome options
+ chrome_options = Options()
+ chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
+ chrome_options.add_argument("--headless")  # Run in headless mode
+ chrome_options.add_argument("--no-sandbox")
+ chrome_options.add_argument("--disable-dev-shm-usage")
+ chrome_options.add_argument("--disable-gpu")
+ chrome_options.add_argument("--disable-software-rasterizer")
+ chrome_options.add_argument("--disable-extensions")
+ chrome_options.add_argument("--remote-debugging-port=9222")
+
+
+ class domainManager(metaclass=SingletonMeta):
+     def __init__(self, url):
+         if not hasattr(self, 'initialized'):  # Prevent reinitialization
+             self.initialized = True
+             parsed_url = urlparse(url)
+             self.domain = parsed_url.netloc
+             self.scheme = parsed_url.scheme
+             self.site_dir = os.path.join(os.getcwd(), self.domain)
+             os.makedirs(self.site_dir, exist_ok=True)
+             self.drivers = {}
+             self.page_type = []
+     def get_url_to_path(self, url):
+         url = eatAll(str(url),['',' ','\n','\t','\\','/'])
+         parsed_url = urlparse(url)
+         if 'data:image' in url:
+             input(url)
+         if parsed_url.netloc == self.domain:
+             paths = parsed_url.path.split('/')
+             dir_path =self.site_dir
+             for path in paths[:-1]:
+                 dir_path = os.path.join(dir_path, path)
+                 os.makedirs(dir_path, exist_ok=True)
+             #if 'svg' in url:
+             #$ input(url)
+             # dir_path = get_image_name('contents',directory=dir_path,ext='png',url=item_url)
+
+
+             self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
+
+             dir_path = os.path.join(dir_path, paths[-1])
+             return dir_path
+
+     def saved_url_check(self, url):
+
+         path = self.get_url_to_path(url)
+         return path
+
+     def get_with_netloc(self, url):
+         parsed_url = urlparse(url)
+         if parsed_url.netloc == '':
+             url = f"{self.scheme}://{self.domain}/{url.strip()}"
+         return url
+
+     def get_driver(self, url):
+         if url and url not in self.drivers:
+             chrome_options = Options()
+             chrome_options.add_argument("--headless")
+             driver = webdriver.Chrome(options=chrome_options)
+             self.drivers[url] = driver
+             driver.get(url)
+         return self.drivers[url]
+ def get_selenium_source(url):
+     url_mgr = urlManager(url)
+     if url_mgr.url:
+         url = str(url_mgr.url)
+         manager = domainManager(url)
+         driver = manager.get_driver(url)
+         try:
+             # Get page source
+             page_source = driver.page_source
+             return page_source
+         finally:
+             # Don't quit the driver unless you're done with all interactions
+             pass
+ driver = get_selenium_source('http://solpump.io/')
+ input(driver)
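The new domainManager.py above combines two roles: a SingletonMeta-backed class that mirrors same-domain URL paths onto a local directory tree (rooted at ./<domain>/ under the current working directory) and a per-URL cache of headless Chrome drivers, plus a module-level get_selenium_source helper. Note that, as published, the file also executes get_selenium_source('http://solpump.io/') and input(driver) at import time, so importing the module will launch a browser and block on stdin. A minimal usage sketch, assuming the names import as shown in the hunk; the example URL is hypothetical:

    from abstract_webtools.managers.domainManager import domainManager, get_selenium_source

    # First construction wins; SingletonMeta returns this same instance afterwards.
    mgr = domainManager("https://example.com/docs/index.html")

    # Maps a same-domain URL to ./example.com/docs/page.html,
    # creating intermediate directories as a side effect.
    local_path = mgr.get_url_to_path("https://example.com/docs/page.html")

    # Relative URLs are re-rooted on the singleton's scheme and domain.
    absolute = mgr.get_with_netloc("assets/logo.png")

    # Rendered HTML via the cached headless driver for this URL.
    html = get_selenium_source("https://example.com/docs/page.html")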
@@ -1,11 +1,12 @@
  from ..abstract_webtools import *
- from .urlManager import *
+
  from .userAgentManager import *
  from .cipherManager import *
  from .sslManager import *
  from .tlsAdapter import *
  from .networkManager import *
  from .seleniumManager import *
+ from .urlManager import *
  class requestManager:
      """
      SafeRequest is a class for making HTTP requests with error handling and retries.
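Moving from .urlManager import * to after from .seleniumManager import * changes module initialization order; a plausible reading (an assumption, the diff does not say) is that it breaks a circular import between urlManager and the other manager modules. A hypothetical two-module illustration of the failure mode such a reordering avoids:

    # a.py
    import b          # b starts importing...
    VALUE = 1

    # b.py
    import a          # ...and re-enters a while it is still initializing;
    print(a.VALUE)    # AttributeError: a has not yet bound VALUE.

Deferring the dependent import until the modules it needs are fully loaded sidesteps this.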
@@ -340,12 +341,10 @@ class SafeRequestSingleton:
          elif SafeRequestSingleton._instance.url != url or SafeRequestSingleton._instance.headers != headers or SafeRequestSingleton._instance.max_retries != max_retries or SafeRequestSingleton._instance.request_wait_limit != request_wait_limit:
              SafeRequestSingleton._instance = SafeRequest(url,url_mgr=urlManagerSingleton,headers=headers,max_retries=max_retries,last_request_time=last_request_time,request_wait_limit=request_wait_limit)
          return SafeRequestSingleton._instance
- def get_req_mgr(req_mgr=None,url=None,url_mgr=None,source_code=None):
-     if req_mgr:
-         url_mgr = req_mgr.url_mgr
+ def get_req_mgr(url=None,url_mgr=None,source_code=None):
      url = get_url(url=url,url_mgr=url_mgr)
-     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
-     req_mgr = req_mgr or requestManager(url_mgr=url_mgr,url=url,source_code=source_code)
+     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr )
+     req_mgr = requestManager(url_mgr=url_mgr,url=url,source_code=source_code)
      return req_mgr
  def get_source(url=None,url_mgr=None,source_code=None):
      # Placeholder for actual implementation.
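The get_req_mgr change drops the req_mgr parameter: the helper no longer reuses a passed-in manager and now always constructs a fresh requestManager from a URL or urlManager. A sketch of the new call shape, assuming get_url and get_url_mgr resolve as in the hunk above; the URL is hypothetical:

    from abstract_webtools.managers.requestManager import get_req_mgr

    # Pass a plain URL; get_url/get_url_mgr derive the urlManager internally.
    req_mgr = get_req_mgr(url="https://example.com")

    # Or pass a pre-built urlManager. Either way a new requestManager is
    # returned, so callers that previously passed req_mgr= to reuse an
    # existing instance must now hold their own reference instead.
    # req_mgr = get_req_mgr(url_mgr=my_url_mgr)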
@@ -39,7 +39,7 @@ class SingletonMeta(type):
          cls._instances[cls] = instance
      return cls._instances[cls]

- class domainManager(metaclass=SingletonMeta):
+ class seleneumManager(metaclass=SingletonMeta):
      def __init__(self, url):
          if not hasattr(self, 'initialized'):  # Prevent reinitialization
              self.initialized = True
@@ -77,6 +77,8 @@ class domainManager(metaclass=SingletonMeta):

      def get_driver(self, url):
          if url and url not in self.drivers:
+             chrome_options = Options()
+             chrome_options.add_argument("--headless")
              driver = webdriver.Chrome(options=chrome_options)
              self.drivers[url] = driver
              driver.get(url)
@@ -96,3 +98,4 @@ def get_selenium_source(url):
          finally:
              # Don't quit the driver unless you're done with all interactions
              pass
+
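In seleniumManager.py the class is renamed from domainManager to seleneumManager (spelling as published), and get_driver now builds its own headless Options instead of relying on a module-level chrome_options. A standalone sketch of the resulting per-URL driver cache; the class name and the close_all helper are hypothetical additions for illustration:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    class DriverCache:
        def __init__(self):
            self.drivers = {}

        def get_driver(self, url):
            # One headless Chrome per URL; repeat calls reuse the cached driver.
            if url and url not in self.drivers:
                chrome_options = Options()
                chrome_options.add_argument("--headless")
                driver = webdriver.Chrome(options=chrome_options)
                self.drivers[url] = driver
                driver.get(url)
            return self.drivers[url]

        def close_all(self):
            # The published code never quits its drivers; a caller-side
            # cleanup hook like this avoids leaking Chrome processes.
            for driver in self.drivers.values():
                driver.quit()
            self.drivers.clear()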
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: abstract_webtools
- Version: 0.1.5.90
+ Version: 0.1.5.92
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -1,48 +0,0 @@
- from ..abstract_webtools import *
- class domainManager(metaclass=SingletonMeta):
-     def __init__(self, url):
-         if not hasattr(self, 'initialized'):  # Prevent reinitialization
-             self.initialized = True
-             parsed_url = urlparse(url)
-             self.domain = parsed_url.netloc
-             self.scheme = parsed_url.scheme
-     def get_url_to_path(self, url):
-         url = eatAll(str(url),['',' ','\n','\t','\\','/'])
-         parsed_url = urlparse(url)
-         if 'data:image' in url:
-             input(url)
-         if parsed_url.netloc == self.domain:
-             paths = parsed_url.path.split('/')
-             dir_path =self.site_dir
-             for path in paths[:-1]:
-                 dir_path = os.path.join(dir_path, path)
-                 os.makedirs(dir_path, exist_ok=True)
-             #if 'svg' in url:
-             #$ input(url)
-             # dir_path = get_image_name('contents',directory=dir_path,ext='png',url=item_url)
-
-
-             self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
-
-             dir_path = os.path.join(dir_path, paths[-1])
-             return dir_path
-
-     def saved_url_check(self, url):
-
-         path = self.get_url_to_path(url)
-         return path
-
-     def get_with_netloc(self, url):
-         parsed_url = urlparse(url)
-         if parsed_url.netloc == '':
-             url = f"{self.scheme}://{self.domain}/{url.strip()}"
-         return url
-
-     def get_driver(self, url):
-         if url and url not in self.drivers:
-             chrome_options = Options()
-             chrome_options.add_argument("--headless")
-             driver = webdriver.Chrome(options=chrome_options)
-             self.drivers[url] = driver
-             driver.get(url)
-         return self.drivers[url]