abstract-webtools 0.1.6.40__tar.gz → 0.1.6.41__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (44)
  1. {abstract_webtools-0.1.6.40/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.41}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/setup.py +1 -1
  3. abstract_webtools-0.1.6.41/src/abstract_webtools/managers/seleniumManager.py +119 -0
  4. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
  5. abstract_webtools-0.1.6.40/src/abstract_webtools/managers/seleniumManager.py +0 -128
  6. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/LICENSE +0 -0
  7. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/README.md +0 -0
  8. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/pyproject.toml +0 -0
  9. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/setup.cfg +0 -0
  10. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/__init__.py +0 -0
  11. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/abstract_webtools.py +0 -0
  12. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/big_user_agent_list.py +0 -0
  13. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/main.py +0 -0
  14. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/__init__.py +0 -0
  15. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/cipherManager.py +0 -0
  16. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/crawlManager.py +0 -0
  17. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
  18. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/curlMgr.py +0 -0
  19. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/domainManager.py +0 -0
  20. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  21. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/get_test.py +0 -0
  22. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
  23. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
  24. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  25. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/networkManager.py +0 -0
  26. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
  27. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/requestManager/requestManager.py +0 -0
  28. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
  29. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
  30. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
  31. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/sslManager.py +0 -0
  32. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  33. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
  34. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/urlManager/urlManager.py +0 -0
  35. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  36. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  37. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
  38. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/soup_gui.py +0 -0
  39. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/url_grabber.py +0 -0
  40. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools/url_grabber_new.py +0 -0
  41. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
  42. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  43. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools.egg-info/requires.txt +0 -0
  44. {abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.6.40/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.41}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: abstract_webtools
- Version: 0.1.6.40
+ Version: 0.1.6.41
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
{abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41}/setup.py
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
      long_description = fh.read()
  setuptools.setup(
      name='abstract_webtools',
-     version='0.1.6.40',
+     version='0.1.6.41',
      author='putkoff',
      author_email='partners@abstractendeavors.com',
      description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
abstract_webtools-0.1.6.41/src/abstract_webtools/managers/seleniumManager.py (new file)
@@ -0,0 +1,119 @@
+ import os
+ import logging
+ import urllib3
+ from urllib.parse import urlparse, urljoin
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from abstract_utilities import *
+ from ..abstract_webtools import *
+ from .urlManager import *
+
+ # Suppress urllib3 warnings and debug logs
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
+
+ # Suppress Selenium logs
+ logging.getLogger("selenium").setLevel(logging.WARNING)
+
+ # Module-level default Chrome options. Note: get_driver() below builds its own
+ # Options object, so these defaults apply only where chrome_options is reused.
+ chrome_options = Options()
+ chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
+ chrome_options.add_argument("--headless")  # Run in headless mode
+ chrome_options.add_argument("--no-sandbox")
+ chrome_options.add_argument("--disable-dev-shm-usage")
+ chrome_options.add_argument("--disable-gpu")
+ chrome_options.add_argument("--disable-software-rasterizer")
+ chrome_options.add_argument("--disable-extensions")
+ chrome_options.add_argument("--remote-debugging-port=9222")
+
+
+ class SingletonMeta(type):
+     _instances = {}
+     def __call__(cls, *args, **kwargs):
+         if cls not in cls._instances:
+             instance = super().__call__(*args, **kwargs)
+             cls._instances[cls] = instance
+         return cls._instances[cls]
+
+ class seleniumManager(metaclass=SingletonMeta):
+     def __init__(self, url):
+         if not hasattr(self, 'initialized'):  # Prevent reinitialization
+             self.initialized = True
+             parsed_url = urlparse(url)
+             self.domain = parsed_url.netloc
+             self.scheme = parsed_url.scheme
+             self.base_url = f"{self.scheme}://{self.domain}"
+             self.site_dir = os.path.join(os.getcwd(), self.domain)
+             os.makedirs(self.site_dir, exist_ok=True)
+             self.drivers = {}
+             self.page_type = []
+
+     def get_url_to_path(self, url):
+         # Map a URL on this manager's domain to a local file path under
+         # site_dir, creating intermediate directories as needed.
+         url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/'])
+         parsed_url = urlparse(url)
+         if parsed_url.netloc == self.domain:
+             paths = parsed_url.path.split('/')
+             dir_path = self.site_dir
+             for path in paths[:-1]:
+                 dir_path = os.path.join(dir_path, path)
+                 os.makedirs(dir_path, exist_ok=True)
+             self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if len(self.page_type) == 0 else self.page_type[-1])
+             dir_path = os.path.join(dir_path, paths[-1])
+             return dir_path
+
+     def saved_url_check(self, url):
+         path = self.get_url_to_path(url)
+         return path
+
+     def get_with_netloc(self, url):
+         parsed_url = urlparse(url)
+         if parsed_url.netloc == '':
+             url = f"{self.scheme}://{self.domain}/{url.strip()}"
+         return url
+
+     def get_driver(self, url):
+         # One cached headless Chrome driver per URL; drivers are never quit here.
+         if url and url not in self.drivers:
+             chrome_options = Options()
+             chrome_options.add_argument("--headless")
+             driver = webdriver.Chrome(options=chrome_options)
+             self.drivers[url] = driver
+             driver.get(url)
+         return self.drivers[url]
+
+ def normalize_url(url, base_url=None):
+     """
+     Normalize and resolve relative URLs, ensuring proper domain and format.
+     """
+     manager = seleniumManager(url)
+     if base_url is None:
+         base_url = manager.base_url
+
+     # If the URL starts with the base URL, strip it so urljoin re-resolves it
+     if url.startswith(base_url):
+         url = url[len(base_url):]
+
+     # Resolve the URL against the base URL, dropping any fragment
+     normalized_url = urljoin(base_url, url.split('#')[0])
+
+     # Ensure only URLs belonging to the base domain are kept
+     if not normalized_url.startswith(base_url):
+         return None
+
+     return normalized_url
+
+ # Function to get the Selenium page source
+ def get_selenium_source(url):
+     url_mgr = urlManager(url)
+     if url_mgr.url:
+         url = str(url_mgr.url)
+         manager = seleniumManager(url)
+         driver = manager.get_driver(url)
+         try:
+             return driver.page_source
+         finally:
+             # Don't quit the driver unless you're done with all interactions
+             pass
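For orientation, here is a minimal usage sketch of the module added above. It assumes abstract_webtools 0.1.6.41 installed as published, with the module importable at the path shown in the files table; the example URL is hypothetical.

    # Minimal sketch: fetch a rendered page via the new seleniumManager module.
    # get_selenium_source() caches one headless Chrome driver per URL inside the
    # singleton, so repeated calls for the same URL reuse the same browser.
    from abstract_webtools.managers.seleniumManager import (
        seleniumManager,
        get_selenium_source,
    )

    html = get_selenium_source("https://example.com/docs/index.html")  # hypothetical URL
    if html:
        print(html[:200])  # first 200 characters of the rendered source

    # The same singleton maps on-domain URLs to local save paths under ./<domain>/
    manager = seleniumManager("https://example.com")
    print(manager.saved_url_check("https://example.com/docs/index.html"))

Note that, unlike the 0.1.6.40 module removed below, this version never quits its cached drivers, so a long-running caller is responsible for calling driver.quit() itself.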
{abstract_webtools-0.1.6.40 → abstract_webtools-0.1.6.41/src/abstract_webtools.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: abstract_webtools
- Version: 0.1.6.40
+ Version: 0.1.6.41
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -1,128 +0,0 @@
1
- import os
2
- from urllib.parse import urlparse
3
- from selenium import webdriver
4
- from selenium.webdriver.chrome.options import Options
5
- from selenium.webdriver.chrome.service import Service
6
- from webdriver_manager.chrome import ChromeDriverManager # For automatic ChromeDriver installation
7
- import logging
8
- import urllib3
9
- from ..abstract_webtools import * # Assuming this is a valid import
10
- from .urlManager import *
11
-
12
- # Suppress urllib3 warnings and debug logs
13
- urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
14
- logging.getLogger("urllib3").setLevel(logging.WARNING)
15
- logging.getLogger("selenium").setLevel(logging.WARNING)
16
-
17
- # Default Chrome options (can be overridden)
18
- DEFAULT_CHROME_OPTIONS = [
19
- "--headless", # Run in headless mode
20
- "--no-sandbox",
21
- "--disable-dev-shm-usage", # Avoid memory issues on servers
22
- "--disable-gpu",
23
- "--disable-software-rasterizer",
24
- "--disable-extensions",
25
- "--remote-debugging-port=9222"
26
- ]
27
-
28
- class SingletonMeta(type):
29
- _instances = {}
30
- def __call__(cls, *args, **kwargs):
31
- if cls not in cls._instances:
32
- instance = super().__call__(*args, **kwargs)
33
- cls._instances[cls] = instance
34
- return cls._instances[cls]
35
-
36
- class SeleniumManager(metaclass=SingletonMeta):
37
- def __init__(self, url):
38
- if not hasattr(self, 'initialized'): # Prevent reinitialization
39
- self.initialized = True
40
- parsed_url = urlparse(url)
41
- self.domain = parsed_url.netloc
42
- self.scheme = parsed_url.scheme or "https" # Default to https if scheme is missing
43
- self.base_url = f"{self.scheme}://{self.domain}"
44
- self.site_dir = os.path.join(os.getcwd(), self.domain)
45
- os.makedirs(self.site_dir, exist_ok=True)
46
- self.drivers = {}
47
- self.page_type = []
48
-
49
- def get_url_to_path(self, url):
50
- url = eatAll(str(url), ['', ' ', '\n', '\t', '\\', '/']) # Assuming eatAll is defined elsewhere
51
- parsed_url = urlparse(url)
52
- if parsed_url.netloc == self.domain:
53
- paths = parsed_url.path.split('/')
54
- dir_path = self.site_dir
55
- for path in paths[:-1]:
56
- dir_path = os.path.join(dir_path, path)
57
- os.makedirs(dir_path, exist_ok=True)
58
- self.page_type.append(os.path.splitext(paths[-1])[-1] or 'html' if not self.page_type else self.page_type[-1])
59
- dir_path = os.path.join(dir_path, paths[-1])
60
- return dir_path
61
-
62
- def saved_url_check(self, url):
63
- return self.get_url_to_path(url)
64
-
65
- def get_with_netloc(self, url):
66
- parsed_url = urlparse(url)
67
- if not parsed_url.netloc:
68
- url = f"{self.scheme}://{self.domain}/{url.strip('/')}"
69
- return url
70
-
71
- def get_driver(self, url):
72
- if url and url not in self.drivers:
73
- # Set up Chrome options
74
- chrome_options = Options()
75
- for option in DEFAULT_CHROME_OPTIONS:
76
- chrome_options.add_argument(option)
77
-
78
- # Specify Chrome binary location if needed (optional, comment out if not applicable)
79
- # chrome_options.binary_location = "/home/profiles/solcatcher/.cache/selenium/chrome/linux64/130.0.6723.58/chrome"
80
-
81
- # Automatically install and use ChromeDriver
82
- service = Service(ChromeDriverManager().install())
83
- driver = webdriver.Chrome(service=service, options=chrome_options)
84
-
85
- self.drivers[url] = driver
86
- driver.get(url)
87
- return self.drivers[url]
88
-
89
- def quit_driver(self, url):
90
- """Clean up a specific driver instance."""
91
- if url in self.drivers:
92
- self.drivers[url].quit()
93
- del self.drivers[url]
94
-
95
- def quit_all_drivers(self):
96
- """Clean up all driver instances."""
97
- for driver in self.drivers.values():
98
- driver.quit()
99
- self.drivers.clear()
100
-
101
- def normalize_url(url, base_url=None):
102
- """Normalize and resolve relative URLs."""
103
- manager = SeleniumManager(url)
104
- base_url = manager.base_url if base_url is None else base_url
105
- if url.startswith(base_url):
106
- url = url[len(base_url):]
107
- normalized_url = urljoin(base_url, url.split('#')[0])
108
- if not normalized_url.startswith(base_url):
109
- return None
110
- return normalized_url
111
-
112
- def get_selenium_source(url):
113
- """Fetch page source using Selenium."""
114
- url_mgr = urlManager(url) # Assuming urlManager is defined elsewhere
115
- if url_mgr.url:
116
- url = str(url_mgr.url)
117
- manager = SeleniumManager(url)
118
- driver = manager.get_driver(url)
119
- try:
120
- return driver.page_source
121
- except Exception as e:
122
- logging.error(f"Error fetching page source for {url}: {e}")
123
- return None
124
- # Note: Driver is not quit here to maintain Singleton behavior
125
-
126
- # Ensure cleanup on program exit (optional)
127
- import atexit
128
- atexit.register(lambda: SeleniumManager(url="").quit_all_drivers()) # Cleanup all drivers on exit
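Taken together, the two seleniumManager.py hunks mean 0.1.6.41 drops the webdriver-manager based ChromeDriver installation, the quit_driver/quit_all_drivers helpers, and the atexit hook that 0.1.6.40 registered. A caller that relied on that exit-time cleanup can restore it from the outside; the following is a sketch under the assumption that the 0.1.6.41 module shown above is in use (the _quit_cached_drivers helper is ours, not part of the package):

    # Sketch: re-register the exit-time driver cleanup that 0.1.6.40 provided.
    # seleniumManager.drivers is the url -> webdriver cache that 0.1.6.41
    # never quits on its own.
    import atexit
    from abstract_webtools.managers.seleniumManager import seleniumManager

    def _quit_cached_drivers():
        # SingletonMeta stores the sole instance keyed by class; class attribute
        # lookup falls through to the metaclass, so _instances is reachable here.
        instance = seleniumManager._instances.get(seleniumManager)
        if instance is not None:
            for driver in instance.drivers.values():
                driver.quit()
            instance.drivers.clear()

    atexit.register(_quit_cached_drivers)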