abstract-webtools 0.1.6.18__tar.gz → 0.1.6.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/LICENSE +0 -0
  2. {abstract_webtools-0.1.6.18/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.19}/PKG-INFO +1 -1
  3. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/README.md +0 -0
  4. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/pyproject.toml +0 -0
  5. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/setup.cfg +0 -0
  6. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/setup.py +1 -1
  7. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/__init__.py +0 -0
  8. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/abstract_webtools.py +0 -0
  9. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/big_user_agent_list.py +0 -0
  10. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/main.py +0 -0
  11. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/__init__.py +0 -0
  12. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/cipherManager.py +0 -0
  13. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/crawlManager.py +116 -2
  14. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/domainManager.py +0 -0
  15. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  16. abstract_webtools-0.1.6.19/src/abstract_webtools/managers/get_test.py +39 -0
  17. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/linkManager.py +0 -0
  18. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  19. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/networkManager.py +0 -0
  20. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/requestManager.py +0 -0
  21. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/seleniumManager.py +0 -0
  22. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/soupManager.py +0 -0
  23. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/sslManager.py +0 -0
  24. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  25. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/urlManager.py +0 -0
  26. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  27. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/managers/videoDownloader.py +2 -1
  28. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/soup_gui.py +0 -0
  29. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/url_grabber.py +0 -0
  30. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools/url_grabber_new.py +0 -0
  31. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
  32. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools.egg-info/SOURCES.txt +1 -0
  33. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  34. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools.egg-info/requires.txt +0 -0
  35. {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.19}/src/abstract_webtools.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: abstract_webtools
- Version: 0.1.6.18
+ Version: 0.1.6.19
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
setup.py
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
      long_description = fh.read()
  setuptools.setup(
      name='abstract_webtools',
-     version='0.1.6.18',
+     version='0.1.6.19',
      author='putkoff',
      author_email='partners@abstractendeavors.com',
      description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
src/abstract_webtools/managers/crawlManager.py
@@ -1,8 +1,122 @@
- from .soupManager import * #.soupManager import *
+ from abstract_webtools import * #.soupManager import *
  from urllib.parse import urlparse, urljoin
  import os
  import xml.etree.ElementTree as ET
  from bs4 import BeautifulSoup
+ import requests
+ from urllib.parse import urlparse, urljoin
+ from bs4 import BeautifulSoup
+ import xml.etree.ElementTree as ET
+ import time
+
+ class SitemapGenerator:
+     def __init__(self, base_url):
+         self.base_url = base_url.rstrip('/')
+         self.visited = set()    # Track visited URLs
+         self.sitemap_data = {}  # Store URL metadata including images and documents
+
+     def crawl(self, url, max_depth=3, depth=1):
+         """Recursively crawl website and collect internal URLs, images, and documents."""
+         if depth > max_depth or url in self.visited:
+             return
+
+         print(f"Crawling: {url}")
+         self.visited.add(url)
+
+         try:
+             response = requests.get(url)
+
+             if response.status_code == 200:
+                 soup = get_all_attribute_values(url)
+                 input(soup)
+                 # Initialize data storage for this URL
+                 self.sitemap_data[url] = {
+                     'images': [],
+                     'documents': [],
+                     'changefreq': 'weekly',
+                     'priority': '0.5',
+                     'lastmod': time.strftime('%Y-%m-%d')
+                 }
+
+                 # Extract images
+                 images = [img.get('src') for img in soup.find_all('img', src=True)]
+                 images = [urljoin(url, img) for img in images]
+                 images = [img for img in images if self.is_internal_url(img)]
+                 self.sitemap_data[url]['images'].extend(images)
+
+                 # Extract documents (e.g., PDFs, DOCs)
+                 documents = []
+                 for link in soup.find_all('a', href=True):
+                     href = link['href']
+                     full_url = urljoin(url, href)
+                     if self.is_internal_url(full_url):
+                         if any(full_url.endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx']):
+                             documents.append(full_url)
+                         else:
+                             if full_url not in self.visited:
+                                 self.crawl(full_url, max_depth, depth + 1)
+                 self.sitemap_data[url]['documents'].extend(documents)
+
+                 # Extract and crawl internal links
+                 for link in soup.find_all('a', href=True):
+                     href = link['href']
+                     full_url = urljoin(url, href)
+                     if self.is_internal_url(full_url) and full_url not in self.visited:
+                         self.crawl(full_url, max_depth, depth + 1)
+
+         except Exception as e:
+             print(f"Error crawling {url}: {e}")
+
+     def is_internal_url(self, url):
+         """Check if URL is within the same domain."""
+         parsed_url = urlparse(url)
+         base_parsed_url = urlparse(self.base_url)
+         return (parsed_url.netloc == base_parsed_url.netloc or parsed_url.netloc == '') and not parsed_url.scheme.startswith('mailto')
+
+     def generate_sitemap_xml(self):
+         """Generate XML for the sitemap including URLs, images, and documents."""
+         NSMAP = {
+             None: "http://www.sitemaps.org/schemas/sitemap/0.9",
+             'image': "http://www.google.com/schemas/sitemap-image/1.1"
+         }
+         urlset = ET.Element("urlset", xmlns=NSMAP[None], attrib={'xmlns:image': NSMAP['image']})
+
+         for url, data in self.sitemap_data.items():
+             url_element = ET.SubElement(urlset, "url")
+             ET.SubElement(url_element, "loc").text = url
+             ET.SubElement(url_element, "lastmod").text = data['lastmod']
+             ET.SubElement(url_element, "changefreq").text = data['changefreq']
+             ET.SubElement(url_element, "priority").text = data['priority']
+
+             # Add images
+             for img_url in data['images']:
+                 image_element = ET.SubElement(url_element, "{http://www.google.com/schemas/sitemap-image/1.1}image")
+                 ET.SubElement(image_element, "{http://www.google.com/schemas/sitemap-image/1.1}loc").text = img_url
+
+             # Add documents as separate URLs
+             for doc_url in data['documents']:
+                 doc_element = ET.SubElement(urlset, "url")
+                 ET.SubElement(doc_element, "loc").text = doc_url
+                 ET.SubElement(doc_element, "lastmod").text = data['lastmod']
+                 ET.SubElement(doc_element, "changefreq").text = data['changefreq']
+                 ET.SubElement(doc_element, "priority").text = data['priority']
+
+         # Write to sitemap.xml
+         tree = ET.ElementTree(urlset)
+         tree.write("sitemap.xml", encoding="utf-8", xml_declaration=True)
+         print("Sitemap generated and saved as sitemap.xml")
+
+     def run(self):
+         """Run the sitemap generator."""
+         self.crawl(self.base_url)
+         self.generate_sitemap_xml()
+
+ # Example usage:
+ if __name__ == "__main__":
+     base_url = 'https://pump.fun' # Replace with your website URL
+     input(linkManager(base_url).find_all_domain())
+     generator = SitemapGenerator(base_url)
+     generator.run()

  class crawlManager:
      def __init__(self, url=None, req_mgr=None, url_mgr=None, source_code=None, parse_type="html.parser"):
@@ -12,7 +126,7 @@ class crawlManager:
          self.url_mgr = url_mgr or urlManager(url=self.url)
          self.req_mgr = req_mgr or requestManager(url_mgr=self.url_mgr)
          self.get_new_source_and_url(url)
-
+         self.sime_map_mgr = SitemapGenerator(self.url_mgr.domain)
      def get_new_source_and_url(self, url=None):
          """Fetches new source code and response for a given URL."""
          url = url
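
For orientation, here is a minimal usage sketch of the new SitemapGenerator added above. The import path and the target URL are assumptions for illustration (the class ships inside crawlManager.py per this diff); max_depth=2 simply shows overriding the default crawl depth of 3.

# Minimal sketch, not part of the package: import path and URL are assumptions.
from abstract_webtools.managers.crawlManager import SitemapGenerator

generator = SitemapGenerator('https://example.com')  # placeholder site
generator.crawl(generator.base_url, max_depth=2)     # shallower than the default max_depth=3
generator.generate_sitemap_xml()                     # writes sitemap.xml to the working directory
# generator.run() is equivalent to crawl() at the default depth followed by generate_sitemap_xml().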
src/abstract_webtools/managers/get_test.py (new file)
@@ -0,0 +1,39 @@
+ from abstract_webtools import *
+
+
+
+ url = 'https://pump.fun'
+ input()
+ print('starting')
+ #SELENIUM/DOMAIN MANAGER
+ domain_mgr = seleniumManager(url)
+ print(domain_mgr.domain)
+
+ #URL MANAGER
+ url_mgr = urlManager(url=domain_mgr.domain)
+ url = url_mgr.url
+ print(url)
+
+ #REQUEST MANAGER
+ req_mgr = requestManager(url=url,url_mgr=url_mgr)
+ source_code = req_mgr.source_code
+ print(source_code)
+
+ #SOUP MANAGER
+ soup_mgr = soupManager(url_mgr=url_mgr,req_mgr=req_mgr)
+ soup = soup_mgr.soup
+ print(soup)
+ all_attributes = soup_mgr.get_all_attribute_values()
+ print(all_attributes)
+
+ #LINK MANAGER
+ link_mgr = linkManager(url)
+ all_domains = link_mgr.find_all_domain()
+ print(all_domains)
+
+ all_desired_links = link_mgr.all_desired_links
+ print(all_desired_links)
+
+ all_desired_image_links = link_mgr.all_desired_image_links
+ print(all_desired_image_links)
+
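
The new get_test.py exercises the managers in dependency order: seleniumManager resolves the domain, urlManager normalizes it, requestManager fetches the source, soupManager parses it, and linkManager extracts links. Note the bare input() near the top blocks the script until Enter is pressed. A non-interactive sketch of the same flow, assuming only the attributes the script itself relies on (.domain, .url, .source_code, .soup):

# Condensed, non-blocking variant of get_test.py; the URL is a placeholder.
from abstract_webtools import *

url = 'https://example.com'
domain_mgr = seleniumManager(url)                           # resolve the domain
url_mgr = urlManager(url=domain_mgr.domain)                 # normalize the URL
req_mgr = requestManager(url=url_mgr.url, url_mgr=url_mgr)  # fetch the page source
soup_mgr = soupManager(url_mgr=url_mgr, req_mgr=req_mgr)    # parse into a soup
link_mgr = linkManager(url_mgr.url)                         # collect links
print(link_mgr.find_all_domain())                           # domains referenced by the page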
src/abstract_webtools/managers/videoDownloader.py
@@ -1,4 +1,5 @@
  import os
+ from .soupManager import *
  class VideoDownloader:
      """
      VideoDownloader is a class for downloading videos from URLs using YouTube-DL.
@@ -188,7 +189,7 @@ class VideoDownloader:
          self.start()

      def start(self):
-         download_thread = self.thread_manager.add_thread(name='download_thread',target=self.download)
+         download_thread = self.thread_manager.add_thread(name='download_thread',target_function=self.download)
          monitor_thread = self.thread_manager.add_thread(name='monitor_thread',target_function=self.monitor)
          self.thread_manager.start(name='download_thread')
          self.thread_manager.start(name='monitor_thread')
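
The start() change is a keyword fix: the first add_thread call passed target= while the neighboring monitor_thread call already used target_function=, which is evidently the keyword the thread manager expects. A hypothetical stand-in showing why the old call would fail (the real add_thread signature is not part of this diff):

# Hypothetical stand-in for the thread manager's add_thread, illustration only.
def add_thread(name, target_function):
    return (name, target_function)

def download():
    pass

add_thread(name='download_thread', target_function=download)  # matches the expected keyword
# add_thread(name='download_thread', target=download)         # would raise:
#   TypeError: add_thread() got an unexpected keyword argument 'target'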
src/abstract_webtools.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: abstract_webtools
- Version: 0.1.6.18
+ Version: 0.1.6.19
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
src/abstract_webtools.egg-info/SOURCES.txt
@@ -20,6 +20,7 @@ src/abstract_webtools/managers/cipherManager.py
  src/abstract_webtools/managers/crawlManager.py
  src/abstract_webtools/managers/domainManager.py
  src/abstract_webtools/managers/dynamicRateLimiter.py
+ src/abstract_webtools/managers/get_test.py
  src/abstract_webtools/managers/linkManager.py
  src/abstract_webtools/managers/mySocketClient.py
  src/abstract_webtools/managers/networkManager.py