abstract-webtools 0.1.6.18__tar.gz → 0.1.6.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/LICENSE +0 -0
- {abstract_webtools-0.1.6.18/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.20}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/README.md +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/pyproject.toml +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/setup.cfg +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/setup.py +1 -1
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/__init__.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/abstract_webtools.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/__init__.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/cipherManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/crawlManager.py +117 -3
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/domainManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
- abstract_webtools-0.1.6.20/src/abstract_webtools/managers/get_test.py +39 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/linkManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/mySocketClient.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/networkManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/requestManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/seleniumManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/soupManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/sslManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/urlManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/userAgentManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/videoDownloader.py +2 -1
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/url_grabber.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/SOURCES.txt +1 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/requires.txt +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/LICENSE
RENAMED
File without changes
{abstract_webtools-0.1.6.18/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.20}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.6.18
+Version: 0.1.6.20
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/README.md
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/pyproject.toml
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/setup.cfg
RENAMED
File without changes
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/setup.py
RENAMED
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.18',
+    version='0.1.6.20',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/__init__.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/abstract_webtools.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/big_user_agent_list.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/main.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/__init__.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/cipherManager.py
RENAMED
File without changes
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/crawlManager.py
RENAMED
@@ -1,8 +1,122 @@
-from
+from abstract_webtools import * #.soupManager import *
 from urllib.parse import urlparse, urljoin
 import os
 import xml.etree.ElementTree as ET
 from bs4 import BeautifulSoup
+import requests
+from urllib.parse import urlparse, urljoin
+from bs4 import BeautifulSoup
+import xml.etree.ElementTree as ET
+import time
+
+class SitemapGenerator:
+    def __init__(self, base_url):
+        self.base_url = base_url.rstrip('/')
+        self.visited = set()  # Track visited URLs
+        self.sitemap_data = {}  # Store URL metadata including images and documents
+
+    def crawl(self, url, max_depth=3, depth=1):
+        """Recursively crawl website and collect internal URLs, images, and documents."""
+        if depth > max_depth or url in self.visited:
+            return
+
+        print(f"Crawling: {url}")
+        self.visited.add(url)
+
+        try:
+            response = requests.get(url)
+
+            if response.status_code == 200:
+                soup = get_all_attribute_values(url)
+                input(soup)
+                # Initialize data storage for this URL
+                self.sitemap_data[url] = {
+                    'images': [],
+                    'documents': [],
+                    'changefreq': 'weekly',
+                    'priority': '0.5',
+                    'lastmod': time.strftime('%Y-%m-%d')
+                }
+
+                # Extract images
+                images = [img.get('src') for img in soup.find_all('img', src=True)]
+                images = [urljoin(url, img) for img in images]
+                images = [img for img in images if self.is_internal_url(img)]
+                self.sitemap_data[url]['images'].extend(images)
+
+                # Extract documents (e.g., PDFs, DOCs)
+                documents = []
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    full_url = urljoin(url, href)
+                    if self.is_internal_url(full_url):
+                        if any(full_url.endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx']):
+                            documents.append(full_url)
+                        else:
+                            if full_url not in self.visited:
+                                self.crawl(full_url, max_depth, depth + 1)
+                self.sitemap_data[url]['documents'].extend(documents)
+
+                # Extract and crawl internal links
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    full_url = urljoin(url, href)
+                    if self.is_internal_url(full_url) and full_url not in self.visited:
+                        self.crawl(full_url, max_depth, depth + 1)
+
+        except Exception as e:
+            print(f"Error crawling {url}: {e}")
+
+    def is_internal_url(self, url):
+        """Check if URL is within the same domain."""
+        parsed_url = urlparse(url)
+        base_parsed_url = urlparse(self.base_url)
+        return (parsed_url.netloc == base_parsed_url.netloc or parsed_url.netloc == '') and not parsed_url.scheme.startswith('mailto')
+
+    def generate_sitemap_xml(self):
+        """Generate XML for the sitemap including URLs, images, and documents."""
+        NSMAP = {
+            None: "http://www.sitemaps.org/schemas/sitemap/0.9",
+            'image': "http://www.google.com/schemas/sitemap-image/1.1"
+        }
+        urlset = ET.Element("urlset", xmlns=NSMAP[None], attrib={'xmlns:image': NSMAP['image']})
+
+        for url, data in self.sitemap_data.items():
+            url_element = ET.SubElement(urlset, "url")
+            ET.SubElement(url_element, "loc").text = url
+            ET.SubElement(url_element, "lastmod").text = data['lastmod']
+            ET.SubElement(url_element, "changefreq").text = data['changefreq']
+            ET.SubElement(url_element, "priority").text = data['priority']
+
+            # Add images
+            for img_url in data['images']:
+                image_element = ET.SubElement(url_element, "{http://www.google.com/schemas/sitemap-image/1.1}image")
+                ET.SubElement(image_element, "{http://www.google.com/schemas/sitemap-image/1.1}loc").text = img_url
+
+            # Add documents as separate URLs
+            for doc_url in data['documents']:
+                doc_element = ET.SubElement(urlset, "url")
+                ET.SubElement(doc_element, "loc").text = doc_url
+                ET.SubElement(doc_element, "lastmod").text = data['lastmod']
+                ET.SubElement(doc_element, "changefreq").text = data['changefreq']
+                ET.SubElement(doc_element, "priority").text = data['priority']
+
+        # Write to sitemap.xml
+        tree = ET.ElementTree(urlset)
+        tree.write("sitemap.xml", encoding="utf-8", xml_declaration=True)
+        print("Sitemap generated and saved as sitemap.xml")
+
+    def run(self):
+        """Run the sitemap generator."""
+        self.crawl(self.base_url)
+        self.generate_sitemap_xml()
+
+# Example usage:
+if __name__ == "__main__":
+    base_url = 'https://pump.fun'  # Replace with your website URL
+    input(linkManager(base_url).find_all_domain())
+    generator = SitemapGenerator(base_url)
+    generator.run()
 
 class crawlManager:
     def __init__(self, url=None, req_mgr=None, url_mgr=None, source_code=None, parse_type="html.parser"):
@@ -12,7 +126,7 @@ class crawlManager:
         self.url_mgr = url_mgr or urlManager(url=self.url)
         self.req_mgr = req_mgr or requestManager(url_mgr=self.url_mgr)
         self.get_new_source_and_url(url)
-
+        self.sime_map_mgr = SitemapGenerator(self.url_mgr.domain)
     def get_new_source_and_url(self, url=None):
         """Fetches new source code and response for a given URL."""
         url = url
@@ -158,7 +272,7 @@ def get_crawl_mgr(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type
     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
     url = get_url(url=url,url_mgr=url_mgr)
     req_mgr=get_req_mgr(url=url,url_mgr=url_mgr,source_code=source_code)
-    source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code
+    source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code)
     soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,parse_type=parse_type)
     crawl_mgr = crawlManager(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
     return crawl_mgr
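The bulk of this release is the SitemapGenerator class added at the top of crawlManager.py, plus the sime_map_mgr attribute that crawlManager now builds from the URL manager's domain. A minimal sketch of driving the new class directly follows; the import path is assumed from the file location shown above and the target URL is a placeholder. Note that crawl() as released still calls the module-level get_all_attribute_values() helper and an interactive input(), so it will pause (or fail if that helper is not exported) until those debug calls are removed.

    # Sketch only: assumes SitemapGenerator is importable from the module shown in this diff.
    from abstract_webtools.managers.crawlManager import SitemapGenerator

    generator = SitemapGenerator("https://example.com")  # placeholder site, not from the package
    generator.crawl(generator.base_url, max_depth=2)      # walk internal pages, images, documents
    generator.generate_sitemap_xml()                      # writes sitemap.xml to the working directory

generator.run() performs the same crawl-then-write sequence with the default depth of 3.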
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/domainManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/dynamicRateLimiter.py
RENAMED
File without changes
abstract_webtools-0.1.6.20/src/abstract_webtools/managers/get_test.py
ADDED
@@ -0,0 +1,39 @@
+from abstract_webtools import *
+
+
+
+url = 'https://pump.fun'
+input()
+print('starting')
+#SELENIUM/DOMAIN MANAGER
+domain_mgr = seleniumManager(url)
+print(domain_mgr.domain)
+
+#URL MANAGER
+url_mgr = urlManager(url=domain_mgr.domain)
+url = url_mgr.url
+print(url)
+
+#REQUEST MANAGER
+req_mgr = requestManager(url=url,url_mgr=url_mgr)
+source_code = req_mgr.source_code
+print(source_code)
+
+#SOUP MANAGER
+soup_mgr = soupManager(url_mgr=url_mgr,req_mgr=req_mgr)
+soup = soup_mgr.soup
+print(soup)
+all_attributes = soup_mgr.get_all_attribute_values()
+print(all_attributes)
+
+#LINK MANAGER
+link_mgr = linkManager(url)
+all_domains = link_mgr.find_all_domain()
+print(all_domains)
+
+all_desired_links = link_mgr.all_desired_links
+print(all_desired_links)
+
+all_desired_image_links = link_mgr.all_desired_image_links
+print(all_desired_image_links)
+
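get_test.py is a new smoke-test script that exercises the manager chain end to end: seleniumManager resolves the domain, urlManager normalizes the URL, requestManager fetches the source, soupManager parses it, and linkManager collects domains, links, and image links. A trimmed sketch of the same flow follows; it drops the blocking input() call, uses a placeholder URL, and assumes the managers can be imported by name from the package root (the script itself uses a star import).

    # Condensed flow from get_test.py; names mirror the script, URL is illustrative.
    from abstract_webtools import seleniumManager, urlManager, requestManager, soupManager, linkManager

    domain_mgr = seleniumManager('https://example.com')        # resolve the domain with Selenium
    url_mgr = urlManager(url=domain_mgr.domain)                 # normalize the URL
    req_mgr = requestManager(url=url_mgr.url, url_mgr=url_mgr)  # fetch the page source
    soup_mgr = soupManager(url_mgr=url_mgr, req_mgr=req_mgr)    # parse into BeautifulSoup
    link_mgr = linkManager(url_mgr.url)                         # gather links from the page

    print(link_mgr.find_all_domain())
    print(link_mgr.all_desired_links)
    print(link_mgr.all_desired_image_links)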
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/linkManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/mySocketClient.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/networkManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/requestManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/seleniumManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/soupManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/sslManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/tlsAdapter.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/urlManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/userAgentManager.py
RENAMED
File without changes
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/videoDownloader.py
RENAMED
@@ -1,4 +1,5 @@
 import os
+from .soupManager import *
 class VideoDownloader:
     """
     VideoDownloader is a class for downloading videos from URLs using YouTube-DL.
@@ -188,7 +189,7 @@ class VideoDownloader:
         self.start()

     def start(self):
-        download_thread = self.thread_manager.add_thread(name='download_thread',
+        download_thread = self.thread_manager.add_thread(name='download_thread',target_function=self.download)
         monitor_thread = self.thread_manager.add_thread(name='monitor_thread',target_function=self.monitor)
         self.thread_manager.start(name='download_thread')
         self.thread_manager.start(name='monitor_thread')
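The videoDownloader.py change above pairs a new soupManager import with a one-line bug fix: in 0.1.6.18 the download thread was registered with add_thread(name='download_thread', and nothing more, so no target_function was attached and starting that thread could not reach the download logic; 0.1.6.20 completes the call with target_function=self.download, mirroring how the monitor thread is registered on the following line.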
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/soup_gui.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/url_grabber.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/url_grabber_new.py
RENAMED
File without changes
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20/src/abstract_webtools.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.6.18
+Version: 0.1.6.20
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/SOURCES.txt
RENAMED
@@ -20,6 +20,7 @@ src/abstract_webtools/managers/cipherManager.py
 src/abstract_webtools/managers/crawlManager.py
 src/abstract_webtools/managers/domainManager.py
 src/abstract_webtools/managers/dynamicRateLimiter.py
+src/abstract_webtools/managers/get_test.py
 src/abstract_webtools/managers/linkManager.py
 src/abstract_webtools/managers/mySocketClient.py
 src/abstract_webtools/managers/networkManager.py
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/dependency_links.txt
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/requires.txt
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/top_level.txt
RENAMED
File without changes