abstract-webtools 0.1.6.18__tar.gz → 0.1.6.20__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/LICENSE +0 -0
- {abstract_webtools-0.1.6.18/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.20}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/README.md +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/pyproject.toml +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/setup.cfg +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/setup.py +1 -1
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/__init__.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/abstract_webtools.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/big_user_agent_list.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/main.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/__init__.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/cipherManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/crawlManager.py +117 -3
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/domainManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
- abstract_webtools-0.1.6.20/src/abstract_webtools/managers/get_test.py +39 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/linkManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/mySocketClient.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/networkManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/requestManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/seleniumManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/soupManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/sslManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/urlManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/userAgentManager.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/videoDownloader.py +2 -1
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/soup_gui.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/url_grabber.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/url_grabber_new.py +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20/src/abstract_webtools.egg-info}/PKG-INFO +1 -1
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/SOURCES.txt +1 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/requires.txt +0 -0
- {abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/top_level.txt +0 -0
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/LICENSE
RENAMED
File without changes
{abstract_webtools-0.1.6.18/src/abstract_webtools.egg-info → abstract_webtools-0.1.6.20}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.6.18
+Version: 0.1.6.20
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/README.md
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/pyproject.toml
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/setup.cfg
RENAMED
File without changes
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/setup.py
RENAMED
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
 setuptools.setup(
     name='abstract_webtools',
-    version='0.1.6.18',
+    version='0.1.6.20',
     author='putkoff',
     author_email='partners@abstractendeavors.com',
     description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/__init__.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/abstract_webtools.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/big_user_agent_list.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/main.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/__init__.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/cipherManager.py
RENAMED
File without changes
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/crawlManager.py
RENAMED
@@ -1,8 +1,122 @@
-from
+from abstract_webtools import * #.soupManager import *
 from urllib.parse import urlparse, urljoin
 import os
 import xml.etree.ElementTree as ET
 from bs4 import BeautifulSoup
+import requests
+from urllib.parse import urlparse, urljoin
+from bs4 import BeautifulSoup
+import xml.etree.ElementTree as ET
+import time
+
+class SitemapGenerator:
+    def __init__(self, base_url):
+        self.base_url = base_url.rstrip('/')
+        self.visited = set()  # Track visited URLs
+        self.sitemap_data = {}  # Store URL metadata including images and documents
+
+    def crawl(self, url, max_depth=3, depth=1):
+        """Recursively crawl website and collect internal URLs, images, and documents."""
+        if depth > max_depth or url in self.visited:
+            return
+
+        print(f"Crawling: {url}")
+        self.visited.add(url)
+
+        try:
+            response = requests.get(url)
+
+            if response.status_code == 200:
+                soup = get_all_attribute_values(url)
+                input(soup)
+                # Initialize data storage for this URL
+                self.sitemap_data[url] = {
+                    'images': [],
+                    'documents': [],
+                    'changefreq': 'weekly',
+                    'priority': '0.5',
+                    'lastmod': time.strftime('%Y-%m-%d')
+                }
+
+                # Extract images
+                images = [img.get('src') for img in soup.find_all('img', src=True)]
+                images = [urljoin(url, img) for img in images]
+                images = [img for img in images if self.is_internal_url(img)]
+                self.sitemap_data[url]['images'].extend(images)
+
+                # Extract documents (e.g., PDFs, DOCs)
+                documents = []
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    full_url = urljoin(url, href)
+                    if self.is_internal_url(full_url):
+                        if any(full_url.endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx']):
+                            documents.append(full_url)
+                        else:
+                            if full_url not in self.visited:
+                                self.crawl(full_url, max_depth, depth + 1)
+                self.sitemap_data[url]['documents'].extend(documents)
+
+                # Extract and crawl internal links
+                for link in soup.find_all('a', href=True):
+                    href = link['href']
+                    full_url = urljoin(url, href)
+                    if self.is_internal_url(full_url) and full_url not in self.visited:
+                        self.crawl(full_url, max_depth, depth + 1)
+
+        except Exception as e:
+            print(f"Error crawling {url}: {e}")
+
+    def is_internal_url(self, url):
+        """Check if URL is within the same domain."""
+        parsed_url = urlparse(url)
+        base_parsed_url = urlparse(self.base_url)
+        return (parsed_url.netloc == base_parsed_url.netloc or parsed_url.netloc == '') and not parsed_url.scheme.startswith('mailto')
+
+    def generate_sitemap_xml(self):
+        """Generate XML for the sitemap including URLs, images, and documents."""
+        NSMAP = {
+            None: "http://www.sitemaps.org/schemas/sitemap/0.9",
+            'image': "http://www.google.com/schemas/sitemap-image/1.1"
+        }
+        urlset = ET.Element("urlset", xmlns=NSMAP[None], attrib={'xmlns:image': NSMAP['image']})
+
+        for url, data in self.sitemap_data.items():
+            url_element = ET.SubElement(urlset, "url")
+            ET.SubElement(url_element, "loc").text = url
+            ET.SubElement(url_element, "lastmod").text = data['lastmod']
+            ET.SubElement(url_element, "changefreq").text = data['changefreq']
+            ET.SubElement(url_element, "priority").text = data['priority']
+
+            # Add images
+            for img_url in data['images']:
+                image_element = ET.SubElement(url_element, "{http://www.google.com/schemas/sitemap-image/1.1}image")
+                ET.SubElement(image_element, "{http://www.google.com/schemas/sitemap-image/1.1}loc").text = img_url
+
+            # Add documents as separate URLs
+            for doc_url in data['documents']:
+                doc_element = ET.SubElement(urlset, "url")
+                ET.SubElement(doc_element, "loc").text = doc_url
+                ET.SubElement(doc_element, "lastmod").text = data['lastmod']
+                ET.SubElement(doc_element, "changefreq").text = data['changefreq']
+                ET.SubElement(doc_element, "priority").text = data['priority']
+
+        # Write to sitemap.xml
+        tree = ET.ElementTree(urlset)
+        tree.write("sitemap.xml", encoding="utf-8", xml_declaration=True)
+        print("Sitemap generated and saved as sitemap.xml")
+
+    def run(self):
+        """Run the sitemap generator."""
+        self.crawl(self.base_url)
+        self.generate_sitemap_xml()
+
+# Example usage:
+if __name__ == "__main__":
+    base_url = 'https://pump.fun'  # Replace with your website URL
+    input(linkManager(base_url).find_all_domain())
+    generator = SitemapGenerator(base_url)
+    generator.run()
 
 class crawlManager:
     def __init__(self, url=None, req_mgr=None, url_mgr=None, source_code=None, parse_type="html.parser"):
@@ -12,7 +126,7 @@ class crawlManager:
         self.url_mgr = url_mgr or urlManager(url=self.url)
         self.req_mgr = req_mgr or requestManager(url_mgr=self.url_mgr)
         self.get_new_source_and_url(url)
-
+        self.sime_map_mgr = SitemapGenerator(self.url_mgr.domain)
     def get_new_source_and_url(self, url=None):
         """Fetches new source code and response for a given URL."""
         url = url
@@ -158,7 +272,7 @@ def get_crawl_mgr(url=None,req_mgr=None,url_mgr=None,source_code=None,parse_type
     url_mgr = get_url_mgr(url=url,url_mgr=url_mgr)
     url = get_url(url=url,url_mgr=url_mgr)
     req_mgr=get_req_mgr(url=url,url_mgr=url_mgr,source_code=source_code)
-    source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code
+    source_code = get_source(url=url,url_mgr=url_mgr,source_code=source_code)
     soup_mgr = get_soup_mgr(url=url,url_mgr=url_mgr,source_code=source_code,req_mgr=req_mgr,parse_type=parse_type)
     crawl_mgr = crawlManager(url=url,req_mgr=req_mgr,url_mgr=url_mgr,source_code=source_code,parse_type=parse_type)
     return crawl_mgr
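The bulk of this release is the SitemapGenerator class added at the top of crawlManager.py, plus the sime_map_mgr attribute that crawlManager now builds from the URL manager's domain. A minimal sketch of driving the new class directly follows; the import path is assumed from the file location shown above and the target URL is a placeholder. Note that crawl() as released still calls the module-level get_all_attribute_values() helper and an interactive input(), so it will pause (or fail if that helper is not exported) until those debug calls are removed.

    # Sketch only: assumes SitemapGenerator is importable from the module shown in this diff.
    from abstract_webtools.managers.crawlManager import SitemapGenerator

    generator = SitemapGenerator("https://example.com")  # placeholder site, not from the package
    generator.crawl(generator.base_url, max_depth=2)      # walk internal pages, images, documents
    generator.generate_sitemap_xml()                      # writes sitemap.xml to the working directory

generator.run() performs the same crawl-then-write sequence with the default depth of 3.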
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/domainManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/dynamicRateLimiter.py
RENAMED
File without changes
abstract_webtools-0.1.6.20/src/abstract_webtools/managers/get_test.py
ADDED
@@ -0,0 +1,39 @@
+from abstract_webtools import *
+
+
+
+url = 'https://pump.fun'
+input()
+print('starting')
+#SELENIUM/DOMAIN MANAGER
+domain_mgr = seleniumManager(url)
+print(domain_mgr.domain)
+
+#URL MANAGER
+url_mgr = urlManager(url=domain_mgr.domain)
+url = url_mgr.url
+print(url)
+
+#REQUEST MANAGER
+req_mgr = requestManager(url=url,url_mgr=url_mgr)
+source_code = req_mgr.source_code
+print(source_code)
+
+#SOUP MANAGER
+soup_mgr = soupManager(url_mgr=url_mgr,req_mgr=req_mgr)
+soup = soup_mgr.soup
+print(soup)
+all_attributes = soup_mgr.get_all_attribute_values()
+print(all_attributes)
+
+#LINK MANAGER
+link_mgr = linkManager(url)
+all_domains = link_mgr.find_all_domain()
+print(all_domains)
+
+all_desired_links = link_mgr.all_desired_links
+print(all_desired_links)
+
+all_desired_image_links = link_mgr.all_desired_image_links
+print(all_desired_image_links)
+
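get_test.py is a new smoke-test script that exercises the manager chain end to end: seleniumManager resolves the domain, urlManager normalizes the URL, requestManager fetches the source, soupManager parses it, and linkManager collects domains, links, and image links. A trimmed sketch of the same flow follows; it drops the blocking input() call, uses a placeholder URL, and assumes the managers can be imported by name from the package root (the script itself uses a star import).

    # Condensed flow from get_test.py; names mirror the script, URL is illustrative.
    from abstract_webtools import seleniumManager, urlManager, requestManager, soupManager, linkManager

    domain_mgr = seleniumManager('https://example.com')        # resolve the domain with Selenium
    url_mgr = urlManager(url=domain_mgr.domain)                 # normalize the URL
    req_mgr = requestManager(url=url_mgr.url, url_mgr=url_mgr)  # fetch the page source
    soup_mgr = soupManager(url_mgr=url_mgr, req_mgr=req_mgr)    # parse into BeautifulSoup
    link_mgr = linkManager(url_mgr.url)                         # gather links from the page

    print(link_mgr.find_all_domain())
    print(link_mgr.all_desired_links)
    print(link_mgr.all_desired_image_links)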
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/linkManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/mySocketClient.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/networkManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/requestManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/seleniumManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/soupManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/sslManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/tlsAdapter.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/urlManager.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/userAgentManager.py
RENAMED
File without changes
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/managers/videoDownloader.py
RENAMED
@@ -1,4 +1,5 @@
 import os
+from .soupManager import *
 class VideoDownloader:
     """
     VideoDownloader is a class for downloading videos from URLs using YouTube-DL.
@@ -188,7 +189,7 @@ class VideoDownloader:
         self.start()

     def start(self):
-        download_thread = self.thread_manager.add_thread(name='download_thread',
+        download_thread = self.thread_manager.add_thread(name='download_thread',target_function=self.download)
         monitor_thread = self.thread_manager.add_thread(name='monitor_thread',target_function=self.monitor)
         self.thread_manager.start(name='download_thread')
         self.thread_manager.start(name='monitor_thread')
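The videoDownloader.py change above pairs a new soupManager import with a one-line bug fix: in 0.1.6.18 the download thread was registered with add_thread(name='download_thread', and nothing more, so no target_function was attached and starting that thread could not reach the download logic; 0.1.6.20 completes the call with target_function=self.download, mirroring how the monitor thread is registered on the following line.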
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/soup_gui.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/url_grabber.py
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools/url_grabber_new.py
RENAMED
File without changes
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20/src/abstract_webtools.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: abstract_webtools
-Version: 0.1.6.18
+Version: 0.1.6.20
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/SOURCES.txt
RENAMED
@@ -20,6 +20,7 @@ src/abstract_webtools/managers/cipherManager.py
 src/abstract_webtools/managers/crawlManager.py
 src/abstract_webtools/managers/domainManager.py
 src/abstract_webtools/managers/dynamicRateLimiter.py
+src/abstract_webtools/managers/get_test.py
 src/abstract_webtools/managers/linkManager.py
 src/abstract_webtools/managers/mySocketClient.py
 src/abstract_webtools/managers/networkManager.py
{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/dependency_links.txt
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/requires.txt
RENAMED
File without changes

{abstract_webtools-0.1.6.18 → abstract_webtools-0.1.6.20}/src/abstract_webtools.egg-info/top_level.txt
RENAMED
File without changes