abstract-webtools 0.1.6.105__tar.gz → 0.1.6.107__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/PKG-INFO +1 -1
  2. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/setup.py +1 -1
  3. abstract_webtools-0.1.6.107/src/abstract_webtools/k2s_downloader.py +227 -0
  4. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools.egg-info/PKG-INFO +1 -1
  5. abstract_webtools-0.1.6.105/src/abstract_webtools/k2s_downloader.py +0 -165
  6. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/README.md +0 -0
  7. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/pyproject.toml +0 -0
  8. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/setup.cfg +0 -0
  9. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/__init__.py +0 -0
  10. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/abstract_usurpit.py +0 -0
  11. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/abstract_webtools.py +0 -0
  12. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/big_user_agent_list.py +0 -0
  13. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/domain_identifier.py +0 -0
  14. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/extention_list.py +0 -0
  15. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/find_dirs.py +0 -0
  16. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/main.py +0 -0
  17. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/__init__.py +0 -0
  18. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/allss//.py" +0 -0
  19. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/cipherManager.py +0 -0
  20. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/crawlManager.py +0 -0
  21. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/crawlmgr2.py +0 -0
  22. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/curlMgr.py +0 -0
  23. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/domainManager.py +0 -0
  24. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/dynamicRateLimiter.py +0 -0
  25. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/get_test.py +0 -0
  26. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/linkManager/__init__.py +0 -0
  27. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/linkManager/linkManager.py +0 -0
  28. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/mySocketClient.py +0 -0
  29. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/networkManager.py +0 -0
  30. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/requestManager/__init__.py +0 -0
  31. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/requestManager/requestManager.py +0 -0
  32. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/seleniumManager.py +0 -0
  33. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/soupManager/__init__.py +0 -0
  34. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/soupManager/asoueces.py +0 -0
  35. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/soupManager/soupManager.py +0 -0
  36. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/sslManager.py +0 -0
  37. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/tlsAdapter.py +0 -0
  38. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/urlManager/__init__.py +0 -0
  39. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/urlManager/urlManager.py +0 -0
  40. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/userAgentManager.py +0 -0
  41. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/videoDownloader.py +0 -0
  42. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/managers/videoDownloader2.py +0 -0
  43. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/soup_gui.py +0 -0
  44. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/url_grabber.py +0 -0
  45. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools/url_grabber_new.py +0 -0
  46. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools.egg-info/SOURCES.txt +0 -0
  47. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools.egg-info/dependency_links.txt +0 -0
  48. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools.egg-info/requires.txt +0 -0
  49. {abstract_webtools-0.1.6.105 → abstract_webtools-0.1.6.107}/src/abstract_webtools.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: abstract_webtools
- Version: 0.1.6.105
+ Version: 0.1.6.107
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -4,7 +4,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
      long_description = fh.read()
  setuptools.setup(
      name='abstract_webtools',
-     version='0.1.6.105',
+     version='0.1.6.107',
      author='putkoff',
      author_email='partners@abstractendeavors.com',
      description='Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.',
@@ -0,0 +1,227 @@
+ import os
+ import re
+ import time
+ import requests
+ import hashlib
+ from bs4 import BeautifulSoup
+ from urllib.parse import urljoin
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.common.keys import Keys
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from abstract_security import *
+ from abstract_webtools import *
+ from abstract_utilities import safe_dump_to_file, safe_load_from_json
+
+ DOWNLOAD_DIR = os.path.abspath("./downloads")
+
+ class K2SDownloader:
+     def __init__(self, env_path=None, download_dir=None, json_file_path=None):
+         self.download_dir = download_dir or DOWNLOAD_DIR
+         self.json_file_path = json_file_path
+         os.makedirs(self.download_dir, exist_ok=True)
+         self.env_path = env_path
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         })
+         self.driver = self._init_driver()
+         self.logged_in = False
+
+     def _init_driver(self):
+         options = webdriver.ChromeOptions()
+         options.add_argument("--disable-blink-features=AutomationControlled")
+         options.add_argument("--headless")
+         return webdriver.Chrome(options=options)
+
+     def login(self):
+         userName = get_env_value('userName', path=self.env_path)
+         passWord = get_env_value('passWord', path=self.env_path)
+
+         try:
+             self.driver.get("https://k2s.cc/auth/login")
+             print("Navigating to login page")
+             time.sleep(3)
+
+             email_input = WebDriverWait(self.driver, 10).until(
+                 EC.presence_of_element_located((By.NAME, "email"))
+             )
+             password_input = WebDriverWait(self.driver, 10).until(
+                 EC.presence_of_element_located((By.NAME, "password"))  # Updated field name
+             )
+             email_input.send_keys(userName)
+             password_input.send_keys(passWord)
+             password_input.send_keys(Keys.RETURN)
+             print("Submitted login credentials")
+
+             WebDriverWait(self.driver, 15).until(
+                 EC.url_contains("dashboard")  # Adjust based on post-login URL
+             )
+             self.logged_in = True
+             print("Login successful")
+         except Exception as e:
+             print(f"Login failed: {e}")
+             with open('login_error.html', 'w', encoding='utf-8') as f:
+                 f.write(self.driver.page_source)
+             raise
+
+     def get_file_metadata(self, download_url):
+         """Fetch filename and metadata using a HEAD request or page inspection."""
+         metadata = {'url': download_url, 'filename': None, 'size': None}
+         try:
+             # Try HEAD request first
+             response = self.session.head(download_url, allow_redirects=True)
+             if response.status_code == 200:
+                 cd = response.headers.get('Content-Disposition', '')
+                 if 'filename=' in cd:
+                     metadata['filename'] = cd.split('filename=')[-1].strip('"')
+                 metadata['size'] = response.headers.get('Content-Length')
+                 if not metadata['filename']:
+                     metadata['filename'] = download_url.split('/')[-1].split('?')[0]
+             else:
+                 # Fallback to page inspection if HEAD fails
+                 self.driver.get(download_url)
+                 WebDriverWait(self.driver, 10).until(
+                     EC.presence_of_element_located((By.TAG_NAME, "body"))
+                 )
+                 soup = BeautifulSoup(self.driver.page_source, 'html.parser')
+                 filename_tag = soup.select_one('a[href*="/download"]')
+                 metadata['filename'] = filename_tag.text.strip() if filename_tag else download_url.split('/')[-1]
+                 size_tag = soup.find(string=re.compile(r'\d+\.?\d*\s*(MB|GB|KB)'))
+                 metadata['size'] = size_tag.strip() if size_tag else None
+         except Exception as e:
+             print(f"Failed to fetch metadata for {download_url}: {e}")
+         return metadata
+
+     def download_file(self, url):
+         if not self.logged_in:
+             self.login()
+
+         print(f"Navigating to: {url}")
+         self.driver.get(url)
+         WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+
+         if 'captcha' in self.driver.page_source.lower():
+             print("CAPTCHA detected. Manual intervention required.")
+             return None
+
+         try:
+             download_button = WebDriverWait(self.driver, 30).until(
+                 EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[href*="/download"], button[class*="download"]'))
+             )
+             print("Download button found; attempting to fetch URL")
+             download_url = download_button.get_attribute('href')
+
+             if download_url:
+                 # Get metadata before downloading
+                 metadata = self.get_file_metadata(download_url)
+                 file_name = metadata['filename'] or self._extract_filename(None, download_url)
+                 file_path = os.path.join(self.download_dir, file_name)
+
+                 # Download the file
+                 response = self.session.get(download_url, stream=True)
+                 response.raise_for_status()
+
+                 with open(file_path, 'wb') as f:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         f.write(chunk)
+                 print(f"Downloaded: {file_path}")
+
+                 # Update metadata with file size if not already set
+                 if not metadata['size']:
+                     metadata['size'] = os.path.getsize(file_path)
+                 metadata['file_path'] = file_path
+
+                 return metadata
+             else:
+                 download_button.click()
+                 print("Button clicked. Waiting for download...")
+                 time.sleep(30)
+                 return None
+         except Exception as e:
+             print(f"Download failed for {url}: {e}")
+             return None
+
+     def _extract_filename(self, response, url):
+         if response:
+             cd = response.headers.get('Content-Disposition', '')
+             if 'filename=' in cd:
+                 return cd.split('filename=')[-1].strip('"')
+         return url.split('/')[-1].split('?')[0]
+
+ def get_json_key_value(json_data, key):
+     if json_data and isinstance(json_data, dict):
+         return json_data.get(key)
+
+ def compare_keys(json_data, comp_json_data, key):
+     json_key_value = get_json_key_value(json_data, key)
+     comp_json_key_value = get_json_key_value(comp_json_data, key)
+     return json_key_value and comp_json_key_value and json_key_value == comp_json_key_value
+
+ def check_json_data(json_list, new_data):
+     keys = ['k2s', 'filename', 'size']  # Check k2s URL, filename, and size
+     for json_data in json_list:
+         for key in keys:
+             if compare_keys(json_data, new_data, key):
+                 return True
+     return False
+
+ class dlsManager:
+     def __init__(self, downloader):
+         self.downloader = downloader
+         self.json_file_path = self.downloader.json_file_path
+         all_dls = None
+         if self.json_file_path:
+             all_dls = safe_load_from_json(self.json_file_path)
+         self.all_dls = all_dls or []
+         self.last_data = None
+
+     def is_prev_dl(self, data):
+         # Include metadata in data for duplicate checking
+         extended_data = data.copy()
+         if data.get('k2s'):
+             metadata = self.downloader.get_file_metadata(data['k2s'])
+             extended_data.update({
+                 'filename': metadata['filename'],
+                 'size': metadata['size']
+             })
+         if check_json_data(self.all_dls, extended_data):
+             self.last_data = None
+             return True
+         self.last_data = extended_data
+         return False
+
+     def dl_k2s_link(self, k2s_link):
+         if k2s_link:
+             print(f"Downloading: {k2s_link}")
+             metadata = self.downloader.download_file(k2s_link)
+             time.sleep(10)
+             if metadata and self.json_file_path and self.last_data:
+                 self.last_data.update(metadata)  # Merge download metadata
+                 self.all_dls.append(self.last_data)
+                 safe_dump_to_file(data=self.all_dls, file_path=self.json_file_path)
+
+ def get_soup(url):
+     try:
+         resp = requests.get(url)
+         resp.raise_for_status()
+         return BeautifulSoup(resp.text, 'html.parser')
+     except Exception as e:
+         print(f"Failed to fetch soup for {url}: {e}")
+         return None
+
+ def get_k2s_link(soup):
+     match = re.search(r'https://k2s\.cc/file/[^"<]+', str(soup))
+     return match.group(0) if match else None
+
+ def get_sections_content(content,get_post_attribute,dls_mgr):
+     results=[]
+     if not content:
+         return []
+     for section in content:
+         data = get_post_attribute(section)
+         if data and data.get('k2s') and not dls_mgr.is_prev_dl(data):
+             dls_mgr.dl_k2s_link(data['k2s'])
+             results.append(data)
+     return results
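
A minimal usage sketch of the rewritten k2s_downloader module follows. It assumes the module is importable as abstract_webtools.k2s_downloader and that an env file readable by get_env_value defines 'userName' and 'passWord'; the env path, "downloads.json" history file, listing URL, and parse_post() callback are hypothetical placeholders, not values from the package.

# Minimal usage sketch; all paths, URLs, and the parse_post() callback below are
# hypothetical placeholders rather than values shipped with the package.
from abstract_webtools.k2s_downloader import (
    K2SDownloader, dlsManager, get_soup, get_k2s_link, get_sections_content
)

# Hypothetical env file providing 'userName' and 'passWord' for get_env_value,
# plus a JSON file used as the download-history store.
downloader = K2SDownloader(env_path="k2s.env", json_file_path="downloads.json")
dls_mgr = dlsManager(downloader)

def parse_post(section):
    # Hypothetical callback: pull the k2s file link out of one post section.
    return {'k2s': get_k2s_link(section)}

soup = get_soup("https://example.com/posts")  # placeholder listing page
sections = soup.find_all("article") if soup else []
new_items = get_sections_content(sections, parse_post, dls_mgr)
print(f"Fetched {len(new_items)} new files into {downloader.download_dir}")
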
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: abstract_webtools
- Version: 0.1.6.105
+ Version: 0.1.6.107
  Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
  Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
  Author: putkoff
@@ -1,165 +0,0 @@
- import os
- import re
- import time
- import requests
- from bs4 import BeautifulSoup
- from urllib.parse import urljoin
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from abstract_security import *
- from abstract_webtools import *
- from abstract_utilities import safe_dump_to_file,safe_load_from_json
- DOWNLOAD_DIR = os.path.abspath("./downloads")
- class K2SDownloader:
-     def __init__(self,env_path=None,download_dir=None,json_file_path=None):
-         self.download_dir = download_dir or DOWNLOAD_DIR
-         self.json_file_path = json_file_path
-         os.makedirs(self.download_dir, exist_ok=True)
-         self.env_path = env_path
-         self.session = requests.Session()
-         self.session.headers.update({
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-         })
-         self.driver = self._init_driver()
-         self.logged_in = False
-
-     def _init_driver(self):
-         options = webdriver.ChromeOptions()
-         options.add_argument("--disable-blink-features=AutomationControlled")
-         options.add_argument("--headless")
-         return webdriver.Chrome(options=options)
-
-     def login(self):
-         userName = get_env_value('userName',path=self.env_path)
-         passWord = get_env_value('passWord',path=self.env_path)
-
-         self.driver.get("https://k2s.cc/auth/login")
-         time.sleep(3)
-
-
-         email_input = self.driver.find_element(By.NAME, "email")
-         password_input = self.driver.find_element(By.NAME, "input-password-auto-complete-on")
-         email_input.send_keys(userName)
-         password_input.send_keys(passWord)
-         password_input.send_keys(Keys.RETURN)
-
-         #WebDriverWait(self.driver, 20).until(
-         #    EC.presence_of_element_located((By.XPATH, "//a[contains(text(), 'Logout')]"))
-         #)
-         self.logged_in = True
-         print("Login successful")
-         #except Exception as e:
-         #    print(f"Login failed: {e}")
-         #    with open('login_error.html', 'w', encoding='utf-8') as f:
-         #        f.write(self.driver.page_source)
-
-     def download_file(self, url):
-         if not self.logged_in:
-             self.login()
-
-         print(f"Navigating to: {url}")
-         self.driver.get(url)
-         time.sleep(5)
-
-         if 'captcha' in self.driver.page_source.lower():
-             print("CAPTCHA detected. Manual intervention required.")
-             return
-
-         try:
-             download_button = WebDriverWait(self.driver, 30).until(
-                 EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[href*="/download"], button[class*="download"]'))
-             )
-             print("Download button found; attempting to click or fetch URL")
-             download_url = download_button.get_attribute('href')
-
-             if download_url:
-                 response = self.session.get(download_url, stream=True)
-                 file_name = self._extract_filename(response, download_url)
-                 file_path = os.path.join(self.download_dir, file_name)
-
-                 with open(file_path, 'wb') as f:
-                     for chunk in response.iter_content(chunk_size=8192):
-                         f.write(chunk)
-                 print(f"Downloaded: {file_path}")
-                 return file_path
-             else:
-                 download_button.click()
-                 print("Button clicked. Waiting for download...")
-                 time.sleep(30)  # adjust as needed
-         except Exception as e:
-             print(f"Download failed for {url}: {e}")
-
-     def _extract_filename(self, response, url):
-         cd = response.headers.get('Content-Disposition', '')
-         if 'filename=' in cd:
-             return cd.split('filename=')[-1].strip('"')
-         return url.split('/')[-1].split('?')[0]
- def get_json_key_value(json_data,key):
-     if json_data and isinstance(json_data,dict):
-         return json_data.get(key)
- def compare_keys(json_data,comp_json_data,key):
-     json_key_value = get_json_key_value(json_data,key)
-     comp_json_key_value = get_json_key_value(comp_json_data,key)
-     if json_key_value and comp_json_key_value and comp_json_key_value==json_key_value:
-         return True
- def check_json_data(json_list,new_data):
-     keys = ['k2s','link','name']
-     for json_data in json_list:
-         for key in keys:
-             result = compare_keys(json_data,new_data,key)
-             if result:
-                 return result
-
- class dlsManager:
-     def __init__(self, downloader):
-         self.downloader = downloader
-         self.json_file_path = self.downloader.json_file_path
-         all_dls= None
-         if self.json_file_path:
-             all_dls = safe_load_from_json(self.json_file_path)
-         self.all_dls = all_dls or []
-         self.last_data = None
-     def is_prev_dl(self, data):
-         if check_json_data(self.all_dls,data):
-             self.last_data = None
-             return True
-         self.last_data = data
-         return False
-
-     def dl_k2s_link(self, k2s_link):
-         if k2s_link:
-             print(f"Downloading: {k2s_link}")
-             self.downloader.download_file(k2s_link)
-             time.sleep(10)
-             if self.json_file_path:
-                 self.all_dls.append(self.last_data)
-                 safe_dump_to_file(data=self.all_dls,
-                                   file_path=self.json_file_path)
-
-
- def get_soup(url):
-     try:
-         resp = requests.get(url)
-         resp.raise_for_status()
-         return BeautifulSoup(resp.text, 'html.parser')
-     except Exception as e:
-         print(f"Failed to fetch soup for {url}: {e}")
-         return None
-
- def get_k2s_link(soup):
-     match = re.search(r'https://k2s\.cc/file/[^"<]+', str(soup))
-     return match.group(0) if match else None
-
- def get_sections_content(content,get_post_attribute,dls_mgr):
-     results=[]
-     if not content:
-         return []
-     for section in content:
-         data = get_post_attribute(section)
-         if data and data.get('k2s') and not dls_mgr.is_prev_dl(data):
-             dls_mgr.dl_k2s_link(data['k2s'])
-             results.append(data)
-     return results