abstract-webtools 0.1.6.105__py3-none-any.whl → 0.1.6.107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_webtools/k2s_downloader.py +117 -55
- {abstract_webtools-0.1.6.105.dist-info → abstract_webtools-0.1.6.107.dist-info}/METADATA +1 -1
- {abstract_webtools-0.1.6.105.dist-info → abstract_webtools-0.1.6.107.dist-info}/RECORD +5 -5
- {abstract_webtools-0.1.6.105.dist-info → abstract_webtools-0.1.6.107.dist-info}/WHEEL +0 -0
- {abstract_webtools-0.1.6.105.dist-info → abstract_webtools-0.1.6.107.dist-info}/top_level.txt +0 -0
abstract_webtools/k2s_downloader.py

@@ -2,6 +2,7 @@ import os
 import re
 import time
 import requests
+import hashlib
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from selenium import webdriver
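Note: the only addition in this hunk is `import hashlib`, and none of the diffed hunks below show a call site, so its intended use is not visible in this diff. If it is meant for fingerprinting downloaded files (a common de-duplication pattern that would fit the metadata changes below), a minimal sketch would look like this; the `file_sha256` helper is hypothetical and not part of the package:

import hashlib

def file_sha256(path, chunk_size=8192):
    # Hash in chunks so large downloads never need to fit in memory.
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()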
@@ -11,10 +12,12 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from abstract_security import *
 from abstract_webtools import *
-from abstract_utilities import safe_dump_to_file,safe_load_from_json
+from abstract_utilities import safe_dump_to_file, safe_load_from_json
+
 DOWNLOAD_DIR = os.path.abspath("./downloads")
+
 class K2SDownloader:
-    def __init__(self,env_path=None,download_dir=None,json_file_path=None):
+    def __init__(self, env_path=None, download_dir=None, json_file_path=None):
         self.download_dir = download_dir or DOWNLOAD_DIR
         self.json_file_path = json_file_path
         os.makedirs(self.download_dir, exist_ok=True)
@@ -33,28 +36,63 @@ class K2SDownloader:
         return webdriver.Chrome(options=options)
 
     def login(self):
-        userName = get_env_value('userName',path=self.env_path)
-        passWord = get_env_value('passWord',path=self.env_path)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        userName = get_env_value('userName', path=self.env_path)
+        passWord = get_env_value('passWord', path=self.env_path)
+
+        try:
+            self.driver.get("https://k2s.cc/auth/login")
+            print("Navigating to login page")
+            time.sleep(3)
+
+            email_input = WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.NAME, "email"))
+            )
+            password_input = WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.NAME, "password"))  # Updated field name
+            )
+            email_input.send_keys(userName)
+            password_input.send_keys(passWord)
+            password_input.send_keys(Keys.RETURN)
+            print("Submitted login credentials")
+
+            WebDriverWait(self.driver, 15).until(
+                EC.url_contains("dashboard")  # Adjust based on post-login URL
+            )
+            self.logged_in = True
+            print("Login successful")
+        except Exception as e:
+            print(f"Login failed: {e}")
+            with open('login_error.html', 'w', encoding='utf-8') as f:
+                f.write(self.driver.page_source)
+            raise
+
+    def get_file_metadata(self, download_url):
+        """Fetch filename and metadata using a HEAD request or page inspection."""
+        metadata = {'url': download_url, 'filename': None, 'size': None}
+        try:
+            # Try HEAD request first
+            response = self.session.head(download_url, allow_redirects=True)
+            if response.status_code == 200:
+                cd = response.headers.get('Content-Disposition', '')
+                if 'filename=' in cd:
+                    metadata['filename'] = cd.split('filename=')[-1].strip('"')
+                metadata['size'] = response.headers.get('Content-Length')
+                if not metadata['filename']:
+                    metadata['filename'] = download_url.split('/')[-1].split('?')[0]
+            else:
+                # Fallback to page inspection if HEAD fails
+                self.driver.get(download_url)
+                WebDriverWait(self.driver, 10).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "body"))
+                )
+                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
+                filename_tag = soup.select_one('a[href*="/download"]')
+                metadata['filename'] = filename_tag.text.strip() if filename_tag else download_url.split('/')[-1]
+                size_tag = soup.find(string=re.compile(r'\d+\.?\d*\s*(MB|GB|KB)'))
+                metadata['size'] = size_tag.strip() if size_tag else None
+        except Exception as e:
+            print(f"Failed to fetch metadata for {download_url}: {e}")
+        return metadata
 
     def download_file(self, url):
         if not self.logged_in:
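The new `get_file_metadata` parses `Content-Disposition` by splitting on `'filename='`, which mishandles headers that carry parameters after the filename (and RFC 5987 `filename*=` forms). A more robust standalone sketch using only the standard library; the `parse_cd_filename` name is illustrative, not part of this package:

from email.message import Message

def parse_cd_filename(cd_header):
    # Message.get_param understands quoted strings and RFC 2231/5987
    # encodings, unlike naive splitting on 'filename='.
    msg = Message()
    msg['Content-Disposition'] = cd_header
    value = msg.get_param('filename', header='content-disposition')
    if isinstance(value, tuple):
        # RFC 2231 encoded parameter: (charset, language, value)
        value = value[2]
    return value

# Example: for 'attachment; filename="movie.mp4"; size=123' the split-based
# version returns 'movie.mp4"; size=123', while this returns 'movie.mp4'.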
@@ -62,83 +100,107 @@ class K2SDownloader:
 
         print(f"Navigating to: {url}")
         self.driver.get(url)
-
+        WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
 
         if 'captcha' in self.driver.page_source.lower():
             print("CAPTCHA detected. Manual intervention required.")
-            return
+            return None
 
         try:
            download_button = WebDriverWait(self.driver, 30).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, 'a[href*="/download"], button[class*="download"]'))
            )
-            print("Download button found; attempting to
+            print("Download button found; attempting to fetch URL")
             download_url = download_button.get_attribute('href')
 
             if download_url:
-
-
+                # Get metadata before downloading
+                metadata = self.get_file_metadata(download_url)
+                file_name = metadata['filename'] or self._extract_filename(None, download_url)
                 file_path = os.path.join(self.download_dir, file_name)
 
+                # Download the file
+                response = self.session.get(download_url, stream=True)
+                response.raise_for_status()
+
                 with open(file_path, 'wb') as f:
                     for chunk in response.iter_content(chunk_size=8192):
                         f.write(chunk)
                 print(f"Downloaded: {file_path}")
-
+
+                # Update metadata with file size if not already set
+                if not metadata['size']:
+                    metadata['size'] = os.path.getsize(file_path)
+                metadata['file_path'] = file_path
+
+                return metadata
             else:
                 download_button.click()
                 print("Button clicked. Waiting for download...")
-                time.sleep(30)
+                time.sleep(30)
+                return None
         except Exception as e:
             print(f"Download failed for {url}: {e}")
+            return None
 
     def _extract_filename(self, response, url):
-
-
-
+        if response:
+            cd = response.headers.get('Content-Disposition', '')
+            if 'filename=' in cd:
+                return cd.split('filename=')[-1].strip('"')
         return url.split('/')[-1].split('?')[0]
-
-
+
+def get_json_key_value(json_data, key):
+    if json_data and isinstance(json_data, dict):
         return json_data.get(key)
-
-
-
-
-
-
-
+
+def compare_keys(json_data, comp_json_data, key):
+    json_key_value = get_json_key_value(json_data, key)
+    comp_json_key_value = get_json_key_value(comp_json_data, key)
+    return json_key_value and comp_json_key_value and json_key_value == comp_json_key_value
+
+def check_json_data(json_list, new_data):
+    keys = ['k2s', 'filename', 'size']  # Check k2s URL, filename, and size
     for json_data in json_list:
         for key in keys:
-
-
-
+            if compare_keys(json_data, new_data, key):
+                return True
+    return False
 
 class dlsManager:
     def __init__(self, downloader):
         self.downloader = downloader
         self.json_file_path = self.downloader.json_file_path
-        all_dls= None
+        all_dls = None
         if self.json_file_path:
             all_dls = safe_load_from_json(self.json_file_path)
-        self.all_dls = all_dls or
+        self.all_dls = all_dls or []
         self.last_data = None
+
     def is_prev_dl(self, data):
-
+        # Include metadata in data for duplicate checking
+        extended_data = data.copy()
+        if data.get('k2s'):
+            metadata = self.downloader.get_file_metadata(data['k2s'])
+            extended_data.update({
+                'filename': metadata['filename'],
+                'size': metadata['size']
+            })
+        if check_json_data(self.all_dls, extended_data):
             self.last_data = None
             return True
-        self.last_data =
+        self.last_data = extended_data
         return False
 
     def dl_k2s_link(self, k2s_link):
         if k2s_link:
             print(f"Downloading: {k2s_link}")
-            self.downloader.download_file(k2s_link)
+            metadata = self.downloader.download_file(k2s_link)
             time.sleep(10)
-            if self.json_file_path:
+            if metadata and self.json_file_path and self.last_data:
+                self.last_data.update(metadata)  # Merge download metadata
                 self.all_dls.append(self.last_data)
-                safe_dump_to_file(data=self.all_dls,
-                                  file_path=self.json_file_path)
-
+                safe_dump_to_file(data=self.all_dls, file_path=self.json_file_path)
 
 def get_soup(url):
     try:
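For reference, the duplicate check added above treats a new entry as already downloaded if any one of the keys `k2s`, `filename`, or `size` matches a stored record. A small usage sketch against the helpers defined in this hunk (the sample records are made up):

all_dls = [
    {'k2s': 'https://k2s.cc/file/abc123', 'filename': 'clip.mp4', 'size': '700 MB'},
]
new_data = {'k2s': 'https://k2s.cc/file/xyz789', 'filename': 'clip.mp4', 'size': None}

# Prints True: the filenames match even though the URLs differ. Matching on
# any single key is deliberately aggressive; two distinct files that happen
# to share a name (or a size) will be treated as duplicates and skipped.
print(check_json_data(all_dls, new_data))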
{abstract_webtools-0.1.6.105.dist-info → abstract_webtools-0.1.6.107.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: abstract_webtools
-Version: 0.1.6.105
+Version: 0.1.6.107
 Summary: Abstract Web Tools is a Python package that provides various utility functions for web scraping tasks. It is built on top of popular libraries such as `requests`, `BeautifulSoup`, and `urllib3` to simplify the process of fetching and parsing web content.
 Home-page: https://github.com/AbstractEndeavors/abstract_essentials/tree/main/abstract_webtools
 Author: putkoff
{abstract_webtools-0.1.6.105.dist-info → abstract_webtools-0.1.6.107.dist-info}/RECORD

@@ -6,7 +6,7 @@ abstract_webtools/big_user_agent_list.py,sha256=5ZkrUWmfzYL5yaULREslh9ZiRQeITbSj
 abstract_webtools/domain_identifier.py,sha256=AvWlGD7C19rySa_J_Brxi3kz43LMWvGsshuuZNg7MvI,3320
 abstract_webtools/extention_list.py,sha256=gRSO4nMbuuXDYzd-ss4s64sS80ZHmUoazMCpgoKG5vE,4884
 abstract_webtools/find_dirs.py,sha256=BlE4ruzMABqmv03NcutZ1j5N3pCc-Q4uNEAMpNolZCQ,2609
-abstract_webtools/k2s_downloader.py,sha256=
+abstract_webtools/k2s_downloader.py,sha256=m2M1LlYdXGSOy3MNn8YPn0Gz70LLbXXDa_aUP3tvUm0,9213
 abstract_webtools/main.py,sha256=_I7pPXPkoLZOoYGLQDrSLGhGuQt6-PVyXEHZSmglk2g,1329
 abstract_webtools/soup_gui.py,sha256=n95YAps1R6DpMwR4UbthSqQby0C5WHUa9tsW-f2qpLg,5184
 abstract_webtools/url_grabber.py,sha256=pnCCev7ZIuM-6cAGTLmK5HfzZg_AX-fLcRpB6ZE70B8,10441
@@ -42,7 +42,7 @@ abstract_webtools/managers/soupManager/soupManager.py,sha256=U3_o189-OWoBRaSCe2s
 abstract_webtools/managers/urlManager/__init__.py,sha256=gaJCHeK91Z-eYsBnxgdhbIUten1-gbx-zqx70R6ag-Y,26
 abstract_webtools/managers/urlManager/urlManager.py,sha256=vCFuLADmv3h7icaaoAsImGqb_49VizPY_ZvMl-C7PYk,7756
 abstract_webtools/managers/videos/Heather brooke swallo from condom.mp4,sha256=h-bKFLAHt7pGLGu4EcMvSSox7BPRK0Nga3u813iMVKQ,8335544
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
-abstract_webtools-0.1.6.
+abstract_webtools-0.1.6.107.dist-info/METADATA,sha256=std8u1_zW1pWxvRY0djcPaeCuUUX9yohGkY8fT-cwTs,7289
+abstract_webtools-0.1.6.107.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+abstract_webtools-0.1.6.107.dist-info/top_level.txt,sha256=2DMJ7RmjTcjCsa-uwAV0K6eXXlIIkFDEjBLg_uyCmCI,18
+abstract_webtools-0.1.6.107.dist-info/RECORD,,
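Background on the RECORD entries above: per the wheel spec (PEP 376/427), each line is `path,sha256=<digest>,<size>`, where the digest is the urlsafe base64 encoding of the file's SHA-256 hash with trailing `=` padding stripped. A minimal sketch that reproduces an entry:

import base64, hashlib, os

def record_entry(path):
    # Wheel RECORD digests: urlsafe base64, '=' padding removed.
    with open(path, 'rb') as f:
        digest = hashlib.sha256(f.read()).digest()
    encoded = base64.urlsafe_b64encode(digest).rstrip(b'=').decode('ascii')
    return '{},sha256={},{}'.format(path, encoded, os.path.getsize(path))

# record_entry('abstract_webtools/k2s_downloader.py')
# -> 'abstract_webtools/k2s_downloader.py,sha256=m2M1LlYdXGSO...,9213'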
{abstract_webtools-0.1.6.105.dist-info → abstract_webtools-0.1.6.107.dist-info}/WHEEL RENAMED
File without changes

{abstract_webtools-0.1.6.105.dist-info → abstract_webtools-0.1.6.107.dist-info}/top_level.txt RENAMED
File without changes