py2ls-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/COMMIT_EDITMSG +1 -0
- py2ls/.git/FETCH_HEAD +1 -0
- py2ls/.git/HEAD +1 -0
- py2ls/.git/config +15 -0
- py2ls/.git/description +1 -0
- py2ls/.git/hooks/applypatch-msg.sample +15 -0
- py2ls/.git/hooks/commit-msg.sample +24 -0
- py2ls/.git/hooks/fsmonitor-watchman.sample +174 -0
- py2ls/.git/hooks/post-update.sample +8 -0
- py2ls/.git/hooks/pre-applypatch.sample +14 -0
- py2ls/.git/hooks/pre-commit.sample +49 -0
- py2ls/.git/hooks/pre-merge-commit.sample +13 -0
- py2ls/.git/hooks/pre-push.sample +53 -0
- py2ls/.git/hooks/pre-rebase.sample +169 -0
- py2ls/.git/hooks/pre-receive.sample +24 -0
- py2ls/.git/hooks/prepare-commit-msg.sample +42 -0
- py2ls/.git/hooks/push-to-checkout.sample +78 -0
- py2ls/.git/hooks/update.sample +128 -0
- py2ls/.git/index +0 -0
- py2ls/.git/info/exclude +6 -0
- py2ls/.git/logs/HEAD +1 -0
- py2ls/.git/logs/refs/heads/main +1 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/logs/refs/remotes/origin/main +1 -0
- py2ls/.git/objects/25/b796accd261b9135fd32a2c00785f68edf6c46 +0 -0
- py2ls/.git/objects/36/b4a1b7403abc6c360f8fe2cb656ab945254971 +0 -0
- py2ls/.git/objects/3f/d6561300938afbb3d11976cf9c8f29549280d9 +0 -0
- py2ls/.git/objects/58/20a729045d4dc7e37ccaf8aa8eec126850afe2 +0 -0
- py2ls/.git/objects/60/f273eb1c412d916fa3f11318a7da7a9911b52a +0 -0
- py2ls/.git/objects/61/570cec8c061abe74121f27f5face6c69b98f99 +0 -0
- py2ls/.git/objects/69/13c452ca319f7cbf6a0836dc10a5bb033c84e4 +0 -0
- py2ls/.git/objects/78/3d4167bc95c9d2175e0df03ef1c1c880ba75ab +0 -0
- py2ls/.git/objects/79/7ae089b2212a937840e215276005ce76881307 +0 -0
- py2ls/.git/objects/7e/5956c806b5edc344d46dab599dec337891ba1f +1 -0
- py2ls/.git/objects/8e/55a7d2b96184030211f20c9b9af201eefcac82 +0 -0
- py2ls/.git/objects/91/c69ad88fe0ba94aa7859fb5f7edac5e6f1a3f7 +0 -0
- py2ls/.git/objects/b0/56be4be89ba6b76949dd641df45bb7036050c8 +0 -0
- py2ls/.git/objects/b0/9cd7856d58590578ee1a4f3ad45d1310a97f87 +0 -0
- py2ls/.git/objects/d9/005f2cc7fc4e65f14ed5518276007c08cf2fd0 +0 -0
- py2ls/.git/objects/df/e0770424b2a19faf507a501ebfc23be8f54e7b +0 -0
- py2ls/.git/objects/e9/391ffe371f1cc43b42ef09b705d9c767c2e14f +0 -0
- py2ls/.git/objects/fc/292e793ecfd42240ac43be407023bd731fa9e7 +0 -0
- py2ls/.git/refs/heads/main +1 -0
- py2ls/.git/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/refs/remotes/origin/main +1 -0
- py2ls/.gitattributes +2 -0
- py2ls/.gitignore +152 -0
- py2ls/LICENSE +201 -0
- py2ls/README.md +409 -0
- py2ls/__init__.py +17 -0
- py2ls/brain_atlas.py +145 -0
- py2ls/correlators.py +475 -0
- py2ls/dbhandler.py +97 -0
- py2ls/freqanalysis.py +800 -0
- py2ls/internet_finder.py +405 -0
- py2ls/ips.py +2844 -0
- py2ls/netfinder.py +780 -0
- py2ls/sleep_events_detectors.py +1350 -0
- py2ls/translator.py +686 -0
- py2ls/version.py +1 -0
- py2ls/wb_detector.py +169 -0
- py2ls-0.1.0.dist-info/METADATA +12 -0
- py2ls-0.1.0.dist-info/RECORD +64 -0
- py2ls-0.1.0.dist-info/WHEEL +4 -0
py2ls/netfinder.py
ADDED
@@ -0,0 +1,780 @@
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse, urljoin
import base64
import pandas as pd
from collections import Counter
import random
import logging
from time import sleep
import stem.process
from stem import Signal
from stem.control import Controller
import json
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Suppress WDM INFO logs
logging.getLogger("WDM").setLevel(logging.WARNING)
proxies_glob = None

# Define supported content types and corresponding parsers
CONTENT_PARSERS = {
    "text/html": lambda text, parser: BeautifulSoup(text, parser),
    "application/json": lambda text, parser: json.loads(text),
    "text/xml": lambda text, parser: BeautifulSoup(text, parser),
    "text/plain": lambda text, parser: text,  # plain text needs no parsing (the decoded string has no .text attribute)
}


def user_agent(browsers=["chrome", "edge", "firefox", "safari"], platforms=["pc", "tablet"], verbose=False, os=["windows", "macos", "linux"]):
    # Return a random User-Agent string matching the given browsers/platforms/OS list
    ua = UserAgent(browsers=browsers, platforms=platforms, os=os)
    output_ua = ua.random
    if verbose:
        print(output_ua)
    return output_ua

# def extract_text_from_content(content,where,what,extend=False):
#     if extend:
#         texts = ""

#         def extract_text(element):
#             nonlocal texts
#             if isinstance(element, str) and element.strip():
#                 texts += element.strip()
#             elif hasattr(element, "children"):
#                 for child in element.children:
#                     extract_text(child)

#         result_set = (
#             content.find_all(where, class_=what)
#             if what
#             else content.find_all(where)
#         )
#         for tag in result_set:
#             extract_text(tag)

#         text = [tx for tx in texts.split("\n") if tx]
#         return text
#     else:
#         result_set = (
#             content.find_all(where, class_=what)
#             if what
#             else content.find_all(where)
#         )
#         texts_ = " ".join(tag.get_text() + "\n" for tag in result_set)
#         texts = [tx.strip() for tx in texts_.split("\n") if tx]
#         return texts
# def extract_text_from_content(content, where, what=None, extend=True):
#     if extend:
#         def extract_text(element):
#             texts = ""
#             if isinstance(element, str) and element.strip():
#                 texts += element.strip()
#             elif hasattr(element, "children"):
#                 for child in element.children:
#                     texts += extract_text(child)
#             return texts

#         result_set = content.find_all(where, class_=what) if what else content.find_all(where)
#         texts = ""
#         for tag in result_set:
#             texts += extract_text(tag) + "\n"
#         text_list = [tx.strip() for tx in texts.split("\n") if tx.strip()]
#         return text_list
#     else:
#         result_set = content.find_all(where, class_=what) if what else content.find_all(where)
#         texts_ = " ".join(tag.get_text() for tag in result_set)
#         texts = [tx.strip() for tx in texts_.split("\n") if tx.strip()]
#         return texts


def extract_text_from_content(content, content_type="text/html", where=None, what=None, extend=True, **kwargs):
    if content is None:
        logger.error("Content is None, cannot extract text.")
        return []

    if content_type not in CONTENT_PARSERS:
        logger.error(f"Unsupported content type: {content_type}")
        return []

    if "json" in content_type:
        where = None
        return extract_text_from_json(content, where)
    elif "text" in content_type:
        if extend:
            def extract_text(element):
                texts = ""
                if isinstance(element, str) and element.strip():
                    texts += element.strip()
                elif hasattr(element, "children"):
                    for child in element.children:
                        texts += extract_text(child)
                return texts

            search_kwargs = {**kwargs}
            if what:
                search_kwargs["class_"] = what

            result_set = content.find_all(where, **search_kwargs)
            texts = ""
            for tag in result_set:
                texts += extract_text(tag) + "\n"
            text_list = [tx.strip() for tx in texts.split("\n") if tx.strip()]
            return text_list
        else:
            search_kwargs = {**kwargs}
            if what:
                search_kwargs["class_"] = what

            result_set = content.find_all(where, **search_kwargs)
            texts_ = " ".join(tag.get_text() for tag in result_set)
            texts = [tx.strip() for tx in texts_.split("\n") if tx.strip()]
            return texts


def extract_text_from_json(content, key=None):
    if key:
        if isinstance(content, list):
            return [str(item.get(key, "")) for item in content if key in item]
        if isinstance(content, dict):
            return [str(content.get(key, ""))]
    else:
        return [str(value) for key, value in flatten_json(content).items()]


def flatten_json(y):
    # Flatten a nested dict/list into a single-level dict with underscore-joined keys
    out = {}

    def flatten(x, name=""):
        if isinstance(x, dict):
            for a in x:
                flatten(x[a], name + a + "_")
        elif isinstance(x, list):
            i = 0
            for a in x:
                flatten(a, name + str(i) + "_")
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out


def get_proxy():
    list_ = []
    headers = {"User-Agent": user_agent()}
    response = requests.get("https://free-proxy-list.net", headers=headers)
    content = BeautifulSoup(response.content, "html.parser")
    info = extract_text_from_content(content, where="td", extend=0)[0].split()
    count, pair_proxy = 0, 2
    for i, j in enumerate(info):
        if "." in j:
            list_.append(j + ":" + info[i + 1])
            # list_.append()  # Assuming the next item is the value
            count += 1
            # if count == pair_proxy:  # Stop after extracting the desired number of pairs
            #     break
    prox = random.sample(list_, 2)
    proxies = {
        "http": "http://" + prox[0],
        "https": "http://" + prox[1],
    }
    return proxies
# proxies_glob=get_proxy()


def fetch_all(url, parser="lxml"):  # lxml is faster, # parser="html.parser"
    try:
        # Generate a random user-agent string
        headers = {"User-Agent": user_agent()}

        # Send the initial request
        response = requests.get(url, headers=headers, proxies=proxies_glob)

        # If the response is a redirect, follow it
        while response.is_redirect:
            logger.info(f"Redirecting to: {response.headers['Location']}")
            response = requests.get(response.headers["Location"], headers=headers, proxies=proxies_glob)
        # Check for a 403 error
        if response.status_code == 403:
            logger.warning("403 Forbidden error. Retrying...")
            # Retry the request after a short delay
            sleep(random.uniform(1, 3))
            response = requests.get(url, headers=headers, proxies=proxies_glob)
            # Raise an error if the retry also fails
            response.raise_for_status()

        # Raise an error for other HTTP status codes
        response.raise_for_status()

        # Get the content type
        content_type = response.headers.get("content-type", "").split(";")[0].lower()
        if response.encoding:
            content = response.content.decode(response.encoding)
        else:
            content = None
        # logger.info(f"Content type: {content_type}")

        # Check if content type is supported
        if content_type in CONTENT_PARSERS and content:
            return content_type, CONTENT_PARSERS[content_type](content, parser)
        else:
            logger.warning("Unsupported content type")
            return None, None
    except requests.RequestException as e:
        logger.error(f"Error fetching URL '{url}': {e}")
        return None, None


# # Function to change Tor IP address
# def renew_tor_ip():
#     with Controller.from_port(port=9051) as controller:
#         controller.authenticate()
#         controller.signal(Signal.NEWNYM)

# # Function to make requests through Tor
# def make_tor_request(url, max_retries=3):
#     renew_tor_ip()
#     headers = {"User-Agent": user_agent()}
#     session = requests.Session()
#     session.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}

#     for i in range(max_retries):
#         try:
#             response = session.get(url, headers=headers, timeout=10)
#             if response.status_code == 200:
#                 return response.text
#         except requests.exceptions.RequestException as e:
#             print(f"Error: {e}")
#         time.sleep(2)  # Add a delay between retries

#     return None


def find_links(url):
    links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
    content_type, soup = fetch_all(url)
    if soup:
        base_url = urlparse(url)
        links = soup.find_all("a", href=True, recursive=True)
        for link in links:
            link_href = link["href"]
            if not link_href.startswith("http"):
                link_href = urljoin(base_url.geturl(), link_href)
            cond_ex_ = all([i not in link_href for i in cond_ex])
            if cond_ex_:
                links_href.append(link_href)
        return list(set(links_href))
    elif url.split(".")[-1] in ["pdf"]:
        return url
    else:
        return None


def find_domain(links):
    if not links:
        return None
    domains = [urlparse(link).netloc for link in links]
    domain_counts = Counter(domains)
    if domain_counts.most_common(1):
        most_common_domain_tuple = domain_counts.most_common(1)[0]
        if most_common_domain_tuple:
            most_common_domain = most_common_domain_tuple[0]
            return most_common_domain
        else:
            return None
    else:
        return None


# To determine which links are related to target domains (e.g., pages) you are interested in
def filter_links(links, contains="html"):
    filtered_links = []
    if isinstance(contains, str):
        contains = [contains]
    if isinstance(links, str):
        links = find_links(links)
    for link in links:
        parsed_link = urlparse(link)
        condition = (all([i in link for i in contains])
                     and parsed_link.scheme != "javascript"  # skip javascript: pseudo-links
                     )
        if condition:
            filtered_links.append(link)
    return filtered_links


def pdf_detector(url, contains=None, dir_save=None, booster=False):
    def fname_pdf_corr(fname):
        # Make sure the saved filename ends with ".pdf"
        if fname[-4:] != ".pdf":
            fname = fname + ".pdf"
        return fname

    if isinstance(contains, str):
        contains = [contains]
    if isinstance(url, str):
        if ".pdf" in url:
            pdf_links = [url]
        else:
            if booster:
                links_all = []
                if "http" in url and url:
                    [links_all.append(i) for i in find_links(url) if "http" in i]
                print(links_all)
            else:
                links_all = url
            if contains is not None:
                pdf_links = filter_links(links=links_all, contains=[".pdf"] + contains)
            else:
                pdf_links = filter_links(links=links_all, contains=[".pdf"])
    elif isinstance(url, list):
        links_all = url
        if contains is not None:
            pdf_links = filter_links(links=links_all, contains=["pdf"] + contains)
        else:
            pdf_links = filter_links(links=links_all, contains=["pdf"])
    else:
        links_all = find_links(url)
        if contains is not None:
            pdf_links = filter_links(links=links_all, contains=["pdf"] + contains)
        else:
            pdf_links = filter_links(links=links_all, contains=["pdf"])

    if pdf_links:
        print(f"pdf detected\n{pdf_links}")
    else:
        print("no pdf file")
    if dir_save:
        print("... is trying to download to local")
        fnames = [pdf_link_.split("/")[-1] for pdf_link_ in pdf_links]
        idx = 0
        for pdf_link in pdf_links:
            headers = {"User-Agent": user_agent()}
            response = requests.get(pdf_link, headers=headers)
            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                # Save the PDF content to a file
                with open(os.path.join(dir_save, fname_pdf_corr(fnames[idx])), "wb") as pdf:
                    pdf.write(response.content)
                print("PDF downloaded successfully!")
            else:
                print("Failed to download PDF:", response.status_code)
            idx += 1
        print(f"{len(fnames)} files are downloaded:\n{fnames}\n to local: \n{dir_save}")


def find_img(url, dir_save="images"):
    """
    Save images referenced in HTML content locally.
    Args:
        url (str): URL of the webpage.
        dir_save (str): Directory to save images. Default is "images".
    Returns:
        BeautifulSoup: HTML content with updated image URLs pointing to local files.
    """
    content_type, content = fetch_all(url)
    if content_type and "html" in content_type.lower():
        # Create the directory if it doesn't exist
        os.makedirs(dir_save, exist_ok=True)

        # Parse HTML content if it's not already a BeautifulSoup object
        if isinstance(content, str):
            content = BeautifulSoup(content, "html.parser")
        image_links = []
        # Extracting images
        images = content.find_all("img", src=True)
        for i, image in enumerate(images):
            try:
                # Get the image URL
                image_url = image["src"]

                if image_url.startswith("data:image"):
                    # Extract the image data from the data URI
                    mime_type, base64_data = image_url.split(",", 1)
                    # Determine the file extension from the MIME type
                    if ":" in mime_type:
                        # image_extension = mime_type.split(":")[1].split(";")[0]
                        image_extension = (
                            mime_type.split(":")[1].split(";")[0].split("/")[-1]
                        )
                    else:
                        image_extension = (
                            "png"  # Default to PNG if extension is not specified
                        )
                    # if 'svg+xml' in image_extension:
                    #     image_extension='svg'
                    image_data = base64.b64decode(base64_data)
                    # Save the image data to a file
                    image_filename = os.path.join(
                        dir_save, f"image_{i}.{image_extension}"
                    )
                    with open(image_filename, "wb") as image_file:
                        image_file.write(image_data)

                    # Update the src attribute of the image tag to point to the local file
                    image["src"] = image_filename
                else:
                    # Construct the absolute image URL
                    absolute_image_url = urljoin(url, image_url)

                    # Parse the image URL to extract the file extension
                    parsed_url = urlparse(absolute_image_url)
                    image_extension = os.path.splitext(parsed_url.path)[1]

                    # Download the image
                    image_response = requests.get(absolute_image_url, proxies=proxies_glob)

                    # Save the image to a file
                    image_filename = os.path.join(
                        dir_save, f"image_{i}{image_extension}"
                    )
                    with open(image_filename, "wb") as image_file:
                        image_file.write(image_response.content)

                    # Update the src attribute of the image tag to point to the local file
                    image["src"] = image_filename
            except (requests.RequestException, KeyError) as e:
                print(f"Failed to process image {image_url}: {e}")
        print(f"images were saved at\n{dir_save}")
    # Return the HTML content with updated image URLs
    return content


def content_div_class(content, div="div", div_class="highlight"):
    texts = [tag.text for tag in content.find_all(div, class_=div_class)]
    return texts


def fetch_selenium(
    url,
    where="div",
    what=None,
    extend=False,
    by=By.TAG_NAME,
    timeout=10,
    retry=2,
    login_url=None,
    username=None,
    password=None,
    username_field="username",
    password_field="password",
    submit_field="submit",
    username_by=By.NAME,
    password_by=By.NAME,
    submit_by=By.NAME,
    proxy=None,  # Add proxy parameter
    **kwargs
):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f"user-agent={user_agent()}")
    if proxy:
        chrome_options.add_argument(f"--proxy-server={proxy}")

    service = Service(ChromeDriverManager().install())
    for attempt in range(retry):
        driver = None  # so the except-branch check below never hits an unbound name
        try:
            driver = webdriver.Chrome(service=service, options=chrome_options)
            if login_url:
                driver.get(login_url)
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((username_by, username_field))
                ).send_keys(username)
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located((password_by, password_field))
                ).send_keys(password)
                WebDriverWait(driver, timeout).until(
                    EC.element_to_be_clickable((submit_by, submit_field))
                ).click()

            driver.get(url)
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((by, where))
            )
            page_source = driver.page_source
            driver.quit()

            content = BeautifulSoup(page_source, "html.parser")
            texts = extract_text_from_content(content, where=where, what=what, extend=extend, **kwargs)
            return texts
        except Exception as e:
            # logger.error(f"Attempt {attempt + 1} failed with error ")
            if driver:
                driver.quit()
            if attempt == retry - 1:
                logger.error("Failed to fetch the content after all retries")
                return []
            sleep(random.uniform(1, 3))
    # Return empty list if nothing found after all retries
    return []


def fetch(url, where="div", what=None, extend=True, booster=False, retry=2, verbose=False, **kws):
    # for attempt in range(retry):
    #     if verbose and attempt==0:
    #         xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
    #         print(xample)
    #     content_type, content = fetch_all(url, parser="html.parser")
    #     texts=extract_text_from_content(content,content_type=content_type,where=where,what=what,extend=extend, **kws)
    #     if isinstance(texts,pd.core.frame.DataFrame):
    #         condition=[texts.empty, attempt != retry - 1]
    #     else:
    #         condition=[not texts, attempt != retry - 1]
    #     if all(condition):
    #         texts = fetch(url=url, where=where, what=what, extend=extend, retry=retry-1, **kws)
    #     sleep(random.uniform(0.5, 1.5))
    texts = []  # default so the fallback check below works even when retry <= 0
    for attempt in range(retry):
        if verbose and attempt == 0:
            xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
            print(xample)
        content_type, content = fetch_all(url, parser="html.parser")
        texts = extract_text_from_content(content, content_type=content_type, where=where, what=what, extend=extend, **kws)
        if isinstance(texts, pd.core.frame.DataFrame):
            # condition=[texts.empty, attempt != retry - 1]
            if not texts.empty:
                break
        else:
            # condition=[not texts, attempt != retry - 1]
            if texts:
                break
        # if all(condition):
        #     texts = fetch(url=url, where=where, what=what, extend=extend, retry=retry-1, **kws)
        sleep(random.uniform(0.5, 1.5))
    if isinstance(texts, pd.core.frame.DataFrame):
        condition_ = [texts.empty, booster]
    else:
        condition_ = [not texts, booster]
    if any(condition_):
        print("trying to use 'fetcher2'...")
        texts = fetch_selenium(url=url, where=where, what=what, extend=extend, **kws)
    return texts


def extract_from_content(content, where="div", what=None):
    if what is None:
        result_set = content.find_all(where, recursive=True)
        texts_ = " ".join(tag.get_text() + "\n" for tag in result_set)
        texts = [tx for tx in texts_.split("\n") if tx]
    else:
        texts_ = " ".join(
            div.get_text() + "\n" for div in content.find_all(where, class_=what, recursive=True)
        )
        texts = [tx for tx in texts_.split("\n") if tx]
    return texts


def find_forms(url):
    content_type, content = fetch_all(url)
    df = pd.DataFrame()
    # Extracting forms and inputs
    forms = content.find_all("form", recursive=True)
    form_data = []
    for form in forms:
        if form:
            form_inputs = form.find_all("input", recursive=True)
            input_data = {}
            for input_tag in form_inputs:
                input_type = input_tag.get("type")
                input_name = input_tag.get("name")
                input_value = input_tag.get("value")
                input_data[input_name] = {"type": input_type, "value": input_value}
            form_data.append(input_data)
    return form_data


# to clean strings
def clean_string(value):
    if isinstance(value, str):
        return value.replace("\n", "").replace("\r", "").replace("\t", "")
    else:
        return value


def find_all(url, dir_save=None):
    content_type, content = fetch_all(url)
    paragraphs_text = extract_from_content(content, where="p")
    # Extracting specific elements by class
    specific_elements_text = [
        element.text for element in content.find_all(class_="specific-class", recursive=True) if element
    ]
    # Extracting links (anchor tags)
    links_href = find_links(url)
    links_href = filter_links(links_href)

    # Extracting images
    images_src = [image["src"] for image in content.find_all("img", src=True, recursive=True) if image]

    # Extracting headings (h1, h2, h3, etc.)
    headings = [f"h{i}" for i in range(1, 7)]
    headings_text = {
        heading: [tag.text for tag in content.find_all(heading, recursive=True)]
        for heading in headings
        if heading
    }

    # Extracting lists (ul, ol, li)
    list_items_text = [
        item.text
        for list_ in content.find_all(["ul", "ol"], recursive=True)
        for item in list_.find_all("li", recursive=True)
        if item
    ]

    # Extracting tables (table, tr, td)
    table_cells_text = [
        cell.text
        for table in content.find_all("table", recursive=True)
        for row in table.find_all("tr")
        for cell in row.find_all("td")
        if cell
    ]

    # Extracting other elements
    divs_content = extract_from_content(content, where="div")
    headers_footer_content = [
        tag.text for tag in content.find_all(["header", "footer"], recursive=True) if tag
    ]
    meta_tags_content = [
        (tag.name, tag.attrs) for tag in content.find_all("meta", recursive=True) if tag
    ]
    spans_content = extract_from_content(content, where="span")
    bold_text_content = extract_from_content(content, where="b")
    italic_text_content = extract_from_content(content, where="i")
    code_snippets_content = extract_from_content(content, where="code")
    blockquotes_content = extract_from_content(content, where="blockquote")
    preformatted_text_content = extract_from_content(content, where="pre")
    buttons_content = extract_from_content(content, where="button")
    navs_content = extract_from_content(content, where="nav")
    sections_content = extract_from_content(content, where="section")
    articles_content = extract_from_content(content, where="article")
    figures_content = extract_from_content(content, where="figure")
    captions_content = extract_from_content(content, where="figcaption")
    abbreviations_content = extract_from_content(content, where="abbr")
    definitions_content = extract_from_content(content, where="dfn")
    addresses_content = extract_from_content(content, where="address")
    time_elements_content = extract_from_content(content, where="time")
    progress_content = extract_from_content(content, where="progress")
    forms = find_forms(url)

    lists_to_fill = [
        paragraphs_text,
        specific_elements_text,
        links_href,
        images_src,
        headings_text["h1"],
        headings_text["h2"],
        headings_text["h3"],
        headings_text["h4"],
        headings_text["h5"],
        headings_text["h6"],
        list_items_text,
        table_cells_text,
        divs_content,
        headers_footer_content,
        meta_tags_content,
        spans_content,
        bold_text_content,
        italic_text_content,
        code_snippets_content,
        blockquotes_content,
        preformatted_text_content,
        buttons_content,
        navs_content,
        sections_content,
        articles_content,
        figures_content,
        captions_content,
        abbreviations_content,
        definitions_content,
        addresses_content,
        time_elements_content,
        progress_content,
        forms,
    ]
    # add new features
    script_texts = content_div_class(content, div="div", div_class="highlight")
    lists_to_fill.append(script_texts)

    audio_src = [audio["src"] for audio in content.find_all("audio", src=True, recursive=True)]
    video_src = [video["src"] for video in content.find_all("video", src=True, recursive=True)]
    iframe_src = [iframe["src"] for iframe in content.find_all("iframe", src=True, recursive=True)]
    lists_to_fill.extend([audio_src, video_src, iframe_src])

    rss_links = [
        link["href"]
        for link in content.find_all(
            "link", type=["application/rss+xml", "application/atom+xml"], recursive=True
        )
    ]
    lists_to_fill.append(rss_links)

    # Find the maximum length among all lists
    max_length = max(len(lst) for lst in lists_to_fill)

    # Fill missing data with empty strings for each list
    for lst in lists_to_fill:
        lst += [""] * (max_length - len(lst))

    # Create DataFrame
    df = pd.DataFrame(
        {
            "h1": headings_text["h1"],
            "h2": headings_text["h2"],
            "h3": headings_text["h3"],
            "h4": headings_text["h4"],
            "h5": headings_text["h5"],
            "h6": headings_text["h6"],
            "paragraphs": paragraphs_text,
            "divs": divs_content,
            "items": list_items_text,
            "tables": table_cells_text,
            "headers": headers_footer_content,
            "tags": meta_tags_content,
            "spans": spans_content,
            "bold_text": bold_text_content,
            "italic_text": italic_text_content,
            "codes": code_snippets_content,
            "blocks": blockquotes_content,
            "preformatted_text": preformatted_text_content,
            "buttons": buttons_content,
            "navs": navs_content,
            "sections": sections_content,
            "articles": articles_content,
            "figures": figures_content,
            "captions": captions_content,
            "abbreviations": abbreviations_content,
            "definitions": definitions_content,
            "addresses": addresses_content,
            "time_elements": time_elements_content,
            "progress": progress_content,
            "specific_elements": specific_elements_text,
            "forms": forms,
            "scripts": script_texts,
            "audio": audio_src,
            "video": video_src,
            "iframe": iframe_src,
            "rss": rss_links,
            "images": images_src,
            "links": links_href,
        }
    )
    # to remove the '\n\t\r'
    df = df.apply(
        lambda x: x.map(clean_string) if x.dtype == "object" else x
    )  # df=df.applymap(clean_string)
    if dir_save:
        if not dir_save.endswith(".csv"):
            dir_save = dir_save + "_df.csv"
        df.to_csv(dir_save)
        print(f"file has been saved at\n{dir_save}")
    return df
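
A minimal usage sketch of the module's main entry points, assuming it is importable as py2ls.netfinder (matching the py2ls/netfinder.py path above) and that its third-party dependencies (requests, beautifulsoup4, pandas, fake-useragent, selenium, webdriver-manager, lxml) are installed; the URL and the save directory below are hypothetical placeholders, not values from the package:

# Illustrative sketch only; "https://example.com" and "downloads/" are placeholders.
from py2ls import netfinder

url = "https://example.com"

paragraphs = netfinder.fetch(url, where="p")        # text extraction via requests + BeautifulSoup
links = netfinder.find_links(url)                   # absolute, de-duplicated hrefs on the page
df = netfinder.find_all(url, dir_save=None)         # pandas DataFrame of headings, links, images, forms, ...
netfinder.pdf_detector(url, dir_save="downloads/")  # download any linked PDFs, if present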