py2ls-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. py2ls/.git/COMMIT_EDITMSG +1 -0
  2. py2ls/.git/FETCH_HEAD +1 -0
  3. py2ls/.git/HEAD +1 -0
  4. py2ls/.git/config +15 -0
  5. py2ls/.git/description +1 -0
  6. py2ls/.git/hooks/applypatch-msg.sample +15 -0
  7. py2ls/.git/hooks/commit-msg.sample +24 -0
  8. py2ls/.git/hooks/fsmonitor-watchman.sample +174 -0
  9. py2ls/.git/hooks/post-update.sample +8 -0
  10. py2ls/.git/hooks/pre-applypatch.sample +14 -0
  11. py2ls/.git/hooks/pre-commit.sample +49 -0
  12. py2ls/.git/hooks/pre-merge-commit.sample +13 -0
  13. py2ls/.git/hooks/pre-push.sample +53 -0
  14. py2ls/.git/hooks/pre-rebase.sample +169 -0
  15. py2ls/.git/hooks/pre-receive.sample +24 -0
  16. py2ls/.git/hooks/prepare-commit-msg.sample +42 -0
  17. py2ls/.git/hooks/push-to-checkout.sample +78 -0
  18. py2ls/.git/hooks/update.sample +128 -0
  19. py2ls/.git/index +0 -0
  20. py2ls/.git/info/exclude +6 -0
  21. py2ls/.git/logs/HEAD +1 -0
  22. py2ls/.git/logs/refs/heads/main +1 -0
  23. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  24. py2ls/.git/logs/refs/remotes/origin/main +1 -0
  25. py2ls/.git/objects/25/b796accd261b9135fd32a2c00785f68edf6c46 +0 -0
  26. py2ls/.git/objects/36/b4a1b7403abc6c360f8fe2cb656ab945254971 +0 -0
  27. py2ls/.git/objects/3f/d6561300938afbb3d11976cf9c8f29549280d9 +0 -0
  28. py2ls/.git/objects/58/20a729045d4dc7e37ccaf8aa8eec126850afe2 +0 -0
  29. py2ls/.git/objects/60/f273eb1c412d916fa3f11318a7da7a9911b52a +0 -0
  30. py2ls/.git/objects/61/570cec8c061abe74121f27f5face6c69b98f99 +0 -0
  31. py2ls/.git/objects/69/13c452ca319f7cbf6a0836dc10a5bb033c84e4 +0 -0
  32. py2ls/.git/objects/78/3d4167bc95c9d2175e0df03ef1c1c880ba75ab +0 -0
  33. py2ls/.git/objects/79/7ae089b2212a937840e215276005ce76881307 +0 -0
  34. py2ls/.git/objects/7e/5956c806b5edc344d46dab599dec337891ba1f +1 -0
  35. py2ls/.git/objects/8e/55a7d2b96184030211f20c9b9af201eefcac82 +0 -0
  36. py2ls/.git/objects/91/c69ad88fe0ba94aa7859fb5f7edac5e6f1a3f7 +0 -0
  37. py2ls/.git/objects/b0/56be4be89ba6b76949dd641df45bb7036050c8 +0 -0
  38. py2ls/.git/objects/b0/9cd7856d58590578ee1a4f3ad45d1310a97f87 +0 -0
  39. py2ls/.git/objects/d9/005f2cc7fc4e65f14ed5518276007c08cf2fd0 +0 -0
  40. py2ls/.git/objects/df/e0770424b2a19faf507a501ebfc23be8f54e7b +0 -0
  41. py2ls/.git/objects/e9/391ffe371f1cc43b42ef09b705d9c767c2e14f +0 -0
  42. py2ls/.git/objects/fc/292e793ecfd42240ac43be407023bd731fa9e7 +0 -0
  43. py2ls/.git/refs/heads/main +1 -0
  44. py2ls/.git/refs/remotes/origin/HEAD +1 -0
  45. py2ls/.git/refs/remotes/origin/main +1 -0
  46. py2ls/.gitattributes +2 -0
  47. py2ls/.gitignore +152 -0
  48. py2ls/LICENSE +201 -0
  49. py2ls/README.md +409 -0
  50. py2ls/__init__.py +17 -0
  51. py2ls/brain_atlas.py +145 -0
  52. py2ls/correlators.py +475 -0
  53. py2ls/dbhandler.py +97 -0
  54. py2ls/freqanalysis.py +800 -0
  55. py2ls/internet_finder.py +405 -0
  56. py2ls/ips.py +2844 -0
  57. py2ls/netfinder.py +780 -0
  58. py2ls/sleep_events_detectors.py +1350 -0
  59. py2ls/translator.py +686 -0
  60. py2ls/version.py +1 -0
  61. py2ls/wb_detector.py +169 -0
  62. py2ls-0.1.0.dist-info/METADATA +12 -0
  63. py2ls-0.1.0.dist-info/RECORD +64 -0
  64. py2ls-0.1.0.dist-info/WHEEL +4 -0
py2ls/netfinder.py ADDED
@@ -0,0 +1,780 @@
+ from bs4 import BeautifulSoup
+ import requests
+ import os
+ from urllib.parse import urlparse, urljoin
+ import base64
+ import pandas as pd
+ from collections import Counter
+ import random
+ import logging
+ from time import sleep
+ import stem.process
+ from stem import Signal
+ from stem.control import Controller
+ import json
+ from fake_useragent import UserAgent
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from webdriver_manager.chrome import ChromeDriverManager
+
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+ # Suppress WDM INFO logs
+ logging.getLogger('WDM').setLevel(logging.WARNING)
+ proxies_glob = None
+
+ # Define supported content types and corresponding parsers
+ CONTENT_PARSERS = {
+     "text/html": lambda text, parser: BeautifulSoup(text, parser),
+     "application/json": lambda text, parser: json.loads(text),
+     "text/xml": lambda text, parser: BeautifulSoup(text, parser),
+     "text/plain": lambda text, parser: text,  # already a decoded str; returning it directly avoids an AttributeError
+ }
+
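+ # user_agent(): return a random User-Agent string via fake_useragent; the request helpers below use it to build headers.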
+ def user_agent(browsers=["chrome", "edge", "firefox", "safari"], platforms=["pc", "tablet"], verbose=False, os=["windows", "macos", "linux"]):
+     ua = UserAgent(browsers=browsers, platforms=platforms, os=os)
+     output_ua = ua.random
+     if verbose:
+         print(output_ua)
+     return output_ua
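+ # (Two earlier, commented-out drafts of extract_text_from_content follow; the active implementation comes after them.)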
+ # def extract_text_from_content(content,where,what,extend=False):
+ #     if extend:
+ #         texts = ""
+
+ #         def extract_text(element):
+ #             nonlocal texts
+ #             if isinstance(element, str) and element.strip():
+ #                 texts += element.strip()
+ #             elif hasattr(element, "children"):
+ #                 for child in element.children:
+ #                     extract_text(child)
+
+ #         result_set = (
+ #             content.find_all(where, class_=what)
+ #             if what
+ #             else content.find_all(where)
+ #         )
+ #         for tag in result_set:
+ #             extract_text(tag)
+
+ #         text = [tx for tx in texts.split("\n") if tx]
+ #         return text
+ #     else:
+ #         result_set = (
+ #             content.find_all(where, class_=what)
+ #             if what
+ #             else content.find_all(where)
+ #         )
+ #         texts_ = " ".join(tag.get_text() + "\n" for tag in result_set)
+ #         texts = [tx.strip() for tx in texts_.split("\n") if tx]
+ #         return texts
+ # def extract_text_from_content(content, where, what=None, extend=True):
+ #     if extend:
+ #         def extract_text(element):
+ #             texts = ""
+ #             if isinstance(element, str) and element.strip():
+ #                 texts += element.strip()
+ #             elif hasattr(element, "children"):
+ #                 for child in element.children:
+ #                     texts += extract_text(child)
+ #             return texts
+
+ #         result_set = content.find_all(where, class_=what) if what else content.find_all(where)
+ #         texts = ""
+ #         for tag in result_set:
+ #             texts += extract_text(tag) + "\n"
+ #         text_list = [tx.strip() for tx in texts.split("\n") if tx.strip()]
+ #         return text_list
+ #     else:
+ #         result_set = content.find_all(where, class_=what) if what else content.find_all(where)
+ #         texts_ = " ".join(tag.get_text() for tag in result_set)
+ #         texts = [tx.strip() for tx in texts_.split("\n") if tx.strip()]
+ #         return texts
+ def extract_text_from_content(content, content_type="text/html", where=None, what=None, extend=True, **kwargs):
+     if content is None:
+         logger.error("Content is None, cannot extract text.")
+         return []
+
+     if content_type not in CONTENT_PARSERS:
+         logger.error(f"Unsupported content type: {content_type}")
+         return []
+
+     if "json" in content_type:
+         where = None
+         return extract_text_from_json(content, where)
+     elif 'text' in content_type:
+         if extend:
+             def extract_text(element):
+                 texts = ""
+                 if isinstance(element, str) and element.strip():
+                     texts += element.strip()
+                 elif hasattr(element, "children"):
+                     for child in element.children:
+                         texts += extract_text(child)
+                 return texts
+
+             search_kwargs = {**kwargs}
+             if what:
+                 search_kwargs["class_"] = what
+
+             result_set = content.find_all(where, **search_kwargs)
+             texts = ""
+             for tag in result_set:
+                 texts += extract_text(tag) + "\n"
+             text_list = [tx.strip() for tx in texts.split("\n") if tx.strip()]
+             return text_list
+         else:
+             search_kwargs = {**kwargs}
+             if what:
+                 search_kwargs["class_"] = what
+
+             result_set = content.find_all(where, **search_kwargs)
+             texts_ = " ".join(tag.get_text() for tag in result_set)
+             texts = [tx.strip() for tx in texts_.split("\n") if tx.strip()]
+             return texts
+
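+ # extract_text_from_json()/flatten_json(): pull string values out of parsed JSON, either for one key or for every leaf value after flattening nested dicts/lists.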
+ def extract_text_from_json(content, key=None):
+     if key:
+         if isinstance(content, list):
+             return [str(item.get(key, '')) for item in content if key in item]
+         if isinstance(content, dict):
+             return [str(content.get(key, ''))]
+         return []  # key given but content is neither a list nor a dict
+     else:
+         return [str(value) for _, value in flatten_json(content).items()]
+
+ def flatten_json(y):
+     out = {}
+     def flatten(x, name=''):
+         if isinstance(x, dict):
+             for a in x:
+                 flatten(x[a], name + a + '_')
+         elif isinstance(x, list):
+             i = 0
+             for a in x:
+                 flatten(a, name + str(i) + '_')
+                 i += 1
+         else:
+             out[name[:-1]] = x
+     flatten(y)
+     return out
+
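+ # get_proxy(): scrape IP:port pairs from free-proxy-list.net and return two of them as a requests-style proxies dict.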
+ def get_proxy():
+     list_ = []
+     headers = {"User-Agent": user_agent()}
+     response = requests.get("https://free-proxy-list.net", headers=headers)
+     content = BeautifulSoup(response.content, "html.parser")
+     info = extract_text_from_content(content, where="td", extend=0)[0].split()
+     count, pair_proxy = 0, 2
+     for i, j in enumerate(info):
+         if "." in j:
+             list_.append(j + ":" + info[i + 1])
+             # list_.append() # Assuming the next item is the value
+             count += 1
+             # if count == pair_proxy:  # Stop after extracting the desired number of pairs
+             #     break
+     prox = random.sample(list_, 2)
+     proxies = {
+         "http": "http://" + prox[0],
+         "https": "http://" + prox[1],
+     }
+     return proxies
+ # proxies_glob=get_proxy()
+
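+ # fetch_all(): GET a URL with a random User-Agent (optionally through proxies_glob), follow redirects and retry on 403, then parse the body with the matching CONTENT_PARSERS entry; returns (content_type, parsed_content).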
+ def fetch_all(url, parser="lxml"):  # lxml is faster, # parser="html.parser"
+     try:
+         # Generate a random user-agent string
+         headers = {"User-Agent": user_agent()}
+
+         # Send the initial request
+         response = requests.get(url, headers=headers, proxies=proxies_glob)
+
+         # If the response is a redirect, follow it
+         while response.is_redirect:
+             logger.info(f"Redirecting to: {response.headers['Location']}")
+             response = requests.get(response.headers["Location"], headers=headers, proxies=proxies_glob)
+         # Check for a 403 error
+         if response.status_code == 403:
+             logger.warning("403 Forbidden error. Retrying...")
+             # Retry the request after a short delay
+             sleep(random.uniform(1, 3))
+             response = requests.get(url, headers=headers, proxies=proxies_glob)
+             # Raise an error if retry also fails
+             response.raise_for_status()
+
+         # Raise an error for other HTTP status codes
+         response.raise_for_status()
+
+         # Get the content type
+         content_type = response.headers.get("content-type", "").split(";")[0].lower()
+         if response.encoding:
+             content = response.content.decode(response.encoding)
+         else:
+             content = None
+         # logger.info(f"Content type: {content_type}")
+
+         # Check if content type is supported
+         if content_type in CONTENT_PARSERS and content:
+             return content_type, CONTENT_PARSERS[content_type](content, parser)
+         else:
+             logger.warning("Unsupported content type")
+             return None, None
+     except requests.RequestException as e:
+         logger.error(f"Error fetching URL '{url}': {e}")
+         return None, None
+
+
+
+ # # Function to change Tor IP address
+ # def renew_tor_ip():
+ #     with Controller.from_port(port=9051) as controller:
+ #         controller.authenticate()
+ #         controller.signal(Signal.NEWNYM)
+
+ # # Function to make requests through Tor
+ # def make_tor_request(url, max_retries=3):
+ #     renew_tor_ip()
+ #     headers = {"User-Agent": user_agent()}
+ #     session = requests.Session()
+ #     session.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}
+
+ #     for i in range(max_retries):
+ #         try:
+ #             response = session.get(url, headers=headers, timeout=10)
+ #             if response.status_code == 200:
+ #                 return response.text
+ #         except requests.exceptions.RequestException as e:
+ #             print(f"Error: {e}")
+ #             time.sleep(2)  # Add a delay between retries
+
+ #     return None
+
+
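+ # find_links(): return the de-duplicated absolute hrefs on a page, skipping javascript:/mailto:/tel:/fax: links; if the page cannot be parsed and the URL ends in .pdf, the URL itself is returned.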
+ def find_links(url):
+     links_href, cond_ex = [], ["javascript:", "mailto:", "tel:", "fax:"]
+     content_type, soup = fetch_all(url)
+     if soup:
+         base_url = urlparse(url)
+         links = soup.find_all("a", href=True, recursive=True)
+         for link in links:
+             link_href = link["href"]
+             if not link_href.startswith(("http")):
+                 link_href = urljoin(base_url.geturl(), link_href)
+             cond_ex_ = all([i not in link_href for i in cond_ex])
+             if cond_ex_:
+                 links_href.append(link_href)
+         return list(set(links_href))
+     elif url.split('.')[-1] in ['pdf']:
+         return url
+     else:
+         return None
+
+
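+ # find_domain(): return the most common netloc among a list of links, or None if the list is empty.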
+ def find_domain(links):
+     if not links:
+         return None
+     domains = [urlparse(link).netloc for link in links]
+     domain_counts = Counter(domains)
+     if domain_counts.most_common(1):
+         most_common_domain_tuple = domain_counts.most_common(1)[0]
+         if most_common_domain_tuple:
+             most_common_domain = most_common_domain_tuple[0]
+             return most_common_domain
+         else:
+             return None
+     else:
+         return None
+
+
+ # Keep only the links that contain all of the given substrings (e.g. pages of the target domain you are interested in)
+ def filter_links(links, contains="html"):
+     filtered_links = []
+     if isinstance(contains, str):
+         contains = [contains]
+     if isinstance(links, str):
+         links = find_links(links)
+     for link in links:
+         parsed_link = urlparse(link)
+         condition = (all([i in link for i in contains])
+                      and parsed_link.scheme != "javascript")
+         if condition:
+             filtered_links.append(link)
+     return filtered_links
+
+
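+ # pdf_detector(): find .pdf links on a page (or in a given list of links) and, if dir_save is set, download them; booster=True first expands the URL into all of its outgoing links.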
+ def pdf_detector(url, contains=None, dir_save=None, booster=False):
+     def fname_pdf_corr(fname):
+         if fname[-4:] != '.pdf':
+             fname = fname[:-4] + '.pdf'
+         return fname
+     if isinstance(contains, str):
+         contains = [contains]
+     if isinstance(url, str):
+         if '.pdf' in url:
+             pdf_links = [url]  # the URL itself already points to a PDF
+         else:
+             if booster:
+                 links_all = []
+                 if 'http' in url and url:
+                     [links_all.append(i) for i in find_links(url) if 'http' in i]
+                 print(links_all)
+             else:
+                 links_all = url
+             if contains is not None:
+                 pdf_links = filter_links(links=links_all, contains=[".pdf"] + contains)
+             else:
+                 pdf_links = filter_links(links=links_all, contains=[".pdf"])
+     elif isinstance(url, list):
+         links_all = url
+         if contains is not None:
+             pdf_links = filter_links(links=links_all, contains=["pdf"] + contains)
+         else:
+             pdf_links = filter_links(links=links_all, contains=["pdf"])
+     else:
+         links_all = find_links(url)
+         if contains is not None:
+             pdf_links = filter_links(links=links_all, contains=["pdf"] + contains)
+         else:
+             pdf_links = filter_links(links=links_all, contains=["pdf"])
+
+     if pdf_links:
+         print(f"pdf detected\n{pdf_links}")
+     else:
+         print('no pdf file found')
+     if dir_save:
+         print("... trying to download to local disk")
+         fnames = [pdf_link_.split("/")[-1] for pdf_link_ in pdf_links]
+         idx = 0
+         for pdf_link in pdf_links:
+             headers = {"User-Agent": user_agent()}
+             response = requests.get(pdf_link, headers=headers)
+             # Check if the request was successful (status code 200)
+             if response.status_code == 200:
+                 # Save the PDF content to a file
+                 with open(os.path.join(dir_save, fname_pdf_corr(fnames[idx])), "wb") as pdf:
+                     pdf.write(response.content)
+                 print("PDF downloaded successfully!")
+             else:
+                 print("Failed to download PDF:", response.status_code)
+             idx += 1
+         print(f'{len(fnames)} files were downloaded:\n{fnames}\nto local folder:\n{dir_save}')
+
+
+ def find_img(url, dir_save="images"):
+     """
+     Save images referenced on a webpage locally.
+     Args:
+         url (str): URL of the webpage.
+         dir_save (str): Directory to save images. Default is "images".
+     Returns:
+         BeautifulSoup: HTML content with image URLs updated to point to the local files.
+     """
+     content_type, content = fetch_all(url)
+     if content_type and "html" in content_type.lower():
+         # Create the directory if it doesn't exist
+         os.makedirs(dir_save, exist_ok=True)
+
+         # Parse HTML content if it's not already a BeautifulSoup object
+         if isinstance(content, str):
+             content = BeautifulSoup(content, "html.parser")
+         image_links = []
+         # Extracting images
+         images = content.find_all("img", src=True)
+         for i, image in enumerate(images):
+             try:
+                 # Get the image URL
+                 image_url = image["src"]
+
+                 if image_url.startswith("data:image"):
+                     # Extract the image data from the data URI
+                     mime_type, base64_data = image_url.split(",", 1)
+                     # Determine the file extension from the MIME type
+                     if ":" in mime_type:
+                         # image_extension = mime_type.split(":")[1].split(";")[0]
+                         image_extension = (
+                             mime_type.split(":")[1].split(";")[0].split("/")[-1]
+                         )
+                     else:
+                         image_extension = (
+                             "png"  # Default to PNG if extension is not specified
+                         )
+                     # if 'svg+xml' in image_extension:
+                     #     image_extension = 'svg'
+                     image_data = base64.b64decode(base64_data)
+                     # Save the image data to a file
+                     image_filename = os.path.join(
+                         dir_save, f"image_{i}.{image_extension}"
+                     )
+                     with open(image_filename, "wb") as image_file:
+                         image_file.write(image_data)
+
+                     # Update the src attribute of the image tag to point to the local file
+                     image["src"] = image_filename
+                 else:
+                     # Construct the absolute image URL
+                     absolute_image_url = urljoin(url, image_url)
+
+                     # Parse the image URL to extract the file extension
+                     parsed_url = urlparse(absolute_image_url)
+                     image_extension = os.path.splitext(parsed_url.path)[1]
+
+                     # Download the image
+                     image_response = requests.get(absolute_image_url, proxies=proxies_glob)
+
+                     # Save the image to a file
+                     image_filename = os.path.join(
+                         dir_save, f"image_{i}{image_extension}"
+                     )
+                     with open(image_filename, "wb") as image_file:
+                         image_file.write(image_response.content)
+
+                     # Update the src attribute of the image tag to point to the local file
+                     image["src"] = image_filename
+             except (requests.RequestException, KeyError) as e:
+                 print(f"Failed to process image {image_url}: {e}")
+         print(f"images were saved at\n{dir_save}")
+     # Return the HTML content with updated image URLs
+     return content
+
+
+ def content_div_class(content, div="div", div_class="highlight"):
+     texts = [div.text for div in content.find_all(div, class_=div_class)]
+     return texts
+
+
+
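+ # fetch_selenium(): headless-Chrome fallback (driver installed via webdriver_manager) for pages that need JavaScript or a login before the text can be extracted.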
+ def fetch_selenium(
+     url,
+     where="div",
+     what=None,
+     extend=False,
+     by=By.TAG_NAME,
+     timeout=10,
+     retry=2,
+     login_url=None,
+     username=None,
+     password=None,
+     username_field="username",
+     password_field="password",
+     submit_field="submit",
+     username_by=By.NAME,
+     password_by=By.NAME,
+     submit_by=By.NAME,
+     proxy=None,  # Add proxy parameter
+     **kwargs
+ ):
+     chrome_options = Options()
+     chrome_options.add_argument("--headless")
+     chrome_options.add_argument("--no-sandbox")
+     chrome_options.add_argument("--disable-dev-shm-usage")
+     chrome_options.add_argument(f"user-agent={user_agent()}")
+     if proxy:
+         chrome_options.add_argument(f'--proxy-server={proxy}')
+
+     service = Service(ChromeDriverManager().install())
+     for attempt in range(retry):
+         driver = None  # so the except branch can safely check whether a driver was created
+         try:
+             driver = webdriver.Chrome(service=service, options=chrome_options)
+             if login_url:
+                 driver.get(login_url)
+                 WebDriverWait(driver, timeout).until(
+                     EC.presence_of_element_located((username_by, username_field))
+                 ).send_keys(username)
+                 WebDriverWait(driver, timeout).until(
+                     EC.presence_of_element_located((password_by, password_field))
+                 ).send_keys(password)
+                 WebDriverWait(driver, timeout).until(
+                     EC.element_to_be_clickable((submit_by, submit_field))
+                 ).click()
+
+             driver.get(url)
+             WebDriverWait(driver, timeout).until(
+                 EC.presence_of_element_located((by, where))
+             )
+             page_source = driver.page_source
+             driver.quit()
+
+             content = BeautifulSoup(page_source, "html.parser")
+             texts = extract_text_from_content(content, where=where, what=what, extend=extend, **kwargs)
+             return texts
+         except Exception as e:
+             # logger.error(f"Attempt {attempt + 1} failed with error ")
+             if driver:
+                 driver.quit()
+             if attempt == retry - 1:
+                 logger.error("Failed to fetch the content after all retries")
+                 return []
+             sleep(random.uniform(1, 3))
+     # Return empty list if nothing found after all retries
+     return []
+
+
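+ # fetch(): requests-based scrape with retries; falls back to fetch_selenium() when nothing is found or when booster=True.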
+ def fetch(url, where="div", what=None, extend=True, booster=False, retry=2, verbose=False, **kws):
+     # for attempt in range(retry):
+     #     if verbose and attempt == 0:
+     #         xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
+     #         print(xample)
+     #     content_type, content = fetch_all(url, parser="html.parser")
+     #     texts = extract_text_from_content(content, content_type=content_type, where=where, what=what, extend=extend, **kws)
+     #     if isinstance(texts, pd.core.frame.DataFrame):
+     #         condition = [texts.empty, attempt != retry - 1]
+     #     else:
+     #         condition = [not texts, attempt != retry - 1]
+     #     if all(condition):
+     #         texts = fetch(url=url, where=where, what=what, extend=extend, retry=retry-1, **kws)
+     #     sleep(random.uniform(0.5, 1.5))
+     for attempt in range(retry):
+         if verbose and attempt == 0:
+             xample = 'fetch(url,where="div",what=None,extend=True,by=By.TAG_NAME,timeout=10,retry=3,login_url=None,username=None,password=None,username_field="username",password_field="password",submit_field="submit",username_by=By.NAME,password_by=By.NAME,submit_by=By.NAME)'
+             print(xample)
+         content_type, content = fetch_all(url, parser="html.parser")
+         texts = extract_text_from_content(content, content_type=content_type, where=where, what=what, extend=extend, **kws)
+         if isinstance(texts, pd.core.frame.DataFrame):
+             # condition = [texts.empty, attempt != retry - 1]
+             if not texts.empty:
+                 break
+         else:
+             # condition = [not texts, attempt != retry - 1]
+             if texts:
+                 break
+         # if all(condition):
+         #     texts = fetch(url=url, where=where, what=what, extend=extend, retry=retry-1, **kws)
+         sleep(random.uniform(0.5, 1.5))
+     if isinstance(texts, pd.core.frame.DataFrame):
+         condition_ = [texts.empty, booster]
+     else:
+         condition_ = [not texts, booster]
+     if any(condition_):
+         print("trying to use 'fetch_selenium'...")
+         texts = fetch_selenium(url=url, where=where, what=what, extend=extend, **kws)
+     return texts
+
+
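+ # extract_from_content(): simple get_text() extraction for one tag (and optional class) from already-parsed HTML.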
+ def extract_from_content(content, where="div", what=None):
+     if what is None:
+         result_set = content.find_all(where, recursive=True)
+         texts_ = " ".join(tag.get_text() + "\n" for tag in result_set)
+         texts = [tx for tx in texts_.split("\n") if tx]
+     else:
+         texts_ = " ".join(
+             div.get_text() + "\n" for div in content.find_all(where, class_=what, recursive=True)
+         )
+         texts = [tx for tx in texts_.split("\n") if tx]
+     return texts
+
+
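+ # find_forms(): collect every <form> on the page as a dict of its <input> fields ({name: {"type": ..., "value": ...}}).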
+ def find_forms(url):
+     content_type, content = fetch_all(url)
+     df = pd.DataFrame()
+     # Extracting forms and inputs
+     forms = content.find_all("form", recursive=True)
+     form_data = []
+     for form in forms:
+         if form:
+             form_inputs = form.find_all("input", recursive=True)
+             input_data = {}
+             for input_tag in form_inputs:
+                 input_type = input_tag.get("type")
+                 input_name = input_tag.get("name")
+                 input_value = input_tag.get("value")
+                 input_data[input_name] = {"type": input_type, "value": input_value}
+             form_data.append(input_data)
+     return form_data
+
+
+ # to clean strings
+ def clean_string(value):
+     if isinstance(value, str):
+         return value.replace("\n", "").replace("\r", "").replace("\t", "")
+     else:
+         return value
+
+
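+ # find_all(): scrape one page into a DataFrame with one column per element type (headings, paragraphs, links, images, forms, ...), padding shorter columns with empty strings; optionally save it as CSV.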
+ def find_all(url, dir_save=None):
+     content_type, content = fetch_all(url)
+     paragraphs_text = extract_from_content(content, where="p")
+     # Extracting specific elements by class
+     specific_elements_text = [
+         element.text for element in content.find_all(class_="specific-class", recursive=True) if element
+     ]
+     # Extracting links (anchor tags)
+     links_href = find_links(url)
+     links_href = filter_links(links_href)
+
+     # Extracting images
+     images_src = [image["src"] for image in content.find_all("img", src=True, recursive=True) if image]
+
+     # Extracting headings (h1, h2, h3, etc.)
+     headings = [f"h{i}" for i in range(1, 7)]
+     headings_text = {
+         heading: [tag.text for tag in content.find_all(heading, recursive=True)]
+         for heading in headings
+         if heading
+     }
+
+     # Extracting lists (ul, ol, li)
+     list_items_text = [
+         item.text
+         for list_ in content.find_all(["ul", "ol"], recursive=True)
+         for item in list_.find_all("li", recursive=True)
+         if item
+     ]
+
+     # Extracting tables (table, tr, td)
+     table_cells_text = [
+         cell.text
+         for table in content.find_all("table", recursive=True)
+         for row in table.find_all("tr")
+         for cell in row.find_all("td")
+         if cell
+     ]
+
+     # Extracting other elements
+     divs_content = extract_from_content(content, where="div")
+     headers_footer_content = [
+         tag.text for tag in content.find_all(["header", "footer"], recursive=True) if tag
+     ]
+     meta_tags_content = [
+         (tag.name, tag.attrs) for tag in content.find_all("meta", recursive=True) if tag
+     ]
+     spans_content = extract_from_content(content, where="span")
+     bold_text_content = extract_from_content(content, where="b")
+     italic_text_content = extract_from_content(content, where="i")
+     code_snippets_content = extract_from_content(content, where="code")
+     blockquotes_content = extract_from_content(content, where="blockquote")
+     preformatted_text_content = extract_from_content(content, where="pre")
+     buttons_content = extract_from_content(content, where="button")
+     navs_content = extract_from_content(content, where="nav")
+     sections_content = extract_from_content(content, where="section")
+     articles_content = extract_from_content(content, where="article")
+     figures_content = extract_from_content(content, where="figure")
+     captions_content = extract_from_content(content, where="figcaption")
+     abbreviations_content = extract_from_content(content, where="abbr")
+     definitions_content = extract_from_content(content, where="dfn")
+     addresses_content = extract_from_content(content, where="address")
+     time_elements_content = extract_from_content(content, where="time")
+     progress_content = extract_from_content(content, where="progress")
+     forms = find_forms(url)
+
+     lists_to_fill = [
+         paragraphs_text,
+         specific_elements_text,
+         links_href,
+         images_src,
+         headings_text["h1"],
+         headings_text["h2"],
+         headings_text["h3"],
+         headings_text["h4"],
+         headings_text["h5"],
+         headings_text["h6"],
+         list_items_text,
+         table_cells_text,
+         divs_content,
+         headers_footer_content,
+         meta_tags_content,
+         spans_content,
+         bold_text_content,
+         italic_text_content,
+         code_snippets_content,
+         blockquotes_content,
+         preformatted_text_content,
+         buttons_content,
+         navs_content,
+         sections_content,
+         articles_content,
+         figures_content,
+         captions_content,
+         abbreviations_content,
+         definitions_content,
+         addresses_content,
+         time_elements_content,
+         progress_content,
+         forms,
+     ]
+     # add new features
+     script_texts = content_div_class(content, div="div", div_class="highlight")
+     lists_to_fill.append(script_texts)
+
+     audio_src = [audio["src"] for audio in content.find_all("audio", src=True, recursive=True)]
+     video_src = [video["src"] for video in content.find_all("video", src=True, recursive=True)]
+     iframe_src = [iframe["src"] for iframe in content.find_all("iframe", src=True, recursive=True)]
+     lists_to_fill.extend([audio_src, video_src, iframe_src])
+
+     rss_links = [
+         link["href"]
+         for link in content.find_all(
+             "link", type=["application/rss+xml", "application/atom+xml"], recursive=True
+         )
+     ]
+     lists_to_fill.append(rss_links)
+
+     # Find the maximum length among all lists
+     max_length = max(len(lst) for lst in lists_to_fill)
+
+     # Fill missing data with empty strings for each list
+     for lst in lists_to_fill:
+         lst += [""] * (max_length - len(lst))
+
+     # Create DataFrame
+     df = pd.DataFrame(
+         {
+             "h1": headings_text["h1"],
+             "h2": headings_text["h2"],
+             "h3": headings_text["h3"],
+             "h4": headings_text["h4"],
+             "h5": headings_text["h5"],
+             "h6": headings_text["h6"],
+             "paragraphs": paragraphs_text,
+             "divs": divs_content,
+             "items": list_items_text,
+             "tables": table_cells_text,
+             "headers": headers_footer_content,
+             "tags": meta_tags_content,
+             "spans": spans_content,
+             "bold_text": bold_text_content,
+             "italic_text": italic_text_content,
+             "codes": code_snippets_content,
+             "blocks": blockquotes_content,
+             "preformatted_text": preformatted_text_content,
+             "buttons": buttons_content,
+             "navs": navs_content,
+             "sections": sections_content,
+             "articles": articles_content,
+             "figures": figures_content,
+             "captions": captions_content,
+             "abbreviations": abbreviations_content,
+             "definitions": definitions_content,
+             "addresses": addresses_content,
+             "time_elements": time_elements_content,
+             "progress": progress_content,
+             "specific_elements": specific_elements_text,
+             "forms": forms,
+             "scripts": script_texts,
+             "audio": audio_src,
+             "video": video_src,
+             "iframe": iframe_src,
+             "rss": rss_links,
+             "images": images_src,
+             "links": links_href,
+         }
+     )
+     # to remove the '\n\t\r'
+     df = df.apply(
+         lambda x: x.map(clean_string) if x.dtype == "object" else x
+     )  # df = df.applymap(clean_string)
+     if dir_save:
+         if not dir_save.endswith(".csv"):
+             dir_save = dir_save + "_df.csv"
+         df.to_csv(dir_save)
+         print(f"file has been saved at\n{dir_save}")
+     return df
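+
+ # Minimal usage sketch (the URL and output paths below are placeholders, not part of the package):
+ #   from py2ls.netfinder import fetch, find_links, pdf_detector, find_all
+ #   paragraphs = fetch("https://example.com", where="p")      # text of all <p> tags
+ #   links = find_links("https://example.com")                 # absolute hrefs on the page
+ #   pdf_detector("https://example.com", dir_save="./pdfs")    # download linked PDFs into an existing folder
+ #   df = find_all("https://example.com", dir_save="./page")   # one-page summary DataFrame -> ./page_df.csv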