py2ls 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py2ls/.git/COMMIT_EDITMSG +1 -0
- py2ls/.git/FETCH_HEAD +1 -0
- py2ls/.git/HEAD +1 -0
- py2ls/.git/config +15 -0
- py2ls/.git/description +1 -0
- py2ls/.git/hooks/applypatch-msg.sample +15 -0
- py2ls/.git/hooks/commit-msg.sample +24 -0
- py2ls/.git/hooks/fsmonitor-watchman.sample +174 -0
- py2ls/.git/hooks/post-update.sample +8 -0
- py2ls/.git/hooks/pre-applypatch.sample +14 -0
- py2ls/.git/hooks/pre-commit.sample +49 -0
- py2ls/.git/hooks/pre-merge-commit.sample +13 -0
- py2ls/.git/hooks/pre-push.sample +53 -0
- py2ls/.git/hooks/pre-rebase.sample +169 -0
- py2ls/.git/hooks/pre-receive.sample +24 -0
- py2ls/.git/hooks/prepare-commit-msg.sample +42 -0
- py2ls/.git/hooks/push-to-checkout.sample +78 -0
- py2ls/.git/hooks/update.sample +128 -0
- py2ls/.git/index +0 -0
- py2ls/.git/info/exclude +6 -0
- py2ls/.git/logs/HEAD +1 -0
- py2ls/.git/logs/refs/heads/main +1 -0
- py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/logs/refs/remotes/origin/main +1 -0
- py2ls/.git/objects/25/b796accd261b9135fd32a2c00785f68edf6c46 +0 -0
- py2ls/.git/objects/36/b4a1b7403abc6c360f8fe2cb656ab945254971 +0 -0
- py2ls/.git/objects/3f/d6561300938afbb3d11976cf9c8f29549280d9 +0 -0
- py2ls/.git/objects/58/20a729045d4dc7e37ccaf8aa8eec126850afe2 +0 -0
- py2ls/.git/objects/60/f273eb1c412d916fa3f11318a7da7a9911b52a +0 -0
- py2ls/.git/objects/61/570cec8c061abe74121f27f5face6c69b98f99 +0 -0
- py2ls/.git/objects/69/13c452ca319f7cbf6a0836dc10a5bb033c84e4 +0 -0
- py2ls/.git/objects/78/3d4167bc95c9d2175e0df03ef1c1c880ba75ab +0 -0
- py2ls/.git/objects/79/7ae089b2212a937840e215276005ce76881307 +0 -0
- py2ls/.git/objects/7e/5956c806b5edc344d46dab599dec337891ba1f +1 -0
- py2ls/.git/objects/8e/55a7d2b96184030211f20c9b9af201eefcac82 +0 -0
- py2ls/.git/objects/91/c69ad88fe0ba94aa7859fb5f7edac5e6f1a3f7 +0 -0
- py2ls/.git/objects/b0/56be4be89ba6b76949dd641df45bb7036050c8 +0 -0
- py2ls/.git/objects/b0/9cd7856d58590578ee1a4f3ad45d1310a97f87 +0 -0
- py2ls/.git/objects/d9/005f2cc7fc4e65f14ed5518276007c08cf2fd0 +0 -0
- py2ls/.git/objects/df/e0770424b2a19faf507a501ebfc23be8f54e7b +0 -0
- py2ls/.git/objects/e9/391ffe371f1cc43b42ef09b705d9c767c2e14f +0 -0
- py2ls/.git/objects/fc/292e793ecfd42240ac43be407023bd731fa9e7 +0 -0
- py2ls/.git/refs/heads/main +1 -0
- py2ls/.git/refs/remotes/origin/HEAD +1 -0
- py2ls/.git/refs/remotes/origin/main +1 -0
- py2ls/.gitattributes +2 -0
- py2ls/.gitignore +152 -0
- py2ls/LICENSE +201 -0
- py2ls/README.md +409 -0
- py2ls/__init__.py +17 -0
- py2ls/brain_atlas.py +145 -0
- py2ls/correlators.py +475 -0
- py2ls/dbhandler.py +97 -0
- py2ls/freqanalysis.py +800 -0
- py2ls/internet_finder.py +405 -0
- py2ls/ips.py +2844 -0
- py2ls/netfinder.py +780 -0
- py2ls/sleep_events_detectors.py +1350 -0
- py2ls/translator.py +686 -0
- py2ls/version.py +1 -0
- py2ls/wb_detector.py +169 -0
- py2ls-0.1.0.dist-info/METADATA +12 -0
- py2ls-0.1.0.dist-info/RECORD +64 -0
- py2ls-0.1.0.dist-info/WHEEL +4 -0
py2ls/internet_finder.py
ADDED
@@ -0,0 +1,405 @@
from bs4 import BeautifulSoup
import requests
import os
from urllib.parse import urlparse, urljoin
import base64
import pandas as pd
from collections import Counter
import random
import logging
from time import sleep
import stem.process  # stem, Signal and Controller are only used by the commented-out Tor helpers below
from stem import Signal
from stem.control import Controller
import json

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define supported content types and corresponding parsers
CONTENT_PARSERS = {
    "text/html": lambda text, parser: BeautifulSoup(text, parser),
    "application/json": lambda text, parser: json.loads(text),
    "text/xml": lambda text, parser: BeautifulSoup(text, parser),
    "text/plain": lambda text, parser: text,  # plain text is already a str; no parsing needed
}

def fetch_all(url, parser="lxml"):  # lxml is faster; parser="html.parser" also works
    try:
        # Generate a random user-agent string
        headers = {"User-Agent": user_agent()}

        # Send the initial request
        response = requests.get(url, headers=headers)

        # If the response is a redirect, follow it
        while response.is_redirect:
            logger.info(f"Redirecting to: {response.headers['Location']}")
            response = requests.get(response.headers["Location"], headers=headers)
        # Check for a 403 error
        if response.status_code == 403:
            logger.warning("403 Forbidden error. Retrying...")
            # Retry the request after a short delay
            sleep(random.uniform(1, 3))
            response = requests.get(url, headers=headers)
            # Raise an error if the retry also fails
            response.raise_for_status()

        # Raise an error for other HTTP status codes
        response.raise_for_status()

        # Get the content type
        content_type = response.headers.get("content-type", "").split(";")[0].lower()
        # Fall back to UTF-8 if the server did not declare an encoding
        content = response.content.decode(response.encoding or "utf-8")
        # logger.info(f"Content type: {content_type}")

        # Check if the content type is supported
        if content_type in CONTENT_PARSERS:
            return content_type, CONTENT_PARSERS[content_type](content, parser)
        else:
            logger.warning("Unsupported content type")
            return None, None
    except requests.RequestException as e:
        logger.error(f"Error fetching URL '{url}': {e}")
        return None, None
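# Usage sketch for fetch_all() (hypothetical URL, for illustration only):
#   content_type, content = fetch_all("https://example.com")
#   if content_type is not None and "html" in content_type:
#       print(content.title)  # for HTML/XML responses, `content` is a BeautifulSoup object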
def user_agent():
    # Example of generating a random user-agent string
    user_agents = [
        # Windows (Intel)
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
        # Windows (ARM)
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
        # Linux (x86_64)
        "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0",
        "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4891.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4895.0 Safari/537.36",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
        # macOS (Intel)
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
        # macOS (ARM)
        "Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
        "Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
        # iOS Devices
        "Mozilla/5.0 (iPad; CPU OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
        # Android Devices
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Mobile Safari/537.36",
        "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Mobile Safari/537.36",
        # Smart TVs
        "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) SmartTV/1.0",
        "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) WebAppManager/1.0",
        # Game Consoles
        "Mozilla/5.0 (PlayStation 5 3.01) AppleWebKit/605.1.15 (KHTML, like Gecko)",
        "Mozilla/5.0 (Xbox One 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edge/44.18363.8740",
    ]
    agents = random.choice(user_agents)
    return agents

# # Function to change Tor IP address
# def renew_tor_ip():
#     with Controller.from_port(port=9051) as controller:
#         controller.authenticate()
#         controller.signal(Signal.NEWNYM)

# # Function to make requests through Tor
# def make_tor_request(url, max_retries=3):
#     renew_tor_ip()
#     headers = {"User-Agent": user_agent()}
#     session = requests.Session()
#     session.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}

#     for i in range(max_retries):
#         try:
#             response = session.get(url, headers=headers, timeout=10)
#             if response.status_code == 200:
#                 return response.text
#         except requests.exceptions.RequestException as e:
#             print(f"Error: {e}")
#             sleep(2)  # Add a delay between retries

#     return None


def find_links(url):
    links_href = []  # Initialize list to store extracted links
    content_type, content = fetch_all(url)
    if content_type is None or not hasattr(content, "find_all"):
        # Fetch failed or the response was not parsed into HTML/XML
        return links_href
    base_url = urlparse(url)
    links = content.find_all("a", href=True)
    for link in links:
        link_href = link["href"]
        if not link_href.startswith(('http://', 'https://')):
            # Convert relative links to absolute links
            link_href = urljoin(base_url.geturl(), link_href)
        links_href.append(link_href)
    return links_href
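# Usage sketch (hypothetical URL, for illustration only):
#   links = find_links("https://example.com")
#   print(len(links), "links found, e.g.", links[:3])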

def find_domain(links):
    domains = [urlparse(link).netloc for link in links]
    domain_counts = Counter(domains)
    most_common_domain = domain_counts.most_common(1)[0][0]
    # print(f"Most_frequent_domain:{most_common_domain}")
    return most_common_domain

# Determine which links belong to the target domain (e.g., pages) you are interested in
def filter_links(links, domain=None, kind='html'):
    filtered_links = []
    # Normalize `kind` to a tuple of suffixes, as expected by str.endswith()
    if isinstance(kind, str):
        kind = (kind,)
    elif isinstance(kind, list):
        kind = tuple(kind)
    if domain is None:
        domain = find_domain(links)
    for link in links:
        parsed_link = urlparse(link)
        if parsed_link.netloc == domain and parsed_link.path.endswith(kind) and not link.startswith('javascript:'):
            filtered_links.append(link)
    return filtered_links
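# Usage sketch (hypothetical URL, for illustration only):
#   links = find_links("https://example.com")
#   pages = filter_links(links, kind=["html", "htm"])  # keep only same-domain links ending in html/htm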

def find_img(url, dir_save="images"):
    """
    Download images referenced in a webpage and save them locally.
    Args:
        url (str): URL of the webpage.
        dir_save (str): Directory to save images. Default is "images".
    Returns:
        BeautifulSoup: HTML content with image URLs updated to point to the local files.
    """
    content_type, content = fetch_all(url)
    if content_type and "html" in content_type.lower():
        # Create the directory if it doesn't exist
        os.makedirs(dir_save, exist_ok=True)

        # Parse HTML content if it's not already a BeautifulSoup object
        if isinstance(content, str):
            content = BeautifulSoup(content, "html.parser")
        # Extracting images
        images = content.find_all("img", src=True)
        for i, image in enumerate(images):
            try:
                # Get the image URL
                image_url = image["src"]

                if image_url.startswith("data:image"):
                    # Extract the image data from the data URI
                    mime_type, base64_data = image_url.split(",", 1)
                    # Determine the file extension from the MIME type
                    if ":" in mime_type:
                        # image_extension = mime_type.split(":")[1].split(";")[0]
                        image_extension = mime_type.split(":")[1].split(";")[0].split("/")[-1]
                    else:
                        image_extension = "png"  # Default to PNG if the extension is not specified
                    # if 'svg+xml' in image_extension:
                    #     image_extension='svg'
                    image_data = base64.b64decode(base64_data)
                    # Save the image data to a file
                    image_filename = os.path.join(dir_save, f"image_{i}.{image_extension}")
                    with open(image_filename, "wb") as image_file:
                        image_file.write(image_data)

                    # Update the src attribute of the image tag to point to the local file
                    image["src"] = image_filename
                else:
                    # Construct the absolute image URL
                    absolute_image_url = urljoin(url, image_url)

                    # Parse the image URL to extract the file extension
                    parsed_url = urlparse(absolute_image_url)
                    image_extension = os.path.splitext(parsed_url.path)[1]

                    # Download the image
                    image_response = requests.get(absolute_image_url)

                    # Save the image to a file
                    image_filename = os.path.join(dir_save, f"image_{i}{image_extension}")
                    with open(image_filename, "wb") as image_file:
                        image_file.write(image_response.content)

                    # Update the src attribute of the image tag to point to the local file
                    image["src"] = image_filename
            except (requests.RequestException, KeyError) as e:
                print(f"Failed to process image {image_url}: {e}")
        print(f"images were saved at\n{dir_save}")
    # Return the HTML content with updated image URLs
    return content
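# Usage sketch (hypothetical URL and directory, for illustration only):
#   updated_html = find_img("https://example.com", dir_save="images")
#   # images are written to ./images/ and the returned soup's <img src=...> point at the local copies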

def content_div_class(content, div="div", div_class="highlight"):
    # Collect the text of every tag matching `div` with class `div_class`
    texts = [tag.text for tag in content.find_all(div, class_=div_class)]
    return texts
def find(url, where="div", what="highlight"):
    _, content = fetch_all(url, parser="html.parser")
    texts = [element.text for element in content.find_all(where, class_=what)]
    return texts
# usage example:
#### find_img(url, "/Users/macjianfeng/Desktop/@tmp/dd/")
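# Usage sketch for find() (hypothetical URL, for illustration only):
#   snippets = find("https://example.com", where="div", what="highlight")
#   # returns the text of every <div class="highlight"> on the page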
def find_forms(url):
    content_type, content = fetch_all(url)
    # Extracting forms and inputs
    forms = content.find_all("form")
    form_data = []
    for form in forms:
        form_inputs = form.find_all("input")
        input_data = {}
        for input_tag in form_inputs:
            input_type = input_tag.get("type")
            input_name = input_tag.get("name")
            input_value = input_tag.get("value")
            input_data[input_name] = {"type": input_type, "value": input_value}
        form_data.append(input_data)
    return form_data
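# Usage sketch (hypothetical URL, for illustration only):
#   forms = find_forms("https://example.com")
#   # e.g. [{"q": {"type": "text", "value": None}}] -- one dict of inputs per <form>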
# to clean strings
def clean_string(value):
    if isinstance(value, str):
        return value.replace('\n', '').replace('\r', '').replace('\t', '')
    else:
        return value
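# e.g. clean_string("a\n\tb") -> "ab"; non-string values are returned unchanged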
def find_all(url, dir_save=None):
    content_type, content = fetch_all(url)

    # Extracting paragraphs
    paragraphs_text = [paragraph.text for paragraph in content.find_all("p")]

    # Extracting specific elements by class
    specific_elements_text = [element.text for element in content.find_all(class_="specific-class")]

    # Extracting links (anchor tags)
    links_href = find_links(url)
    links_href = filter_links(links_href)

    # Extracting images
    images_src = [image['src'] for image in content.find_all("img", src=True)]

    # Extracting headings (h1, h2, h3, etc.)
    headings = [f'h{i}' for i in range(1, 7)]
    headings_text = {heading: [tag.text for tag in content.find_all(heading)] for heading in headings}

    # Extracting lists (ul, ol, li)
    list_items_text = [item.text for list_ in content.find_all(["ul", "ol"]) for item in list_.find_all("li")]

    # Extracting tables (table, tr, td)
    table_cells_text = [cell.text for table in content.find_all("table") for row in table.find_all("tr") for cell in row.find_all("td")]

    # Extracting other elements
    divs_content = [div.text.strip() for div in content.find_all("div")]
    headers_footer_content = [tag.text for tag in content.find_all(["header", "footer"])]
    meta_tags_content = [(tag.name, tag.attrs) for tag in content.find_all("meta")]
    spans_content = [span.text for span in content.find_all("span")]
    bold_text_content = [text.text for text in content.find_all("b")]
    italic_text_content = [text.text for text in content.find_all("i")]
    code_snippets_content = [code.text for code in content.find_all("code")]
    blockquotes_content = [blockquote.text for blockquote in content.find_all("blockquote")]
    preformatted_text_content = [pre.text for pre in content.find_all("pre")]
    buttons_content = [button.text for button in content.find_all("button")]
    navs_content = [nav.text for nav in content.find_all("nav")]
    sections_content = [section.text for section in content.find_all("section")]
    articles_content = [article.text for article in content.find_all("article")]
    figures_content = [figure.text for figure in content.find_all("figure")]
    captions_content = [caption.text for caption in content.find_all("figcaption")]
    abbreviations_content = [abbr.text for abbr in content.find_all("abbr")]
    definitions_content = [definition.text for definition in content.find_all("dfn")]
    addresses_content = [address.text for address in content.find_all("address")]
    time_elements_content = [time.text for time in content.find_all("time")]
    progress_content = [progress.text for progress in content.find_all("progress")]
    meter_content = [meter.text for meter in content.find_all("meter")]
    forms = find_forms(url)

    lists_to_fill = [
        paragraphs_text, specific_elements_text, links_href, images_src,
        headings_text["h1"], headings_text["h2"], headings_text["h3"], headings_text["h4"],
        headings_text["h5"], headings_text["h6"], list_items_text, table_cells_text,
        divs_content, headers_footer_content, meta_tags_content, spans_content,
        bold_text_content, italic_text_content, code_snippets_content,
        blockquotes_content, preformatted_text_content, buttons_content,
        navs_content, sections_content, articles_content, figures_content,
        captions_content, abbreviations_content, definitions_content,
        addresses_content, time_elements_content, progress_content,
        meter_content, forms,
    ]
    # add new features
    script_texts = content_div_class(content, div="div", div_class="highlight")
    lists_to_fill.append(script_texts)

    audio_src = [audio['src'] for audio in content.find_all("audio", src=True)]
    video_src = [video['src'] for video in content.find_all("video", src=True)]
    iframe_src = [iframe['src'] for iframe in content.find_all("iframe", src=True)]
    lists_to_fill.extend([audio_src, video_src, iframe_src])

    rss_links = [link['href'] for link in content.find_all('link', type=['application/rss+xml', 'application/atom+xml'])]
    lists_to_fill.append(rss_links)

    # Find the maximum length among all lists
    max_length = max(len(lst) for lst in lists_to_fill)

    # Fill missing data with empty strings for each list
    for lst in lists_to_fill:
        lst += [""] * (max_length - len(lst))

    # Create DataFrame
    df = pd.DataFrame({
        "headings1": headings_text["h1"],
        "headings2": headings_text["h2"],
        "headings3": headings_text["h3"],
        "headings4": headings_text["h4"],
        "headings5": headings_text["h5"],
        "headings6": headings_text["h6"],
        "paragraphs": paragraphs_text,
        "list_items": list_items_text,
        "table_cells": table_cells_text,
        "headers_footer": headers_footer_content,
        "meta_tags": meta_tags_content,
        "spans": spans_content,
        "bold_text": bold_text_content,
        "italic_text": italic_text_content,
        "code_snippets": code_snippets_content,
        "blockquotes": blockquotes_content,
        "preformatted_text": preformatted_text_content,
        "buttons": buttons_content,
        "navs": navs_content,
        "sections": sections_content,
        "articles": articles_content,
        "figures": figures_content,
        "captions": captions_content,
        "abbreviations": abbreviations_content,
        "definitions": definitions_content,
        "addresses": addresses_content,
        "time_elements": time_elements_content,
        "progress": progress_content,
        "specific_elements": specific_elements_text,
        "meter": meter_content,
        "forms": forms,
        "scripts": script_texts,
        "audio": audio_src,
        "video": video_src,
        "iframe": iframe_src,
        "rss": rss_links,
        "images": images_src,
        "links": links_href,
        "divs": divs_content,
    })
    # to remove the '\n\t\r' characters
    df = df.apply(lambda x: x.map(clean_string) if x.dtype == "object" else x)  # df=df.applymap(clean_string)
    if dir_save:
        if not dir_save.endswith(".csv"):
            dir_save = dir_save + "_df.csv"
        df.to_csv(dir_save)
        print(f"file has been saved at\n{dir_save}")
    return df
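# Usage sketch (hypothetical URL and path, for illustration only):
#   df = find_all("https://example.com", dir_save="example_page")  # writes example_page_df.csv
#   print(df.columns.tolist())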