py2ls-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. py2ls/.git/COMMIT_EDITMSG +1 -0
  2. py2ls/.git/FETCH_HEAD +1 -0
  3. py2ls/.git/HEAD +1 -0
  4. py2ls/.git/config +15 -0
  5. py2ls/.git/description +1 -0
  6. py2ls/.git/hooks/applypatch-msg.sample +15 -0
  7. py2ls/.git/hooks/commit-msg.sample +24 -0
  8. py2ls/.git/hooks/fsmonitor-watchman.sample +174 -0
  9. py2ls/.git/hooks/post-update.sample +8 -0
  10. py2ls/.git/hooks/pre-applypatch.sample +14 -0
  11. py2ls/.git/hooks/pre-commit.sample +49 -0
  12. py2ls/.git/hooks/pre-merge-commit.sample +13 -0
  13. py2ls/.git/hooks/pre-push.sample +53 -0
  14. py2ls/.git/hooks/pre-rebase.sample +169 -0
  15. py2ls/.git/hooks/pre-receive.sample +24 -0
  16. py2ls/.git/hooks/prepare-commit-msg.sample +42 -0
  17. py2ls/.git/hooks/push-to-checkout.sample +78 -0
  18. py2ls/.git/hooks/update.sample +128 -0
  19. py2ls/.git/index +0 -0
  20. py2ls/.git/info/exclude +6 -0
  21. py2ls/.git/logs/HEAD +1 -0
  22. py2ls/.git/logs/refs/heads/main +1 -0
  23. py2ls/.git/logs/refs/remotes/origin/HEAD +1 -0
  24. py2ls/.git/logs/refs/remotes/origin/main +1 -0
  25. py2ls/.git/objects/25/b796accd261b9135fd32a2c00785f68edf6c46 +0 -0
  26. py2ls/.git/objects/36/b4a1b7403abc6c360f8fe2cb656ab945254971 +0 -0
  27. py2ls/.git/objects/3f/d6561300938afbb3d11976cf9c8f29549280d9 +0 -0
  28. py2ls/.git/objects/58/20a729045d4dc7e37ccaf8aa8eec126850afe2 +0 -0
  29. py2ls/.git/objects/60/f273eb1c412d916fa3f11318a7da7a9911b52a +0 -0
  30. py2ls/.git/objects/61/570cec8c061abe74121f27f5face6c69b98f99 +0 -0
  31. py2ls/.git/objects/69/13c452ca319f7cbf6a0836dc10a5bb033c84e4 +0 -0
  32. py2ls/.git/objects/78/3d4167bc95c9d2175e0df03ef1c1c880ba75ab +0 -0
  33. py2ls/.git/objects/79/7ae089b2212a937840e215276005ce76881307 +0 -0
  34. py2ls/.git/objects/7e/5956c806b5edc344d46dab599dec337891ba1f +1 -0
  35. py2ls/.git/objects/8e/55a7d2b96184030211f20c9b9af201eefcac82 +0 -0
  36. py2ls/.git/objects/91/c69ad88fe0ba94aa7859fb5f7edac5e6f1a3f7 +0 -0
  37. py2ls/.git/objects/b0/56be4be89ba6b76949dd641df45bb7036050c8 +0 -0
  38. py2ls/.git/objects/b0/9cd7856d58590578ee1a4f3ad45d1310a97f87 +0 -0
  39. py2ls/.git/objects/d9/005f2cc7fc4e65f14ed5518276007c08cf2fd0 +0 -0
  40. py2ls/.git/objects/df/e0770424b2a19faf507a501ebfc23be8f54e7b +0 -0
  41. py2ls/.git/objects/e9/391ffe371f1cc43b42ef09b705d9c767c2e14f +0 -0
  42. py2ls/.git/objects/fc/292e793ecfd42240ac43be407023bd731fa9e7 +0 -0
  43. py2ls/.git/refs/heads/main +1 -0
  44. py2ls/.git/refs/remotes/origin/HEAD +1 -0
  45. py2ls/.git/refs/remotes/origin/main +1 -0
  46. py2ls/.gitattributes +2 -0
  47. py2ls/.gitignore +152 -0
  48. py2ls/LICENSE +201 -0
  49. py2ls/README.md +409 -0
  50. py2ls/__init__.py +17 -0
  51. py2ls/brain_atlas.py +145 -0
  52. py2ls/correlators.py +475 -0
  53. py2ls/dbhandler.py +97 -0
  54. py2ls/freqanalysis.py +800 -0
  55. py2ls/internet_finder.py +405 -0
  56. py2ls/ips.py +2844 -0
  57. py2ls/netfinder.py +780 -0
  58. py2ls/sleep_events_detectors.py +1350 -0
  59. py2ls/translator.py +686 -0
  60. py2ls/version.py +1 -0
  61. py2ls/wb_detector.py +169 -0
  62. py2ls-0.1.0.dist-info/METADATA +12 -0
  63. py2ls-0.1.0.dist-info/RECORD +64 -0
  64. py2ls-0.1.0.dist-info/WHEEL +4 -0
py2ls/internet_finder.py
@@ -0,0 +1,405 @@
+ from bs4 import BeautifulSoup
+ import requests
+ import os
+ from urllib.parse import urlparse, urljoin
+ import base64
+ import pandas as pd
+ from collections import Counter
+ import random
+ import logging
+ from time import sleep
+ import stem.process
+ from stem import Signal
+ from stem.control import Controller
+ import json
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Define supported content types and corresponding parsers
+ CONTENT_PARSERS = {
+     "text/html": lambda text, parser: BeautifulSoup(text, parser),
+     "application/json": lambda text, parser: json.loads(text),
+     "text/xml": lambda text, parser: BeautifulSoup(text, parser),
+     "text/plain": lambda text, parser: text,  # already decoded text; no further parsing needed
+ }
+
+ def fetch_all(url, parser="lxml"):  # lxml is faster; parser="html.parser" also works
+     try:
+         # Generate a random user-agent string
+         headers = {"User-Agent": user_agent()}
+
+         # Send the initial request
+         response = requests.get(url, headers=headers)
+
+         # If the response is a redirect, follow it
+         while response.is_redirect:
+             logger.info(f"Redirecting to: {response.headers['Location']}")
+             response = requests.get(response.headers["Location"], headers=headers)
+         # Check for a 403 error
+         if response.status_code == 403:
+             logger.warning("403 Forbidden error. Retrying...")
+             # Retry the request after a short delay
+             sleep(random.uniform(1, 3))
+             response = requests.get(url, headers=headers)
+             # Raise an error if the retry also fails
+             response.raise_for_status()
+
+         # Raise an error for other HTTP status codes
+         response.raise_for_status()
+
+         # Get the content type and decode the body (fall back to UTF-8 when no charset is declared)
+         content_type = response.headers.get("content-type", "").split(";")[0].lower()
+         content = response.content.decode(response.encoding or "utf-8")
+         # logger.info(f"Content type: {content_type}")
+
+         # Check if content type is supported
+         if content_type in CONTENT_PARSERS:
+             return content_type, CONTENT_PARSERS[content_type](content, parser)
+         else:
+             logger.warning("Unsupported content type")
+             return None, None
+     except requests.RequestException as e:
+         logger.error(f"Error fetching URL '{url}': {e}")
+         return None, None
+ def user_agent():
+     # Example of generating a random user-agent string
+     user_agents = [
+         # Windows (Intel)
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
+         # Windows (ARM)
+         "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Safari/537.36",
+         "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
+         "Mozilla/5.0 (Windows NT 10.0; Win64; arm64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Safari/537.36",
+         # Linux (x86_64)
+         "Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
+         "Mozilla/5.0 (X11; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0",
+         "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
+         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
+         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4891.0 Safari/537.36",
+         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Safari/537.36",
+         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4895.0 Safari/537.36",
+         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0",
+         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0",
+         "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
+         # macOS (Intel)
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
+         # macOS (ARM)
+         "Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15",
+         "Mozilla/5.0 (Macintosh; ARM Mac OS X 12_0_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15",
+         # iOS Devices
+         "Mozilla/5.0 (iPad; CPU OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
+         "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1",
+         # Android Devices
+         "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4891.0 Mobile Safari/537.36",
+         "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4893.0 Mobile Safari/537.36",
+         "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4895.0 Mobile Safari/537.36",
+         # Smart TVs
+         "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) SmartTV/1.0",
+         "Mozilla/5.0 (SMART-TV; LINUX; Tizen 6.0) AppleWebKit/537.36 (KHTML, like Gecko) WebAppManager/1.0",
+         # Game Consoles
+         "Mozilla/5.0 (PlayStation 5 3.01) AppleWebKit/605.1.15 (KHTML, like Gecko)",
+         "Mozilla/5.0 (Xbox One 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edge/44.18363.8740",
+     ]
+     agents = random.choice(user_agents)
+     return agents
+
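+ # Minimal usage sketch of fetch_all (the URL below is a placeholder, not part of the package):
+ # content_type, parsed = fetch_all("https://example.com")
+ # if content_type == "text/html":
+ #     print(parsed.title.text if parsed.title else "no <title> found")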
+ # # Function to change Tor IP address
+ # def renew_tor_ip():
+ #     with Controller.from_port(port=9051) as controller:
+ #         controller.authenticate()
+ #         controller.signal(Signal.NEWNYM)
+
+ # # Function to make requests through Tor
+ # def make_tor_request(url, max_retries=3):
+ #     renew_tor_ip()
+ #     headers = {"User-Agent": user_agent()}
+ #     session = requests.Session()
+ #     session.proxies = {"http": "socks5h://localhost:9050", "https": "socks5h://localhost:9050"}
+
+ #     for i in range(max_retries):
+ #         try:
+ #             response = session.get(url, headers=headers, timeout=10)
+ #             if response.status_code == 200:
+ #                 return response.text
+ #         except requests.exceptions.RequestException as e:
+ #             print(f"Error: {e}")
+ #             time.sleep(2)  # Add a delay between retries
+
+ #     return None
+
+
+ def find_links(url):
+     links_href = []  # Initialize list to store extracted links
+     content_type, content = fetch_all(url)
+     base_url = urlparse(url)
+     links = content.find_all("a", href=True)
+     for link in links:
+         link_href = link["href"]
+         if not link_href.startswith(('http://', 'https://')):
+             # Convert relative links to absolute links
+             link_href = urljoin(base_url.geturl(), link_href)
+         links_href.append(link_href)
+     return links_href
+
+ def find_domain(links):
+     domains = [urlparse(link).netloc for link in links]
+     domain_counts = Counter(domains)
+     most_common_domain = domain_counts.most_common(1)[0][0]
+     # print(f"Most_frequent_domain:{most_common_domain}")
+     return most_common_domain
+
+ # To determine which links are related to the target domains (e.g., pages) you are interested in
+ def filter_links(links, domain=None, kind='html'):
+     filtered_links = []
+     # Normalize `kind` to a tuple of suffixes (tuple('html') would split into single characters)
+     if isinstance(kind, str):
+         kind = (kind,)
+     elif isinstance(kind, list):
+         kind = tuple(kind)
+     if domain is None:
+         domain = find_domain(links)
+     for link in links:
+         parsed_link = urlparse(link)
+         if parsed_link.netloc == domain and parsed_link.path.endswith(kind) and 'javascript:' not in link:
+             filtered_links.append(link)
+     return filtered_links
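+ # Minimal sketch of the link pipeline (the URL is a placeholder, not part of the package):
+ # links = find_links("https://example.com")
+ # html_links = filter_links(links, kind=".html")  # keep same-domain links ending in .html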
+
+ def find_img(url, dir_save="images"):
+     """
+     Save images referenced in HTML content locally.
+     Args:
+         url (str): URL of the webpage.
+         dir_save (str): Directory to save images. Default is "images".
+     Returns:
+         BeautifulSoup: HTML content with image src attributes pointing to local files.
+     """
+     content_type, content = fetch_all(url)
+     if "html" in content_type.lower():
+         # Create the directory if it doesn't exist
+         os.makedirs(dir_save, exist_ok=True)
+
+         # Parse HTML content if it's not already a BeautifulSoup object
+         if isinstance(content, str):
+             content = BeautifulSoup(content, "html.parser")
+         image_links = []
+         # Extracting images
+         images = content.find_all("img", src=True)
+         for i, image in enumerate(images):
+             try:
+                 # Get the image URL
+                 image_url = image["src"]
+
+                 if image_url.startswith("data:image"):
+                     # Extract the image data from the data URI
+                     mime_type, base64_data = image_url.split(",", 1)
+                     # Determine the file extension from the MIME type
+                     if ":" in mime_type:
+                         # image_extension = mime_type.split(":")[1].split(";")[0]
+                         image_extension = mime_type.split(":")[1].split(";")[0].split("/")[-1]
+                     else:
+                         image_extension = "png"  # Default to PNG if extension is not specified
+                     # if 'svg+xml' in image_extension:
+                     #     image_extension = 'svg'
+                     image_data = base64.b64decode(base64_data)
+                     # Save the image data to a file
+                     image_filename = os.path.join(
+                         dir_save, f"image_{i}.{image_extension}"
+                     )
+                     with open(image_filename, "wb") as image_file:
+                         image_file.write(image_data)
+
+                     # Update the src attribute of the image tag to point to the local file
+                     image["src"] = image_filename
+                 else:
+                     # Construct the absolute image URL
+                     absolute_image_url = urljoin(url, image_url)
+
+                     # Parse the image URL to extract the file extension
+                     parsed_url = urlparse(absolute_image_url)
+                     image_extension = os.path.splitext(parsed_url.path)[1]
+
+                     # Download the image
+                     image_response = requests.get(absolute_image_url)
+
+                     # Save the image to a file
+                     image_filename = os.path.join(
+                         dir_save, f"image_{i}{image_extension}"
+                     )
+                     with open(image_filename, "wb") as image_file:
+                         image_file.write(image_response.content)
+
+                     # Update the src attribute of the image tag to point to the local file
+                     image["src"] = image_filename
+             except (requests.RequestException, KeyError) as e:
+                 print(f"Failed to process image {image_url}: {e}")
+         print(f"images were saved at\n{dir_save}")
+     # Return the HTML content with updated image URLs
+     return content
+
+ def content_div_class(content, div="div", div_class="highlight"):
+     texts = [tag.text for tag in content.find_all(div, class_=div_class)]
+     return texts
+ def find(url, where="div", what="highlight"):
+     _, content = fetch_all(url, parser="html.parser")
+     texts = [div.text for div in content.find_all(where, class_=what)]
+     return texts
+ # usage example:
+ #### find_img(url, "/Users/macjianfeng/Desktop/@tmp/dd/")
+ def find_forms(url):
+     content_type, content = fetch_all(url)
+     df = pd.DataFrame()
+     # Extracting forms and inputs
+     forms = content.find_all("form")
+     form_data = []
+     for form in forms:
+         form_inputs = form.find_all("input")
+         input_data = {}
+         for input_tag in form_inputs:
+             input_type = input_tag.get("type")
+             input_name = input_tag.get("name")
+             input_value = input_tag.get("value")
+             input_data[input_name] = {"type": input_type, "value": input_value}
+         form_data.append(input_data)
+     return form_data
+ # to clean strings
+ def clean_string(value):
+     if isinstance(value, str):
+         return value.replace('\n', '').replace('\r', '').replace('\t', '')
+     else:
+         return value
+ def find_all(url, dir_save=None):
+     content_type, content = fetch_all(url)
+
+     # Extracting paragraphs
+     paragraphs_text = [paragraph.text for paragraph in content.find_all("p")]
+
+     # Extracting specific elements by class
+     specific_elements_text = [element.text for element in content.find_all(class_="specific-class")]
+
+     # Extracting links (anchor tags)
+     links_href = find_links(url)
+     links_href = filter_links(links_href)
+
+     # Extracting images
+     images_src = [image['src'] for image in content.find_all("img", src=True)]
+
+     # Extracting headings (h1, h2, h3, etc.)
+     headings = [f'h{i}' for i in range(1, 7)]
+     headings_text = {heading: [tag.text for tag in content.find_all(heading)] for heading in headings}
+
+     # Extracting lists (ul, ol, li)
+     list_items_text = [item.text for list_ in content.find_all(["ul", "ol"]) for item in list_.find_all("li")]
+
+     # Extracting tables (table, tr, td)
+     table_cells_text = [cell.text for table in content.find_all("table") for row in table.find_all("tr") for cell in row.find_all("td")]
+
+     # Extracting other elements
+     divs_content = [div.text.strip() for div in content.find_all("div")]
+     headers_footer_content = [tag.text for tag in content.find_all(["header", "footer"])]
+     meta_tags_content = [(tag.name, tag.attrs) for tag in content.find_all("meta")]
+     spans_content = [span.text for span in content.find_all("span")]
+     bold_text_content = [text.text for text in content.find_all("b")]
+     italic_text_content = [text.text for text in content.find_all("i")]
+     code_snippets_content = [code.text for code in content.find_all("code")]
+     blockquotes_content = [blockquote.text for blockquote in content.find_all("blockquote")]
+     preformatted_text_content = [pre.text for pre in content.find_all("pre")]
+     buttons_content = [button.text for button in content.find_all("button")]
+     navs_content = [nav.text for nav in content.find_all("nav")]
+     sections_content = [section.text for section in content.find_all("section")]
+     articles_content = [article.text for article in content.find_all("article")]
+     figures_content = [figure.text for figure in content.find_all("figure")]
+     captions_content = [caption.text for caption in content.find_all("figcaption")]
+     abbreviations_content = [abbr.text for abbr in content.find_all("abbr")]
+     definitions_content = [definition.text for definition in content.find_all("dfn")]
+     addresses_content = [address.text for address in content.find_all("address")]
+     time_elements_content = [time.text for time in content.find_all("time")]
+     progress_content = [progress.text for progress in content.find_all("progress")]
+     meter_content = [meter.text for meter in content.find_all("meter")]
+     forms = find_forms(url)
+
+     lists_to_fill = [
+         paragraphs_text, specific_elements_text, links_href, images_src,
+         headings_text["h1"], headings_text["h2"], headings_text["h3"], headings_text["h4"],
+         headings_text["h5"], headings_text["h6"], list_items_text, table_cells_text,
+         divs_content, headers_footer_content, meta_tags_content, spans_content,
+         bold_text_content, italic_text_content, code_snippets_content,
+         blockquotes_content, preformatted_text_content, buttons_content,
+         navs_content, sections_content, articles_content, figures_content,
+         captions_content, abbreviations_content, definitions_content,
+         addresses_content, time_elements_content, progress_content,
+         meter_content, forms
+     ]
+     # add new features
+     script_texts = content_div_class(content, div="div", div_class="highlight")
+     lists_to_fill.append(script_texts)
+
+     audio_src = [audio['src'] for audio in content.find_all("audio", src=True)]
+     video_src = [video['src'] for video in content.find_all("video", src=True)]
+     iframe_src = [iframe['src'] for iframe in content.find_all("iframe", src=True)]
+     lists_to_fill.extend([audio_src, video_src, iframe_src])
+
+     rss_links = [link['href'] for link in content.find_all('link', type=['application/rss+xml', 'application/atom+xml'])]
+     lists_to_fill.append(rss_links)
+
+     # Find the maximum length among all lists
+     max_length = max(len(lst) for lst in lists_to_fill)
+
+     # Fill missing data with empty strings for each list
+     for lst in lists_to_fill:
+         lst += [""] * (max_length - len(lst))
+
+     # Create DataFrame
+     df = pd.DataFrame({
+         "headings1": headings_text["h1"],
+         "headings2": headings_text["h2"],
+         "headings3": headings_text["h3"],
+         "headings4": headings_text["h4"],
+         "headings5": headings_text["h5"],
+         "headings6": headings_text["h6"],
+         "paragraphs": paragraphs_text,
+         "list_items": list_items_text,
+         "table_cells": table_cells_text,
+         "headers_footer": headers_footer_content,
+         "meta_tags": meta_tags_content,
+         "spans": spans_content,
+         "bold_text": bold_text_content,
+         "italic_text": italic_text_content,
+         "code_snippets": code_snippets_content,
+         "blockquotes": blockquotes_content,
+         "preformatted_text": preformatted_text_content,
+         "buttons": buttons_content,
+         "navs": navs_content,
+         "sections": sections_content,
+         "articles": articles_content,
+         "figures": figures_content,
+         "captions": captions_content,
+         "abbreviations": abbreviations_content,
+         "definitions": definitions_content,
+         "addresses": addresses_content,
+         "time_elements": time_elements_content,
+         "progress": progress_content,
+         "specific_elements": specific_elements_text,
+         "meter": meter_content,
+         "forms": forms,
+         "scripts": script_texts,
+         "audio": audio_src,
+         "video": video_src,
+         "iframe": iframe_src,
+         "rss": rss_links,
+         "images": images_src,
+         "links": links_href,
+         "divs": divs_content,
+     })
+     # to remove the '\n\t\r'
+     df = df.apply(lambda x: x.map(clean_string) if x.dtype == "object" else x)  # df = df.applymap(clean_string)
+     if dir_save:
+         if not dir_save.endswith(".csv"):
+             dir_save = dir_save + "_df.csv"
+             df.to_csv(dir_save)
+         else:
+             df.to_csv(dir_save)
+         print(f"file has been saved at\n{dir_save}")
+     return df
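+ # Rough end-to-end sketch (URL and save path are placeholders, not part of the package):
+ # df = find_all("https://example.com", dir_save="page_summary")
+ # # a dir_save without a ".csv" suffix is written as "page_summary_df.csv"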