opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,30 @@ from datetime import datetime,timedelta
  from opsci_toolbox.helpers.dates import str_to_datetime
  from opsci_toolbox.helpers.common import write_jsonl

- def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "start_date", col_publishedBefore = "end_date", date_format = '%Y-%m-%d', rolling_days = 7 ):
+ def create_queries_per_period(
+ query: dict,
+ publishedAfter: str,
+ publishedBefore: str,
+ col_publishedAfter: str = "start_date",
+ col_publishedBefore: str = "end_date",
+ date_format: str = '%Y-%m-%d',
+ rolling_days: int = 7
+ ) -> list:
+ """
+ Generates a list of query dictionaries with date ranges for a rolling period.
+
+ Args:
+ query (dict): The base query dictionary to be modified with date ranges.
+ publishedAfter (str): The start date in string format.
+ publishedBefore (str): The end date in string format.
+ col_publishedAfter (str, optional): The key name for the start date in the query dictionary. Defaults to "start_date".
+ col_publishedBefore (str, optional): The key name for the end date in the query dictionary. Defaults to "end_date".
+ date_format (str, optional): The format of the input date strings. Defaults to '%Y-%m-%d'.
+ rolling_days (int, optional): The number of days for each rolling period. Defaults to 7.
+
+ Returns:
+ list: A list of query dictionaries with updated date ranges.
+ """
  datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
  datetime_publishedBefore = datetime.strptime(publishedBefore, date_format)

@@ -34,17 +57,31 @@ def create_queries_per_period(query, publishedAfter, publishedBefore, col_publis
  return queries


- def remove_extra_spaces(text):
+ def remove_extra_spaces(text: str) -> str:
  """
- Remove extra spaces
+ Removes extra spaces from the input text, including leading and trailing spaces.
+
+ Args:
+ text (str): The input text from which extra spaces should be removed.
+
+ Returns:
+ str: The cleaned text with extra spaces removed.
  """
  cleaned_text = re.sub(r'\s+', ' ', text)
  return cleaned_text.strip()


- def query_rapidAPI(url: str, query_dict: dict, host: str):
+ def query_rapidAPI(url: str, query_dict: dict, host: str)-> requests.Response:
  """
- Function to query RapidAPI
+ Function to query RapidAPI.
+
+ Args:
+ url (str): The URL for the RapidAPI endpoint.
+ query_dict (dict): A dictionary containing query parameters.
+ host (str): The RapidAPI host.
+
+ Returns:
+ requests.Response: The response object from the RapidAPI request, or None if an error occurs.
  """

  headers = {
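
A minimal sketch of how the two helpers above combine, for orientation only (not part of the diff; the endpoint URL, host, and query payload below are hypothetical):

    # Build one query per 7-day window, then send each window to a RapidAPI endpoint.
    base_query = {"query": "#opensource", "section": "latest"}              # hypothetical payload
    queries = create_queries_per_period(base_query, "2024-01-01", "2024-01-31", rolling_days=7)
    for q in queries:
        resp = query_rapidAPI(
            url="https://twitter154.p.rapidapi.com/search/search",          # hypothetical endpoint
            query_dict=q,
            host="twitter154.p.rapidapi.com",                               # hypothetical host
        )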
@@ -62,7 +99,16 @@ def query_rapidAPI(url: str, query_dict: dict, host: str):
  return response


- def response_header(response):
+ def response_header(response: requests.Response) -> dict:
+ """
+ Retrieves the headers from an HTTP response object.
+
+ Args:
+ response: The HTTP response object from which headers are to be retrieved.
+
+ Returns:
+ dict: The headers of the HTTP response.
+ """
  return response.headers

  ##################################################################################################
@@ -70,9 +116,15 @@ def response_header(response):
  # https://rapidapi.com/omarmhaimdat/api/twitter154
  ##################################################################################################

- def parse_user(user : dict):
+ def parse_user(user: dict) -> tuple:
  """
- Parse the subdict related to user data
+ Parse the subdict related to user data.
+
+ Args:
+ user (dict): Dictionary containing user data.
+
+ Returns:
+ tuple: A tuple containing parsed user data fields.
  """
  if user:
  user_creation_date=user.get("creation_date","")
@@ -109,9 +161,15 @@ def parse_user(user : dict):
  record = (user_creation_date, user_id, user_username, user_name, user_follower_count, user_following_count, user_favourites_count, user_is_private, user_is_verified, user_is_blue_verified, user_location, user_profile_pic_url, user_profile_banner_url, user_description, user_external_url, user_number_of_tweets, user_bot, user_timestamp, user_has_nft_avatar,user_category, user_default_profile, user_default_profile_image, user_listed_count, user_verified_type)
  return record

- def parse_retweet(data):
+ def parse_retweet(data: dict) -> tuple:
  """
- Parse subdict related to original tweet if the captured tweet is RT
+ Parse subdict related to original tweet if the captured tweet is RT.
+
+ Args:
+ data (dict): Dictionary containing tweet data.
+
+ Returns:
+ tuple: A tuple containing parsed tweet data fields.
  """
  if data:
  tweet_id=data.get("tweet_id", "")
@@ -150,9 +208,15 @@ def parse_retweet(data):
  record=(tweet_id, creation_date, text,media_url, video_url, language, favorite_count, retweet_count, reply_count, quote_count, retweet, views, timestamp, video_view_count,in_reply_to_status_id, quoted_status_id, expanded_url, retweet_tweet_id,conversation_id,bookmark_count, source,community_note)
  return record

- def parse_entities(extended_entities):
+ def parse_entities(extended_entities: dict) -> tuple:
  """
- Parse the subdict related to extended entities (image, video, tags...)
+ Parse the subdict related to extended entities (image, video, tags...).
+
+ Args:
+ extended_entities (dict): Dictionary containing extended entities data.
+
+ Returns:
+ tuple: A tuple containing parsed extended entities data fields.
  """
  id_str, indices, media_key, media_url, media_type, original_info, height, width, ext_alt_text, monetizable, aspect_ratio, duration_millis = [], [], [], [], [], [], [], [], [], [], [], []
  all_x, all_y, all_h, all_w =[], [], [], []
@@ -222,9 +286,15 @@ def parse_entities(extended_entities):
  record = (id_str, indices, media_key, media_url, media_type, all_x, all_y, all_h, all_w, height, width, ext_alt_text, all_tag_user_id, all_tag_user_screenname, all_tag_user_type, monetizable, aspect_ratio, duration_millis, all_variants_url, all_variants_bitrate, all_variants_content_type)
  return record

- def parse_tweet(json_data):
+ def parse_tweet(json_data: list) -> pd.DataFrame:
  """
- Parse a batch of tweets
+ Parse a batch of tweets.
+
+ Args:
+ json_data (list): List of dictionaries containing tweet data.
+
+ Returns:
+ pd.DataFrame: A pandas DataFrame containing parsed tweet data.
  """
  all_records=[]
  for data in json_data:
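
A hypothetical end-to-end call, assuming the search endpoint returns its tweets under a "results" key (the exact response shape is not shown in this diff):

    resp = query_rapidAPI(url, query_dict, host)     # url/host as sketched earlier, illustrative
    tweets = resp.json().get("results", [])          # "results" key is an assumption
    df_tweets = parse_tweet(tweets)                  # one row per tweet, with flattened user/media fields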
@@ -279,9 +349,15 @@ def parse_tweet(json_data):
  df = pd.DataFrame.from_records(all_records, columns = all_cols)
  return df

- def parse_twitter_list_details(json_data):
+ def parse_twitter_list_details(json_data : dict) -> pd.DataFrame:
  """
- Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154
+ Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154.
+
+ Args:
+ json_data (dict): Dictionary containing list details data.
+
+ Returns:
+ pd.DataFrame: A pandas DataFrame containing parsed list details.
  """
  list_id = json_data.get("list_id", "")
  list_id_str = json_data.get("list_id_str", "")
@@ -304,7 +380,16 @@ def parse_twitter_list_details(json_data):
  # https://instagram-scraper2.p.rapidapi.com/hash_tag_medias_v2
  ######################################################################################

- def instagram_parse_hashtag_data(hashtag_data):
+ def instagram_parse_hashtag_data(hashtag_data: dict)-> pd.DataFrame:
+ """
+ Parse Instagram hashtag data into a DataFrame.
+
+ Args:
+ hashtag_data (dict): Dictionary containing Instagram hashtag data.
+
+ Returns:
+ pd.DataFrame: A pandas DataFrame containing parsed hashtag data.
+ """
  hashtag_id = hashtag_data.get("id")
  hashtag_name = hashtag_data.get("name")
  allow_following = hashtag_data.get("allow_following")
@@ -354,9 +439,17 @@ def instagram_parse_hashtag_data(hashtag_data):
  # function to parse Twitter data
  # https://rapidapi.com/twttrapi-twttrapi-default/api/twttrapi
  ######################################################################################
- def compile_list_entries(json_data, path_json, filename):
+ def compile_list_entries(json_data: dict, path_json: str, filename: str)-> tuple:
  """
- Function to return next cursor and list details from https://twttrapi.p.rapidapi.com/list-members
+ Function to process list entries from Twitter API response and write to JSONL file. https://twttrapi.p.rapidapi.com/list-members
+
+ Args:
+ json_data (dict): JSON response data from Twitter API.
+ path_json (str): Path to directory where JSONL file will be saved.
+ filename (str): Name of the JSONL file.
+
+ Returns:
+ tuple: A tuple containing a list of results (user legacy data) and next cursor (str or None).
  """
  results = []
  next_cursor = None
@@ -377,9 +470,15 @@ def compile_list_entries(json_data, path_json, filename):
  return results, next_cursor


- def parse_list_entries(jsonl_data):
+ def parse_list_entries(jsonl_data: list)-> pd.DataFrame:
  """
- Function to parse list details from https://twttrapi.p.rapidapi.com/list-members
+ Parse list details from JSONL data obtained from the Twitter API.
+
+ Args:
+ jsonl_data (list): List of dictionaries containing JSON data.
+
+ Returns:
+ pd.DataFrame: DataFrame containing parsed list details.
  """
  all_records=[]
  for data in jsonl_data:
@@ -12,18 +12,30 @@ import pandas as pd
  from tqdm import tqdm


- def url_get_domain(url):
+ def url_get_domain(url: str) -> str:
  """
- Return the domain name from a url
+ Extracts and returns the domain name from a given URL.
+
+ Args:
+ url (str): The URL string from which the domain name is to be extracted.
+
+ Returns:
+ str: The domain name extracted from the URL.
  """
  parsed_url = urlparse(url)
  domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
  return domain


- def url_get_extension(url):
+ def url_get_extension(url: str) -> str:
  """
- Return the extension of the domain name from a url
+ Extracts and returns the extension (TLD) of the domain name from a given URL.
+
+ Args:
+ url (str): The URL string from which the domain extension is to be extracted.
+
+ Returns:
+ str: The extension (TLD) of the domain name extracted from the URL.
  """
  # Parse the URL using urlparse
  parsed_url = urlparse(url)
@@ -34,15 +46,21 @@ def url_get_extension(url):
  # Split the netloc by '.' to get the domain and TLD
  domain_parts = netloc.split(".")

- # Get the last two parts, which represent the domain and TLD
+ # Get the last part, which represents the TLD
  extension = ".".join(domain_parts[-1:])

  return extension


- def url_clean_parameters(url):
+ def url_clean_parameters(url: str) -> str:
  """
- Return a URL without any parameters or utm tags
+ Removes query parameters and UTM tags from a given URL and returns the cleaned URL.
+
+ Args:
+ url (str): The URL string from which parameters and UTM tags are to be removed.
+
+ Returns:
+ str: The cleaned URL without any parameters or UTM tags.
  """
  parsed_url = urlparse(url)
  netloc = parsed_url.netloc if parsed_url.netloc else ""
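
A few expected input/output pairs for the URL helpers above, based on the code visible in this diff (values are illustrative; parts of the function bodies fall outside the shown hunks):

    url_get_domain("https://www.example.co.uk/article?id=1")        # -> "www.example.co.uk"
    url_get_extension("https://www.example.co.uk/article?id=1")     # -> "uk" (last netloc segment)
    url_clean_parameters("https://example.com/page?utm_source=x")   # expected: "example.com/page"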
@@ -50,9 +68,15 @@ def url_clean_parameters(url):
  return netloc + path


- def url_clean_protocol(url):
+ def url_clean_protocol(url: str) -> str:
  """
- Remove https / http from a url
+ Removes the 'http://' or 'https://' prefix from a given URL.
+
+ Args:
+ url (str): The URL string from which the protocol is to be removed.
+
+ Returns:
+ str: The URL without the 'http://' or 'https://' prefix.
  """
  prefixes_to_remove = ["https://", "http://"]

@@ -64,9 +88,15 @@ def url_clean_protocol(url):
  return url


- def url_remove_www(url):
+ def url_remove_www(url: str) -> str:
  """
- Remove www from a url
+ Removes the 'www.' prefix from a given URL, along with any protocol prefix.
+
+ Args:
+ url (str): The URL string from which the 'www.' prefix is to be removed.
+
+ Returns:
+ str: The URL without the 'www.' prefix.
  """
  prefixes_to_remove = ["https://www.", "http://www.", "https://", "http://", "www."]

@@ -78,9 +108,15 @@ def url_remove_www(url):
  return url


- def url_add_protocol(url):
+ def url_add_protocol(url: str) -> str:
  """
- Return a formatted url with protocol and www. if necessary
+ Ensures the given URL has a protocol ('https://') and 'www.' prefix if necessary.
+
+ Args:
+ url (str): The URL string to be formatted with protocol and 'www.' prefix if required.
+
+ Returns:
+ str: The formatted URL with protocol and 'www.' prefix if it was missing.
  """
  parsed_url = urlparse(url)

@@ -96,9 +132,15 @@ def url_add_protocol(url):
  return url


- def url_is_valid(url):
+ def url_is_valid(url: str) -> bool:
  """
- Checks if a URL is valid
+ Checks if a given URL is valid.
+
+ Args:
+ url (str): The URL string to be validated.
+
+ Returns:
+ bool: True if the URL is valid, False otherwise.
  """
  try:
  parsed_url = urlparse(url)
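
Expected behaviour of the normalisation helpers above, with illustrative values (url_add_protocol's exact output depends on code not fully shown in this diff):

    url_clean_protocol("https://www.example.com/page")   # -> "www.example.com/page"
    url_remove_www("https://www.example.com/page")        # -> "example.com/page"
    url_add_protocol("example.com/page")                  # expected: a URL prefixed with "https://" (and "www." if missing)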
@@ -108,9 +150,15 @@ def url_is_valid(url):
  return False


- def url_is_reachable(url):
+ def url_is_reachable(url: str) -> bool:
  """
- Checks if url is reachable (no 404 error...)
+ Checks if a given URL is reachable (i.e., does not return a 404 error or other HTTP errors).
+
+ Args:
+ url (str): The URL string to be checked for reachability.
+
+ Returns:
+ bool: True if the URL is reachable, False otherwise.
  """
  try:
  response = requests.get(url)
@@ -121,9 +169,15 @@ def url_is_reachable(url):
  return False # HTTP error occurred, URL is not reachable


- def scrape(url):
+ def scrape(url: str) -> requests.Response:
  """
- Get requests and return full response
+ Sends a GET request to the given URL and returns the full response.
+
+ Args:
+ url (str): The URL to be requested.
+
+ Returns:
+ requests.Response: The full response from the GET request.
  """
  try:
  response = requests.get(url)
@@ -133,11 +187,17 @@ def scrape(url):
  return response


- def justext_parse_content(response, languages=["English", "French"]):
- """
- Return main content from a HTML response
+ def justext_parse_content(response: requests.Response, languages: list = ["English", "French"]) -> str:
  """
+ Extracts and returns the main content from an HTML response using jusText.
+
+ Args:
+ response (requests.Response): The HTTP response object containing the HTML content.
+ languages (list): A list of languages to use for stopword lists in jusText. Default is ["English", "French"].

+ Returns:
+ str: The extracted main content from the HTML response.
+ """
  stoplist = frozenset()
  for lang in languages:
@@ -158,9 +218,15 @@ def justext_parse_content(response, languages=["English", "French"]):
  return concatenated_text


- def trafilatura_parse_content(response):
+ def trafilatura_parse_content(response: requests.Response) -> str:
  """
- Return main content from a HTML response
+ Extracts and returns the main content from an HTML response using Trafilatura.
+
+ Args:
+ response (requests.Response): The HTTP response object containing the HTML content.
+
+ Returns:
+ str: The extracted main content from the HTML response.
  """
  try:
  text = extract(response.content)
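
A minimal sketch chaining the fetch and extraction helpers above (the URL is hypothetical; both extractors accept the same response object):

    resp = scrape("https://example.com/some-article")    # hypothetical URL
    if resp is not None:
        main_text = justext_parse_content(resp)           # or: trafilatura_parse_content(resp)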
@@ -171,13 +237,13 @@ def trafilatura_parse_content(response):


  def process_scraping(
- url,
- path,
- method="justext",
- languages=["English", "French"],
- title=True,
- meta=True,
- lst_properties=[
+ url: str,
+ path: str,
+ method: str = "justext",
+ languages: list = ["English", "French"],
+ title: bool = True,
+ meta: bool = True,
+ lst_properties: list = [
  "og:site_name",
  "og:url",
  "og:title",
@@ -204,7 +270,23 @@ def process_scraping(
  "al:android:package",
  "al:android:app_name",
  ],
- ):
+ ) -> dict:
+ """
+ Process scraping of a URL, extract main content, title, and meta properties,
+ and store the results in a JSON file.
+
+ Args:
+ url (str): The URL to scrape.
+ path (str): The directory path where the scraped data JSON file will be saved.
+ method (str, optional): The method to use for content extraction ('justext' or 'trafilatura'). Defaults to 'justext'.
+ languages (list, optional): A list of languages for stopword lists in jusText. Defaults to ["English", "French"].
+ title (bool, optional): Whether to extract the title from the HTML. Defaults to True.
+ meta (bool, optional): Whether to extract meta properties from the HTML. Defaults to True.
+ lst_properties (list, optional): List of specific meta properties to extract. Defaults to a comprehensive list.
+
+ Returns:
+ dict or None: A dictionary containing the extracted data and file path if successful, or None if an error occurs.
+ """
  try:

  # We name the files
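
A hedged usage sketch of process_scraping (the output directory is hypothetical; per its docstring the result is also written to a JSON file under path):

    result = process_scraping(
        url="https://example.com/some-article",   # hypothetical URL
        path="data/scraped",                      # hypothetical output directory
        method="trafilatura",
        title=True,
        meta=True,
    )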
@@ -257,9 +339,15 @@ def process_scraping(
  return None


- def parse_title(response):
+ def parse_title(response: requests.Response) -> str:
  """
- Return webpage title
+ Extracts and returns the webpage title from an HTML response.
+
+ Parameters:
+ response (requests.Response): The HTTP response object containing the HTML content.
+
+ Returns:
+ str or None: The extracted title text if found, None if no title tag is found or an error occurs.
  """
  try:
  soup = BeautifulSoup(response.content, "html.parser")
@@ -277,9 +365,7 @@ def parse_title(response):
  return None


- def get_meta_properties(
- response,
- lst_properties=[
+ def get_meta_properties(response: requests.Response, lst_properties: list = [
  "og:site_name",
  "og:url",
  "og:title",
@@ -305,10 +391,16 @@ def get_meta_properties(
  "al:ios:app_name",
  "al:android:package",
  "al:android:app_name",
- ],
- ):
+ ]) -> dict:
  """
- Parse a list of meta tags from a webpage and returns a dict
+ Extracts specified meta properties from a webpage and returns them as a dictionary.
+
+ Args:
+ response (requests.Response): The HTTP response object containing the HTML content.
+ lst_properties (list, optional): A list of meta property names to extract. Defaults to a comprehensive list.
+
+ Returns:
+ dict or None: A dictionary mapping meta property names to their content values if found, or None if an error occurs.
  """
  try:

@@ -338,14 +430,14 @@ def get_meta_properties(


  def parallel_scraping(
- urls,
- path,
- max_workers=8,
- method="justext",
- languages=["English", "French"],
- title=True,
- meta=True,
- lst_properties=[
+ urls: list,
+ path: str,
+ max_workers: int = 8,
+ method: str = "justext",
+ languages: list = ["English", "French"],
+ title: bool = True,
+ meta: bool = True,
+ lst_properties: list = [
  "og:site_name",
  "og:url",
  "og:title",
@@ -373,9 +465,19 @@ def parallel_scraping(
  "al:android:app_name",
  ],
  ):
-
  """
- Execute concurrent threads to scrape multiple webpages
+ Execute concurrent threads to scrape multiple webpages.
+
+ Args:
+ urls (list): List of URLs to scrape.
+ path (str): The directory path where scraped data will be saved.
+ max_workers (int, optional): Maximum number of concurrent threads. Defaults to 8.
+ method (str, optional): Method to use for content extraction ('justext' or 'trafilatura'). Defaults to 'justext'.
+ languages (list, optional): Languages for stopword lists in jusText. Defaults to ['English', 'French'].
+ title (bool, optional): Whether to extract title from HTML. Defaults to True.
+ meta (bool, optional): Whether to extract meta properties from HTML. Defaults to True.
+ lst_properties (list, optional): List of specific meta properties to extract. Defaults to a comprehensive list.
+
  """
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
  # Submit scraping tasks for each URL and add tqdm progress bar
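
An illustrative parallel_scraping call (URLs and output directory are hypothetical; each page is processed via process_scraping as described above):

    urls = ["https://example.com/a", "https://example.com/b"]   # hypothetical URLs
    parallel_scraping(urls, path="data/scraped", max_workers=4, method="justext")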
@@ -403,9 +505,17 @@ def parallel_scraping(
  print(f"Error scraping : {e}")


- def parse_scraped_webpages(path_json_files, output_path, name):
+ def parse_scraped_webpages(path_json_files: str, output_path: str, name: str) -> pd.DataFrame:
  """
- Parse JSON files captured by scraper
+ Parse JSON files containing scraped data and save the extracted data into a CSV file.
+
+ Args:
+ path_json_files (str): Directory path containing JSON files.
+ output_path (str): Directory path where the CSV file will be saved.
+ name (str): Name of the CSV file.
+
+ Returns:
+ pd.DataFrame: DataFrame containing the parsed data from JSON files.
  """
  extracted_data = []

@@ -421,10 +531,20 @@ def parse_scraped_webpages(path_json_files, output_path, name):
  save_dataframe_csv(df, output_path, name)
  return df

- def download_file(url:str, path:str):
- '''
- Download a file using a URL and write in a local file
- '''
+ def download_file(url: str, path: str) -> None:
+ """
+ Download a file from a URL and save it locally.
+
+ Args:
+ url (str): The URL of the file to download.
+ path (str): The local path where the file will be saved.
+
+ Raises:
+ requests.exceptions.RequestException: If an HTTP error occurs during the request.
+
+ Returns:
+ None
+ """
  try:
  response = requests.get(url)
  response.raise_for_status() # Raise an HTTPError for bad responses
@@ -435,10 +555,17 @@ def download_file(url:str, path:str):
  except requests.exceptions.RequestException as e:
  print(f"Error downloading file: {url, e}")

- def parallel_dl(urls, paths, max_workers=8):
-
+ def parallel_dl(urls: list, paths: list, max_workers: int = 8) -> None:
  """
- Execute concurrent threads to scrape multiple webpages
+ Execute concurrent threads to download multiple files from URLs and save them locally.
+
+ Args:
+ urls (list): List of URLs to download files from.
+ paths (list): List of local paths where downloaded files will be saved.
+ max_workers (int, optional): Maximum number of concurrent threads. Defaults to 8.
+
+ Returns:
+ None
  """
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
  # Submit scraping tasks for each URL and add tqdm progress bar
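
Finally, a hedged sketch of the download helpers (URLs and local paths are hypothetical; urls and paths are presumably consumed pairwise, so they should be the same length):

    urls = ["https://example.com/img1.jpg", "https://example.com/img2.jpg"]   # hypothetical
    paths = ["downloads/img1.jpg", "downloads/img2.jpg"]                      # hypothetical
    download_file(urls[0], paths[0])            # single download
    parallel_dl(urls, paths, max_workers=4)     # concurrent downloads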