opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
- opsci_toolbox/apis/rapidapi_helpers.py +120 -21
- opsci_toolbox/apis/webscraping.py +186 -59
- opsci_toolbox/apis/youtube_helpers.py +103 -16
- opsci_toolbox/helpers/common.py +368 -254
- opsci_toolbox/helpers/cv.py +50 -60
- opsci_toolbox/helpers/dataviz.py +255 -184
- opsci_toolbox/helpers/dates.py +17 -18
- opsci_toolbox/helpers/nlp.py +154 -114
- opsci_toolbox/helpers/nlp_cuml.py +389 -36
- opsci_toolbox/helpers/sna.py +509 -0
- opsci_toolbox/helpers/sql.py +53 -0
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.9.dist-info}/METADATA +12 -9
- opsci_toolbox-0.0.9.dist-info/RECORD +22 -0
- opsci_toolbox-0.0.7.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.9.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.7.dist-info → opsci_toolbox-0.0.9.dist-info}/top_level.txt +0 -0
opsci_toolbox/apis/rapidapi_helpers.py

@@ -7,7 +7,30 @@ from datetime import datetime,timedelta
 from opsci_toolbox.helpers.dates import str_to_datetime
 from opsci_toolbox.helpers.common import write_jsonl

-def create_queries_per_period(
+def create_queries_per_period(
+    query: dict,
+    publishedAfter: str,
+    publishedBefore: str,
+    col_publishedAfter: str = "start_date",
+    col_publishedBefore: str = "end_date",
+    date_format: str = '%Y-%m-%d',
+    rolling_days: int = 7
+) -> list:
+    """
+    Generates a list of query dictionaries with date ranges for a rolling period.
+
+    Args:
+        query (dict): The base query dictionary to be modified with date ranges.
+        publishedAfter (str): The start date in string format.
+        publishedBefore (str): The end date in string format.
+        col_publishedAfter (str, optional): The key name for the start date in the query dictionary. Defaults to "start_date".
+        col_publishedBefore (str, optional): The key name for the end date in the query dictionary. Defaults to "end_date".
+        date_format (str, optional): The format of the input date strings. Defaults to '%Y-%m-%d'.
+        rolling_days (int, optional): The number of days for each rolling period. Defaults to 7.
+
+    Returns:
+        list: A list of query dictionaries with updated date ranges.
+    """
     datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
     datetime_publishedBefore = datetime.strptime(publishedBefore, date_format)

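The new `create_queries_per_period` signature lends itself to a simple loop over rolling date windows. Below is a minimal usage sketch (not part of the diff) based only on the signature and docstring above; the base query content and the exact formatting of the generated date strings are assumptions.

```python
from opsci_toolbox.apis.rapidapi_helpers import create_queries_per_period

# Placeholder base query; any RapidAPI query dict should work here.
base_query = {"query": "#opensource", "language": "en"}

# Split January 2024 into 7-day slices, one query dict per slice.
queries = create_queries_per_period(
    query=base_query,
    publishedAfter="2024-01-01",
    publishedBefore="2024-02-01",
    rolling_days=7,
)

# Each returned dict carries the default "start_date" / "end_date" keys.
for q in queries:
    print(q["start_date"], "->", q["end_date"])
```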
@@ -34,17 +57,31 @@ def create_queries_per_period(query, publishedAfter, publishedBefore, col_publis
     return queries


-def remove_extra_spaces(text):
+def remove_extra_spaces(text: str) -> str:
     """
-
+    Removes extra spaces from the input text, including leading and trailing spaces.
+
+    Args:
+        text (str): The input text from which extra spaces should be removed.
+
+    Returns:
+        str: The cleaned text with extra spaces removed.
     """
     cleaned_text = re.sub(r'\s+', ' ', text)
     return cleaned_text.strip()


-def query_rapidAPI(url: str, query_dict: dict, host: str):
+def query_rapidAPI(url: str, query_dict: dict, host: str)-> requests.Response:
     """
-    Function to query RapidAPI
+    Function to query RapidAPI.
+
+    Args:
+        url (str): The URL for the RapidAPI endpoint.
+        query_dict (dict): A dictionary containing query parameters.
+        host (str): The RapidAPI host.
+
+    Returns:
+        requests.Response: The response object from the RapidAPI request, or None if an error occurs.
     """

     headers = {
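A hedged sketch of how the updated `query_rapidAPI` return type can feed the tweet parsers further down in this module. The endpoint URL, host, and the "results" key of the JSON payload are assumptions drawn from the twitter154 API referenced in the comments below; the RapidAPI key handling (the `headers` block truncated in this hunk) is not shown in the diff.

```python
from opsci_toolbox.apis.rapidapi_helpers import query_rapidAPI, parse_tweet

# Illustrative endpoint and host for the twitter154 RapidAPI mentioned below.
url = "https://twitter154.p.rapidapi.com/search/search"
params = {"query": "#opensource", "limit": "20"}

response = query_rapidAPI(url, params, host="twitter154.p.rapidapi.com")
if response is not None and response.status_code == 200:
    # The payload is assumed to expose the tweet list under "results".
    df = parse_tweet(response.json().get("results", []))
    print(df.shape)
```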
@@ -62,7 +99,16 @@ def query_rapidAPI(url: str, query_dict: dict, host: str):
     return response


-def response_header(response):
+def response_header(response: requests.Response) -> dict:
+    """
+    Retrieves the headers from an HTTP response object.
+
+    Args:
+        response: The HTTP response object from which headers are to be retrieved.
+
+    Returns:
+        dict: The headers of the HTTP response.
+    """
     return response.headers

 ##################################################################################################
@@ -70,9 +116,15 @@ def response_header(response):
 # https://rapidapi.com/omarmhaimdat/api/twitter154
 ##################################################################################################

-def parse_user(user : dict):
+def parse_user(user: dict) -> tuple:
     """
-    Parse the subdict related to user data
+    Parse the subdict related to user data.
+
+    Args:
+        user (dict): Dictionary containing user data.
+
+    Returns:
+        tuple: A tuple containing parsed user data fields.
     """
     if user:
         user_creation_date=user.get("creation_date","")
@@ -109,9 +161,15 @@ def parse_user(user : dict):
         record = (user_creation_date, user_id, user_username, user_name, user_follower_count, user_following_count, user_favourites_count, user_is_private, user_is_verified, user_is_blue_verified, user_location, user_profile_pic_url, user_profile_banner_url, user_description, user_external_url, user_number_of_tweets, user_bot, user_timestamp, user_has_nft_avatar,user_category, user_default_profile, user_default_profile_image, user_listed_count, user_verified_type)
     return record

-def parse_retweet(data):
+def parse_retweet(data: dict) -> tuple:
     """
-    Parse subdict related to original tweet if the captured tweet is RT
+    Parse subdict related to original tweet if the captured tweet is RT.
+
+    Args:
+        data (dict): Dictionary containing tweet data.
+
+    Returns:
+        tuple: A tuple containing parsed tweet data fields.
     """
     if data:
         tweet_id=data.get("tweet_id", "")
@@ -150,9 +208,15 @@ def parse_retweet(data):
         record=(tweet_id, creation_date, text,media_url, video_url, language, favorite_count, retweet_count, reply_count, quote_count, retweet, views, timestamp, video_view_count,in_reply_to_status_id, quoted_status_id, expanded_url, retweet_tweet_id,conversation_id,bookmark_count, source,community_note)
     return record

-def parse_entities(extended_entities):
+def parse_entities(extended_entities: dict) -> tuple:
     """
-    Parse the subdict related to extended entities (image, video, tags...)
+    Parse the subdict related to extended entities (image, video, tags...).
+
+    Args:
+        extended_entities (dict): Dictionary containing extended entities data.
+
+    Returns:
+        tuple: A tuple containing parsed extended entities data fields.
     """
     id_str, indices, media_key, media_url, media_type, original_info, height, width, ext_alt_text, monetizable, aspect_ratio, duration_millis = [], [], [], [], [], [], [], [], [], [], [], []
     all_x, all_y, all_h, all_w =[], [], [], []
@@ -222,9 +286,15 @@ def parse_entities(extended_entities):
     record = (id_str, indices, media_key, media_url, media_type, all_x, all_y, all_h, all_w, height, width, ext_alt_text, all_tag_user_id, all_tag_user_screenname, all_tag_user_type, monetizable, aspect_ratio, duration_millis, all_variants_url, all_variants_bitrate, all_variants_content_type)
     return record

-def parse_tweet(json_data):
+def parse_tweet(json_data: list) -> pd.DataFrame:
     """
-    Parse a batch of tweets
+    Parse a batch of tweets.
+
+    Args:
+        json_data (list): List of dictionaries containing tweet data.
+
+    Returns:
+        pd.DataFrame: A pandas DataFrame containing parsed tweet data.
     """
     all_records=[]
     for data in json_data:
@@ -279,9 +349,15 @@ def parse_tweet(json_data):
     df = pd.DataFrame.from_records(all_records, columns = all_cols)
     return df

-def parse_twitter_list_details(json_data):
+def parse_twitter_list_details(json_data : dict) -> pd.DataFrame:
     """
-    Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154
+    Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154.
+
+    Args:
+        json_data (dict): Dictionary containing list details data.
+
+    Returns:
+        pd.DataFrame: A pandas DataFrame containing parsed list details.
     """
     list_id = json_data.get("list_id", "")
     list_id_str = json_data.get("list_id_str", "")
@@ -304,7 +380,16 @@ def parse_twitter_list_details(json_data):
 # https://instagram-scraper2.p.rapidapi.com/hash_tag_medias_v2
 ######################################################################################

-def instagram_parse_hashtag_data(hashtag_data):
+def instagram_parse_hashtag_data(hashtag_data: dict)-> pd.DataFrame:
+    """
+    Parse Instagram hashtag data into a DataFrame.
+
+    Args:
+        hashtag_data (dict): Dictionary containing Instagram hashtag data.
+
+    Returns:
+        pd.DataFrame: A pandas DataFrame containing parsed hashtag data.
+    """
     hashtag_id = hashtag_data.get("id")
     hashtag_name = hashtag_data.get("name")
     allow_following = hashtag_data.get("allow_following")
@@ -354,9 +439,17 @@ def instagram_parse_hashtag_data(hashtag_data):
 # function to parse Twitter data
 # https://rapidapi.com/twttrapi-twttrapi-default/api/twttrapi
 ######################################################################################
-def compile_list_entries(json_data, path_json, filename):
+def compile_list_entries(json_data: dict, path_json: str, filename: str)-> tuple:
     """
-    Function to
+    Function to process list entries from Twitter API response and write to JSONL file. https://twttrapi.p.rapidapi.com/list-members
+
+    Args:
+        json_data (dict): JSON response data from Twitter API.
+        path_json (str): Path to directory where JSONL file will be saved.
+        filename (str): Name of the JSONL file.
+
+    Returns:
+        tuple: A tuple containing a list of results (user legacy data) and next cursor (str or None).
     """
     results = []
     next_cursor = None
@@ -377,9 +470,15 @@ def compile_list_entries(json_data, path_json, filename):
     return results, next_cursor


-def parse_list_entries(jsonl_data):
+def parse_list_entries(jsonl_data: list)-> pd.DataFrame:
     """
-
+    Parse list details from JSONL data obtained from the Twitter API.
+
+    Args:
+        jsonl_data (list): List of dictionaries containing JSON data.
+
+    Returns:
+        pd.DataFrame: DataFrame containing parsed list details.
     """
     all_records=[]
     for data in jsonl_data:
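A small sketch (not part of the diff) of re-reading the JSONL written by `compile_list_entries` and handing the records to `parse_list_entries`. The file name is hypothetical and the reload uses plain `json` rather than any toolbox helper.

```python
import json
from opsci_toolbox.apis.rapidapi_helpers import parse_list_entries

# Hypothetical JSONL file produced earlier by compile_list_entries.
with open("list_members.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

df = parse_list_entries(records)
print(df.head())
```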
opsci_toolbox/apis/webscraping.py

@@ -12,18 +12,30 @@ import pandas as pd
 from tqdm import tqdm


-def url_get_domain(url):
+def url_get_domain(url: str) -> str:
     """
-
+    Extracts and returns the domain name from a given URL.
+
+    Args:
+        url (str): The URL string from which the domain name is to be extracted.
+
+    Returns:
+        str: The domain name extracted from the URL.
     """
     parsed_url = urlparse(url)
     domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
     return domain


-def url_get_extension(url):
+def url_get_extension(url: str) -> str:
     """
-
+    Extracts and returns the extension (TLD) of the domain name from a given URL.
+
+    Args:
+        url (str): The URL string from which the domain extension is to be extracted.
+
+    Returns:
+        str: The extension (TLD) of the domain name extracted from the URL.
     """
     # Parse the URL using urlparse
     parsed_url = urlparse(url)
@@ -34,15 +46,21 @@ def url_get_extension(url):
     # Split the netloc by '.' to get the domain and TLD
     domain_parts = netloc.split(".")

-    # Get the last
+    # Get the last part, which represents the TLD
     extension = ".".join(domain_parts[-1:])

     return extension


-def url_clean_parameters(url):
+def url_clean_parameters(url: str) -> str:
     """
-
+    Removes query parameters and UTM tags from a given URL and returns the cleaned URL.
+
+    Args:
+        url (str): The URL string from which parameters and UTM tags are to be removed.
+
+    Returns:
+        str: The cleaned URL without any parameters or UTM tags.
     """
     parsed_url = urlparse(url)
     netloc = parsed_url.netloc if parsed_url.netloc else ""
@@ -50,9 +68,15 @@ def url_clean_parameters(url):
     return netloc + path


-def url_clean_protocol(url):
+def url_clean_protocol(url: str) -> str:
     """
-
+    Removes the 'http://' or 'https://' prefix from a given URL.
+
+    Args:
+        url (str): The URL string from which the protocol is to be removed.
+
+    Returns:
+        str: The URL without the 'http://' or 'https://' prefix.
     """
     prefixes_to_remove = ["https://", "http://"]

@@ -64,9 +88,15 @@ def url_clean_protocol(url):
     return url


-def url_remove_www(url):
+def url_remove_www(url: str) -> str:
     """
-
+    Removes the 'www.' prefix from a given URL, along with any protocol prefix.
+
+    Args:
+        url (str): The URL string from which the 'www.' prefix is to be removed.
+
+    Returns:
+        str: The URL without the 'www.' prefix.
     """
     prefixes_to_remove = ["https://www.", "http://www.", "https://", "http://", "www."]

@@ -78,9 +108,15 @@ def url_remove_www(url):
     return url


-def url_add_protocol(url):
+def url_add_protocol(url: str) -> str:
     """
-
+    Ensures the given URL has a protocol ('https://') and 'www.' prefix if necessary.
+
+    Args:
+        url (str): The URL string to be formatted with protocol and 'www.' prefix if required.
+
+    Returns:
+        str: The formatted URL with protocol and 'www.' prefix if it was missing.
     """
     parsed_url = urlparse(url)

@@ -96,9 +132,15 @@ def url_add_protocol(url):
     return url


-def url_is_valid(url):
+def url_is_valid(url: str) -> bool:
     """
-    Checks if a URL is valid
+    Checks if a given URL is valid.
+
+    Args:
+        url (str): The URL string to be validated.
+
+    Returns:
+        bool: True if the URL is valid, False otherwise.
     """
     try:
         parsed_url = urlparse(url)
@@ -108,9 +150,15 @@ def url_is_valid(url):
         return False


-def url_is_reachable(url):
+def url_is_reachable(url: str) -> bool:
     """
-    Checks if
+    Checks if a given URL is reachable (i.e., does not return a 404 error or other HTTP errors).
+
+    Args:
+        url (str): The URL string to be checked for reachability.
+
+    Returns:
+        bool: True if the URL is reachable, False otherwise.
     """
     try:
         response = requests.get(url)
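The URL helpers above chain naturally. A minimal sketch (not part of the diff), with expected values inferred from the context lines of the hunks (for instance, `url_get_domain` returns `urlparse(url).hostname`, so the `www.` prefix is kept):

```python
from opsci_toolbox.apis.webscraping import (
    url_get_domain,
    url_get_extension,
    url_clean_parameters,
    url_is_valid,
)

u = "https://www.example.com/some/page?utm_source=newsletter&id=42"
print(url_get_domain(u))        # www.example.com
print(url_get_extension(u))     # com
print(url_clean_parameters(u))  # www.example.com/some/page
print(url_is_valid(u))          # True
```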
@@ -121,9 +169,15 @@ def url_is_reachable(url):
         return False # HTTP error occurred, URL is not reachable


-def scrape(url):
+def scrape(url: str) -> requests.Response:
     """
-
+    Sends a GET request to the given URL and returns the full response.
+
+    Args:
+        url (str): The URL to be requested.
+
+    Returns:
+        requests.Response: The full response from the GET request.
     """
     try:
         response = requests.get(url)
@@ -133,11 +187,17 @@ def scrape(url):
     return response


-def justext_parse_content(response, languages=["English", "French"]):
-    """
-    Return main content from a HTML response
+def justext_parse_content(response: requests.Response, languages: list = ["English", "French"]) -> str:
     """
+    Extracts and returns the main content from an HTML response using jusText.
+
+    Args:
+        response (requests.Response): The HTTP response object containing the HTML content.
+        languages (list): A list of languages to use for stopword lists in jusText. Default is ["English", "French"].

+    Returns:
+        str: The extracted main content from the HTML response.
+    """
     stoplist = frozenset()

     for lang in languages:
@@ -158,9 +218,15 @@ def justext_parse_content(response, languages=["English", "French"]):
     return concatenated_text


-def trafilatura_parse_content(response):
+def trafilatura_parse_content(response: requests.Response) -> str:
     """
-
+    Extracts and returns the main content from an HTML response using Trafilatura.
+
+    Args:
+        response (requests.Response): The HTTP response object containing the HTML content.
+
+    Returns:
+        str: The extracted main content from the HTML response.
     """
     try:
         text = extract(response.content)
@@ -171,13 +237,13 @@ def trafilatura_parse_content(response):


 def process_scraping(
-    url,
-    path,
-    method="justext",
-    languages=["English", "French"],
-    title=True,
-    meta=True,
-    lst_properties=[
+    url: str,
+    path: str,
+    method: str = "justext",
+    languages: list = ["English", "French"],
+    title: bool = True,
+    meta: bool = True,
+    lst_properties: list = [
         "og:site_name",
         "og:url",
         "og:title",
@@ -204,7 +270,23 @@ def process_scraping(
         "al:android:package",
         "al:android:app_name",
     ],
-):
+) -> dict:
+    """
+    Process scraping of a URL, extract main content, title, and meta properties,
+    and store the results in a JSON file.
+
+    Args:
+        url (str): The URL to scrape.
+        path (str): The directory path where the scraped data JSON file will be saved.
+        method (str, optional): The method to use for content extraction ('justext' or 'trafilatura'). Defaults to 'justext'.
+        languages (list, optional): A list of languages for stopword lists in jusText. Defaults to ["English", "French"].
+        title (bool, optional): Whether to extract the title from the HTML. Defaults to True.
+        meta (bool, optional): Whether to extract meta properties from the HTML. Defaults to True.
+        lst_properties (list, optional): List of specific meta properties to extract. Defaults to a comprehensive list.
+
+    Returns:
+        dict or None: A dictionary containing the extracted data and file path if successful, or None if an error occurs.
+    """
     try:

         # We name the files
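A hedged single-URL sketch (not part of the diff) for the newly documented `process_scraping`; the output directory is hypothetical and the exact keys of the returned dictionary are not spelled out in the diff.

```python
from opsci_toolbox.apis.webscraping import process_scraping

# Scrape one page with jusText and store the result as JSON under ./scraped.
record = process_scraping(
    "https://example.com/article",
    path="./scraped",
    method="justext",
    languages=["English", "French"],
)
if record is not None:
    print(record.keys())
```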
@@ -257,9 +339,15 @@ def process_scraping(
         return None


-def parse_title(response):
+def parse_title(response: requests.Response) -> str:
     """
-
+    Extracts and returns the webpage title from an HTML response.
+
+    Parameters:
+        response (requests.Response): The HTTP response object containing the HTML content.
+
+    Returns:
+        str or None: The extracted title text if found, None if no title tag is found or an error occurs.
     """
     try:
         soup = BeautifulSoup(response.content, "html.parser")
@@ -277,9 +365,7 @@ def parse_title(response):
     return None


-def get_meta_properties(
-    response,
-    lst_properties=[
+def get_meta_properties(response: requests.Response, lst_properties: list = [
         "og:site_name",
         "og:url",
         "og:title",
@@ -305,10 +391,16 @@ def get_meta_properties(
         "al:ios:app_name",
         "al:android:package",
         "al:android:app_name",
-    ]
-):
+]) -> dict:
     """
-
+    Extracts specified meta properties from a webpage and returns them as a dictionary.
+
+    Args:
+        response (requests.Response): The HTTP response object containing the HTML content.
+        lst_properties (list, optional): A list of meta property names to extract. Defaults to a comprehensive list.
+
+    Returns:
+        dict or None: A dictionary mapping meta property names to their content values if found, or None if an error occurs.
     """
     try:

@@ -338,14 +430,14 @@ def get_meta_properties(


 def parallel_scraping(
-    urls,
-    path,
-    max_workers=8,
-    method="justext",
-    languages=["English", "French"],
-    title=True,
-    meta=True,
-    lst_properties=[
+    urls: list,
+    path: str,
+    max_workers: int = 8,
+    method: str = "justext",
+    languages: list = ["English", "French"],
+    title: bool = True,
+    meta: bool = True,
+    lst_properties: list = [
         "og:site_name",
         "og:url",
         "og:title",
@@ -373,9 +465,19 @@ def parallel_scraping(
         "al:android:app_name",
     ],
 ):
-
     """
-    Execute concurrent threads to scrape multiple webpages
+    Execute concurrent threads to scrape multiple webpages.
+
+    Args:
+        urls (list): List of URLs to scrape.
+        path (str): The directory path where scraped data will be saved.
+        max_workers (int, optional): Maximum number of concurrent threads. Defaults to 8.
+        method (str, optional): Method to use for content extraction ('justext' or 'trafilatura'). Defaults to 'justext'.
+        languages (list, optional): Languages for stopword lists in jusText. Defaults to ['English', 'French'].
+        title (bool, optional): Whether to extract title from HTML. Defaults to True.
+        meta (bool, optional): Whether to extract meta properties from HTML. Defaults to True.
+        lst_properties (list, optional): List of specific meta properties to extract. Defaults to a comprehensive list.
+
     """
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         # Submit scraping tasks for each URL and add tqdm progress bar
@@ -403,9 +505,17 @@ def parallel_scraping(
                 print(f"Error scraping : {e}")


-def parse_scraped_webpages(path_json_files, output_path, name):
+def parse_scraped_webpages(path_json_files: str, output_path: str, name: str) -> pd.DataFrame:
     """
-    Parse JSON files
+    Parse JSON files containing scraped data and save the extracted data into a CSV file.
+
+    Args:
+        path_json_files (str): Directory path containing JSON files.
+        output_path (str): Directory path where the CSV file will be saved.
+        name (str): Name of the CSV file.
+
+    Returns:
+        pd.DataFrame: DataFrame containing the parsed data from JSON files.
     """
     extracted_data = []

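Putting the two threaded helpers together as documented above: `parallel_scraping` writes one JSON file per URL and `parse_scraped_webpages` consolidates them into a CSV and DataFrame. A hedged sketch (not part of the diff); paths and the worker count are illustrative, and whether the output directories must pre-exist is not stated in the diff.

```python
from opsci_toolbox.apis.webscraping import parallel_scraping, parse_scraped_webpages

urls = ["https://example.com", "https://example.org"]

# Scrape concurrently; results are stored as JSON files under ./scraped.
parallel_scraping(urls, path="./scraped", max_workers=4, method="trafilatura")

# Consolidate the JSON files into a single CSV and DataFrame.
df = parse_scraped_webpages("./scraped", output_path=".", name="scraped_pages")
print(len(df))
```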
@@ -421,10 +531,20 @@ def parse_scraped_webpages(path_json_files, output_path, name):
     save_dataframe_csv(df, output_path, name)
     return df

-def download_file(url:str, path:str):
-    """
-    Download a file
-    """
+def download_file(url: str, path: str) -> None:
+    """
+    Download a file from a URL and save it locally.
+
+    ARgs:
+        url (str): The URL of the file to download.
+        path (str): The local path where the file will be saved.
+
+    Raises:
+        requests.exceptions.RequestException: If an HTTP error occurs during the request.
+
+    Returns:
+        None
+    """
     try:
         response = requests.get(url)
         response.raise_for_status() # Raise an HTTPError for bad responses
@@ -435,10 +555,17 @@ def download_file(url:str, path:str):
     except requests.exceptions.RequestException as e:
         print(f"Error downloading file: {url, e}")

-def parallel_dl(urls, paths, max_workers=8):
-
+def parallel_dl(urls: list, paths: list, max_workers: int = 8) -> None:
     """
-    Execute concurrent threads to
+    Execute concurrent threads to download multiple files from URLs and save them locally.
+
+    Args:
+        urls (list): List of URLs to download files from.
+        paths (list): List of local paths where downloaded files will be saved.
+        max_workers (int, optional): Maximum number of concurrent threads. Defaults to 8.
+
+    Returns:
+        None
     """
     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         # Submit scraping tasks for each URL and add tqdm progress bar