opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +120 -21
- opsci_toolbox/apis/webscraping.py +186 -59
- opsci_toolbox/apis/youtube_helpers.py +103 -16
- opsci_toolbox/helpers/common.py +368 -254
- opsci_toolbox/helpers/cv.py +50 -60
- opsci_toolbox/helpers/dataviz.py +255 -184
- opsci_toolbox/helpers/dates.py +17 -18
- opsci_toolbox/helpers/nlp.py +154 -114
- opsci_toolbox/helpers/nlp_cuml.py +389 -36
- opsci_toolbox/helpers/sna.py +509 -0
- opsci_toolbox/helpers/sql.py +53 -0
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/METADATA +14 -9
- opsci_toolbox-0.0.8.dist-info/RECORD +22 -0
- opsci_toolbox-0.0.6.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,26 @@
|
|
1
1
|
import pandas as pd
|
2
|
-
from googleapiclient.discovery import build
|
2
|
+
from googleapiclient.discovery import build, Resource
|
3
3
|
import re
|
4
|
-
from
|
4
|
+
from opsci_toolbox.helpers.common import write_jsonl, read_json
|
5
5
|
import time
|
6
6
|
from datetime import datetime,timedelta
|
7
|
-
from
|
7
|
+
from opsci_toolbox.helpers.nlp import remove_extra_spaces
|
8
8
|
import os
|
9
9
|
|
10
10
|
#########################################################################################
|
11
11
|
# HELPERS
|
12
12
|
#########################################################################################
|
13
13
|
|
14
|
-
def YT_duration_to_milliseconds(duration):
|
14
|
+
def YT_duration_to_milliseconds(duration: str) -> int:
|
15
|
+
"""
|
16
|
+
Convert an ISO 8601 duration string to milliseconds.
|
17
|
+
|
18
|
+
Args:
|
19
|
+
duration (str): The ISO 8601 duration string (e.g., 'PT1H2M3S' for 1 hour, 2 minutes, and 3 seconds).
|
20
|
+
|
21
|
+
Returns:
|
22
|
+
int: The total duration in milliseconds. Returns None if the duration string is invalid.
|
23
|
+
"""
|
15
24
|
# Regular expression to match ISO 8601 duration format
|
16
25
|
duration_pattern = re.compile(r'P(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')
|
17
26
|
|
@@ -32,7 +41,22 @@ def YT_duration_to_milliseconds(duration):
|
|
32
41
|
|
33
42
|
return total_milliseconds
|
34
43
|
|
35
|
-
def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "publishedAfter", col_publishedBefore = "publishedBefore", date_format = '%Y-%m-%dT%H:%M:%SZ', rolling_days = 7 ):
|
44
|
+
def create_queries_per_period(query : dict, publishedAfter : str, publishedBefore : str, col_publishedAfter : str = "publishedAfter", col_publishedBefore : str = "publishedBefore", date_format : str = '%Y-%m-%dT%H:%M:%SZ', rolling_days : int = 7 ) -> list:
|
45
|
+
"""
|
46
|
+
Generate a list of query dictionaries with specific date ranges based on a rolling window.
|
47
|
+
|
48
|
+
Args:
|
49
|
+
query (Dict[str, str]): The base query dictionary to be modified with date ranges.
|
50
|
+
publishedAfter (str): The start date of the entire period in the specified date_format.
|
51
|
+
publishedBefore (str): The end date of the entire period in the specified date_format.
|
52
|
+
col_publishedAfter (str, optional): The key for the start date in the query dictionary. Defaults to "publishedAfter".
|
53
|
+
col_publishedBefore (str, optional): The key for the end date in the query dictionary. Defaults to "publishedBefore".
|
54
|
+
date_format (str, optional): The date format used for parsing and formatting dates. Defaults to '%Y-%m-%dT%H:%M:%SZ'.
|
55
|
+
rolling_days (int, optional): The number of days in each rolling period. Defaults to 7.
|
56
|
+
|
57
|
+
Returns:
|
58
|
+
List[Dict[str, str]]: A list of query dictionaries with date ranges.
|
59
|
+
"""
|
36
60
|
datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
|
37
61
|
datetime_publishedBefore = datetime.strptime(publishedBefore, date_format)
|
38
62
|
|
@@ -62,14 +86,32 @@ def create_queries_per_period(query, publishedAfter, publishedBefore, col_publis
|
|
62
86
|
# API queries functions
|
63
87
|
#########################################################################################
|
64
88
|
|
65
|
-
def YT_client(api_key, api_service_name="youtube", api_version="v3"):
|
89
|
+
def YT_client(api_key: str, api_service_name: str = "youtube", api_version: str = "v3") -> Resource:
|
66
90
|
"""
|
67
|
-
Instantiate a new client using an API
|
91
|
+
Instantiate a new YouTube client using an API key.
|
92
|
+
|
93
|
+
Args:
|
94
|
+
api_key (str): The API key for accessing the YouTube Data API.
|
95
|
+
api_service_name (str, optional): The name of the API service. Defaults to "youtube".
|
96
|
+
api_version (str, optional): The version of the API service. Defaults to "v3".
|
97
|
+
|
98
|
+
Returns:
|
99
|
+
googleapiclient.discovery.Resource: The instantiated YouTube client.
|
68
100
|
"""
|
69
101
|
client = build(api_service_name, api_version, developerKey=api_key)
|
70
102
|
return client
|
71
103
|
|
72
|
-
def check_keys(lst_api_keys):
|
104
|
+
def check_keys(lst_api_keys : list) -> tuple:
|
105
|
+
"""
|
106
|
+
Check a list of API keys and determine if any of them have available quota.
|
107
|
+
|
108
|
+
Args:
|
109
|
+
lst_api_keys (List[str]): A list of file paths to JSON files containing API key data.
|
110
|
+
|
111
|
+
Returns:
|
112
|
+
Tuple[str, dict, bool]: A tuple containing the filename (without extension) of the first API key file with available quota,
|
113
|
+
the API key data as a dictionary, and a boolean indicating if a valid key was found.
|
114
|
+
"""
|
73
115
|
status_ok = False
|
74
116
|
for key_idx, apifile_path in enumerate(lst_api_keys):
|
75
117
|
api_filename = os.path.splitext(os.path.basename(apifile_path))[0]
|
@@ -85,9 +127,17 @@ def check_keys(lst_api_keys):
|
|
85
127
|
return api_filename, api_key_data, status_ok
|
86
128
|
|
87
129
|
|
88
|
-
def search_videos(client, query_dict, next_token) :
|
130
|
+
def search_videos(client: Resource, query_dict : dict, next_token : str) -> tuple :
|
89
131
|
"""
|
90
|
-
Query to search for videos using a string query and a
|
132
|
+
Query to search for videos using a string query and a dictionary of parameters.
|
133
|
+
|
134
|
+
Args:
|
135
|
+
client (googleapiclient.discovery.Resource): The YouTube Data API client.
|
136
|
+
query_dict (Dict[str, Any]): A dictionary containing query parameters.
|
137
|
+
next_token (str, optional): The token for the next page of results. Defaults to None.
|
138
|
+
|
139
|
+
Returns:
|
140
|
+
Tuple[List[Dict[str, Any]], str, int]: A tuple containing a list of search results, the next page token, and the total number of results.
|
91
141
|
"""
|
92
142
|
try:
|
93
143
|
if next_token is None:
|
@@ -134,9 +184,21 @@ def search_videos(client, query_dict, next_token) :
|
|
134
184
|
total_results = 0
|
135
185
|
return results, next_token, total_results
|
136
186
|
|
137
|
-
def process_search_videos(client, query_dict, limit, query_id, json_path, next_token = None):
|
187
|
+
def process_search_videos(client : Resource, query_dict: dict, limit : int, query_id : str, json_path : str, next_token : str = None) -> tuple:
|
138
188
|
"""
|
139
|
-
|
189
|
+
Process to iterate over pages of video search results and store JSON response in case of quota limit.
|
190
|
+
|
191
|
+
Args:
|
192
|
+
client (googleapiclient.discovery.Resource): The YouTube Data API client.
|
193
|
+
query_dict (Dict[str, Any]): A dictionary containing query parameters.
|
194
|
+
limit (int): The maximum number of pages to retrieve.
|
195
|
+
query_id (str): An identifier for the query.
|
196
|
+
json_path (str): The directory path where JSONL files will be saved.
|
197
|
+
next_token (str, optional): The token for the next page of results. Defaults to None.
|
198
|
+
|
199
|
+
Returns:
|
200
|
+
Tuple[List[Dict[str, Any]], int, str, int]: A tuple containing the list of results, total results count,
|
201
|
+
next page token, and the counter of processed pages.
|
140
202
|
"""
|
141
203
|
counter=0
|
142
204
|
results =[]
|
@@ -171,9 +233,16 @@ def process_search_videos(client, query_dict, limit, query_id, json_path, next_t
|
|
171
233
|
print(e)
|
172
234
|
return results, total_results, next_token, counter
|
173
235
|
|
174
|
-
def get_video_details(client, lst):
|
236
|
+
def get_video_details(client : Resource, lst : list) -> dict:
|
175
237
|
"""
|
176
|
-
Query to get video details
|
238
|
+
Query to get video details.
|
239
|
+
|
240
|
+
Args:
|
241
|
+
client (googleapiclient.discovery.Resource): The YouTube Data API client.
|
242
|
+
lst (List[str]): A list of video IDs to fetch details for.
|
243
|
+
|
244
|
+
Returns:
|
245
|
+
List[Dict[str, Any]]: A list of dictionaries containing video details.
|
177
246
|
"""
|
178
247
|
|
179
248
|
try:
|
@@ -195,7 +264,16 @@ def get_video_details(client, lst):
|
|
195
264
|
# Parsing functions
|
196
265
|
#########################################################################################
|
197
266
|
|
198
|
-
def parse_video_details(lst_items):
|
267
|
+
def parse_video_details(lst_items : list) -> pd.DataFrame:
|
268
|
+
"""
|
269
|
+
Parse video details from a list of video items.
|
270
|
+
|
271
|
+
Args:
|
272
|
+
lst_items (List[Dict[str, Any]]): List of dictionaries containing video details.
|
273
|
+
|
274
|
+
Returns:
|
275
|
+
pd.DataFrame: DataFrame containing the parsed video details.
|
276
|
+
"""
|
199
277
|
all_records =[]
|
200
278
|
for item in lst_items:
|
201
279
|
video_id = item.get("id", None)
|
@@ -251,7 +329,16 @@ def parse_video_details(lst_items):
|
|
251
329
|
"license", "embeddable", "madeForKids"])
|
252
330
|
return df
|
253
331
|
|
254
|
-
def parse_search_results(jsonl_data):
|
332
|
+
def parse_search_results(jsonl_data : list) -> pd.DataFrame:
|
333
|
+
"""
|
334
|
+
Parse search results from JSONL data to extract video details.
|
335
|
+
|
336
|
+
Args:
|
337
|
+
jsonl_data (List[Dict[str, Any]]): List of dictionaries containing video search results.
|
338
|
+
|
339
|
+
Returns:
|
340
|
+
pd.DataFrame: DataFrame containing the parsed video details.
|
341
|
+
"""
|
255
342
|
all_records =[]
|
256
343
|
for json in jsonl_data:
|
257
344
|
video_id = json.get("id", {}).get("videoId", "")
|