opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,26 @@
1
1
  import pandas as pd
2
- from googleapiclient.discovery import build
2
+ from googleapiclient.discovery import build, Resource
3
3
  import re
4
- from lib.helpers import write_jsonl, read_json
4
+ from opsci_toolbox.helpers.common import write_jsonl, read_json
5
5
  import time
6
6
  from datetime import datetime,timedelta
7
- from lib.nlp_helpers import remove_extra_spaces
7
+ from opsci_toolbox.helpers.nlp import remove_extra_spaces
8
8
  import os
9
9
 
10
10
  #########################################################################################
11
11
  # HELPERS
12
12
  #########################################################################################
13
13
 
14
- def YT_duration_to_milliseconds(duration):
14
+ def YT_duration_to_milliseconds(duration: str) -> int:
15
+ """
16
+ Convert an ISO 8601 duration string to milliseconds.
17
+
18
+ Args:
19
+ duration (str): The ISO 8601 duration string (e.g., 'PT1H2M3S' for 1 hour, 2 minutes, and 3 seconds).
20
+
21
+ Returns:
22
+ Optional[int]: The total duration in milliseconds, or None if the duration string is invalid.
23
+ """
15
24
  # Regular expression to match ISO 8601 duration format
16
25
  duration_pattern = re.compile(r'P(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')
17
26
 
@@ -32,7 +41,22 @@ def YT_duration_to_milliseconds(duration):
32
41
 
33
42
  return total_milliseconds
34
43
 
35
- def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "publishedAfter", col_publishedBefore = "publishedBefore", date_format = '%Y-%m-%dT%H:%M:%SZ', rolling_days = 7 ):
44
+ def create_queries_per_period(query : dict, publishedAfter : str, publishedBefore : str, col_publishedAfter : str = "publishedAfter", col_publishedBefore : str = "publishedBefore", date_format : str = '%Y-%m-%dT%H:%M:%SZ', rolling_days : int = 7 ) -> list:
45
+ """
46
+ Generate a list of query dictionaries with specific date ranges based on a rolling window.
47
+
48
+ Args:
49
+ query (Dict[str, str]): The base query dictionary to be modified with date ranges.
50
+ publishedAfter (str): The start date of the entire period in the specified date_format.
51
+ publishedBefore (str): The end date of the entire period in the specified date_format.
52
+ col_publishedAfter (str, optional): The key for the start date in the query dictionary. Defaults to "publishedAfter".
53
+ col_publishedBefore (str, optional): The key for the end date in the query dictionary. Defaults to "publishedBefore".
54
+ date_format (str, optional): The date format used for parsing and formatting dates. Defaults to '%Y-%m-%dT%H:%M:%SZ'.
55
+ rolling_days (int, optional): The number of days in each rolling period. Defaults to 7.
56
+
57
+ Returns:
58
+ List[Dict[str, str]]: A list of query dictionaries with date ranges.
59
+ """
36
60
  datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
37
61
  datetime_publishedBefore = datetime.strptime(publishedBefore, date_format)
38
62
 
@@ -62,14 +86,32 @@ def create_queries_per_period(query, publishedAfter, publishedBefore, col_publis
62
86
  # API queries functions
63
87
  #########################################################################################
64
88
 
65
- def YT_client(api_key, api_service_name="youtube", api_version="v3"):
89
+ def YT_client(api_key: str, api_service_name: str = "youtube", api_version: str = "v3") -> Resource:
66
90
  """
67
- Instantiate a new client using an API KEY
91
+ Instantiate a new YouTube client using an API key.
92
+
93
+ Args:
94
+ api_key (str): The API key for accessing the YouTube Data API.
95
+ api_service_name (str, optional): The name of the API service. Defaults to "youtube".
96
+ api_version (str, optional): The version of the API service. Defaults to "v3".
97
+
98
+ Returns:
99
+ googleapiclient.discovery.Resource: The instantiated YouTube client.
68
100
  """
69
101
  client = build(api_service_name, api_version, developerKey=api_key)
70
102
  return client
71
103
 
72
- def check_keys(lst_api_keys):
104
+ def check_keys(lst_api_keys : list) -> tuple:
105
+ """
106
+ Check a list of API keys and determine if any of them have available quota.
107
+
108
+ Args:
109
+ lst_api_keys (List[str]): A list of file paths to JSON files containing API key data.
110
+
111
+ Returns:
112
+ Tuple[str, dict, bool]: A tuple containing the filename (without extension) of the first API key file with available quota,
113
+ the API key data as a dictionary, and a boolean indicating if a valid key was found.
114
+ """
73
115
  status_ok = False
74
116
  for key_idx, apifile_path in enumerate(lst_api_keys):
75
117
  api_filename = os.path.splitext(os.path.basename(apifile_path))[0]
@@ -85,9 +127,17 @@ def check_keys(lst_api_keys):
85
127
  return api_filename, api_key_data, status_ok
86
128
 
87
129
 
88
- def search_videos(client, query_dict, next_token) :
130
+ def search_videos(client: Resource, query_dict : dict, next_token : str) -> tuple :
89
131
  """
90
- Query to search for videos using a string query and a dict of parameters
132
+ Query to search for videos using a string query and a dictionary of parameters.
133
+
134
+ Args:
135
+ client (googleapiclient.discovery.Resource): The YouTube Data API client.
136
+ query_dict (Dict[str, Any]): A dictionary containing query parameters.
137
+ next_token (Optional[str]): The token for the next page of results, or None when no page token is available. This argument is required (it has no default value).
138
+
139
+ Returns:
140
+ Tuple[List[Dict[str, Any]], str, int]: A tuple containing a list of search results, the next page token, and the total number of results.
91
141
  """
92
142
  try:
93
143
  if next_token is None:
@@ -134,9 +184,21 @@ def search_videos(client, query_dict, next_token) :
134
184
  total_results = 0
135
185
  return results, next_token, total_results
136
186
 
137
- def process_search_videos(client, query_dict, limit, query_id, json_path, next_token = None):
187
+ def process_search_videos(client : Resource, query_dict: dict, limit : int, query_id : str, json_path : str, next_token : str = None) -> tuple:
138
188
  """
139
- process to iterate over pages of video search results and store JSON response in case of quota limit
189
+ Process to iterate over pages of video search results and store JSON response in case of quota limit.
190
+
191
+ Args:
192
+ client (googleapiclient.discovery.Resource): The YouTube Data API client.
193
+ query_dict (Dict[str, Any]): A dictionary containing query parameters.
194
+ limit (int): The maximum number of pages to retrieve.
195
+ query_id (str): An identifier for the query.
196
+ json_path (str): The directory path where JSONL files will be saved.
197
+ next_token (str, optional): The token for the next page of results. Defaults to None.
198
+
199
+ Returns:
200
+ Tuple[List[Dict[str, Any]], int, str, int]: A tuple containing the list of results, total results count,
201
+ next page token, and the counter of processed pages.
140
202
  """
141
203
  counter=0
142
204
  results =[]
@@ -171,9 +233,16 @@ def process_search_videos(client, query_dict, limit, query_id, json_path, next_t
171
233
  print(e)
172
234
  return results, total_results, next_token, counter
173
235
 
174
- def get_video_details(client, lst):
236
+ def get_video_details(client : Resource, lst : list) -> dict:
175
237
  """
176
- Query to get video details
238
+ Query to get video details.
239
+
240
+ Args:
241
+ client (googleapiclient.discovery.Resource): The YouTube Data API client.
242
+ lst (List[str]): A list of video IDs to fetch details for.
243
+
244
+ Returns:
245
+ List[Dict[str, Any]]: A list of dictionaries containing video details.
177
246
  """
178
247
 
179
248
  try:
@@ -195,7 +264,16 @@ def get_video_details(client, lst):
195
264
  # Parsing functions
196
265
  #########################################################################################
197
266
 
198
- def parse_video_details(lst_items):
267
+ def parse_video_details(lst_items : list) -> pd.DataFrame:
268
+ """
269
+ Parse video details from a list of video items.
270
+
271
+ Args:
272
+ lst_items (List[Dict[str, Any]]): List of dictionaries containing video details.
273
+
274
+ Returns:
275
+ pd.DataFrame: DataFrame containing the parsed video details.
276
+ """
199
277
  all_records =[]
200
278
  for item in lst_items:
201
279
  video_id = item.get("id", None)
@@ -251,7 +329,16 @@ def parse_video_details(lst_items):
251
329
  "license", "embeddable", "madeForKids"])
252
330
  return df
253
331
 
254
- def parse_search_results(jsonl_data):
332
+ def parse_search_results(jsonl_data : list) -> pd.DataFrame:
333
+ """
334
+ Parse search results from JSONL data to extract video details.
335
+
336
+ Args:
337
+ jsonl_data (List[Dict[str, Any]]): List of dictionaries containing video search results.
338
+
339
+ Returns:
340
+ pd.DataFrame: DataFrame containing the parsed video details.
341
+ """
255
342
  all_records =[]
256
343
  for json in jsonl_data:
257
344
  video_id = json.get("id", {}).get("videoId", "")