PyPI - opsci-toolbox - Versions diffs - 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl - Mend

opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

opsci_toolbox/apis/rapidapi_helpers.py +120 -21
opsci_toolbox/apis/webscraping.py +186 -59
opsci_toolbox/apis/youtube_helpers.py +103 -16
opsci_toolbox/helpers/common.py +368 -254
opsci_toolbox/helpers/cv.py +50 -60
opsci_toolbox/helpers/dataviz.py +255 -184
opsci_toolbox/helpers/dates.py +17 -18
opsci_toolbox/helpers/nlp.py +154 -114
opsci_toolbox/helpers/nlp_cuml.py +389 -36
opsci_toolbox/helpers/sna.py +509 -0
opsci_toolbox/helpers/sql.py +53 -0
{opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/METADATA +14 -9
opsci_toolbox-0.0.8.dist-info/RECORD +22 -0
opsci_toolbox-0.0.6.dist-info/RECORD +0 -21
{opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/WHEEL +0 -0
{opsci_toolbox-0.0.6.dist-info → opsci_toolbox-0.0.8.dist-info}/top_level.txt +0 -0

opsci_toolbox/apis/youtube_helpers.py CHANGED Viewed

@@ -1,17 +1,26 @@
 import pandas as pd
-from googleapiclient.discovery import build
+from googleapiclient.discovery import build, Resource
 import re
-from lib.helpers import write_jsonl, read_json
+from opsci_toolbox.helpers.common import write_jsonl, read_json
 import time
 from datetime import datetime,timedelta
-from lib.nlp_helpers import remove_extra_spaces
+from opsci_toolbox.helpers.nlp import remove_extra_spaces
 import os
 #########################################################################################
 # HELPERS
 #########################################################################################
-def YT_duration_to_milliseconds(duration):
+def YT_duration_to_milliseconds(duration: str) -> int:
+    """
+    Convert an ISO 8601 duration string to milliseconds.
+    Args:
+        duration (str): The ISO 8601 duration string (e.g., 'PT1H2M3S' for 1 hour, 2 minutes, and 3 seconds).
+    Returns:
+        int: The total duration in milliseconds. Returns None if the duration string is invalid.
+    """
     # Regular expression to match ISO 8601 duration format
     duration_pattern = re.compile(r'P(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')
@@ -32,7 +41,22 @@ def YT_duration_to_milliseconds(duration):
     return total_milliseconds
-def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "publishedAfter", col_publishedBefore = "publishedBefore", date_format = '%Y-%m-%dT%H:%M:%SZ', rolling_days = 7 ):
+def create_queries_per_period(query : dict, publishedAfter : str, publishedBefore : str, col_publishedAfter : str = "publishedAfter", col_publishedBefore : str = "publishedBefore", date_format : str = '%Y-%m-%dT%H:%M:%SZ', rolling_days : int = 7 ) -> list:
+    """
+    Generate a list of query dictionaries with specific date ranges based on a rolling window.
+    Args:
+        query (Dict[str, str]): The base query dictionary to be modified with date ranges.
+        publishedAfter (str): The start date of the entire period in the specified date_format.
+        publishedBefore (str): The end date of the entire period in the specified date_format.
+        col_publishedAfter (str, optional): The key for the start date in the query dictionary. Defaults to "publishedAfter".
+        col_publishedBefore (str, optional): The key for the end date in the query dictionary. Defaults to "publishedBefore".
+        date_format (str, optional): The date format used for parsing and formatting dates. Defaults to '%Y-%m-%dT%H:%M:%SZ'.
+        rolling_days (int, optional): The number of days in each rolling period. Defaults to 7.
+    Returns:
+        List[Dict[str, str]]: A list of query dictionaries with date ranges.
+    """
     datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
     datetime_publishedBefore = datetime.strptime(publishedBefore, date_format)
@@ -62,14 +86,32 @@ def create_queries_per_period(query, publishedAfter, publishedBefore, col_publis
 # API queries functions
 #########################################################################################
-def YT_client(api_key, api_service_name="youtube", api_version="v3"):
+def YT_client(api_key: str, api_service_name: str = "youtube", api_version: str = "v3") -> Resource:
     """
-    Instantiate a new client using an API KEY
+    Instantiate a new YouTube client using an API key.
+    Args:
+        api_key (str): The API key for accessing the YouTube Data API.
+        api_service_name (str, optional): The name of the API service. Defaults to "youtube".
+        api_version (str, optional): The version of the API service. Defaults to "v3".
+    Returns:
+        googleapiclient.discovery.Resource: The instantiated YouTube client.
     """
     client = build(api_service_name, api_version, developerKey=api_key)
     return client
-def check_keys(lst_api_keys):
+def check_keys(lst_api_keys : list) -> tuple:
+    """
+    Check a list of API keys and determine if any of them have available quota.
+    Args:
+        lst_api_keys (List[str]): A list of file paths to JSON files containing API key data.
+    Returns:
+        Tuple[str, dict, bool]: A tuple containing the filename (without extension) of the first API key file with available quota,
+                            the API key data as a dictionary, and a boolean indicating if a valid key was found.
+    """
     status_ok = False
     for key_idx, apifile_path in enumerate(lst_api_keys):
         api_filename = os.path.splitext(os.path.basename(apifile_path))[0]
@@ -85,9 +127,17 @@ def check_keys(lst_api_keys):
     return api_filename, api_key_data, status_ok
-def search_videos(client, query_dict, next_token) :
+def search_videos(client: Resource, query_dict : dict, next_token : str) -> tuple :
     """
-    Query to search for videos using a string query and a dict of parameters
+    Query to search for videos using a string query and a dictionary of parameters.
+    Args:
+        client (googleapiclient.discovery.Resource): The YouTube Data API client.
+        query_dict (Dict[str, Any]): A dictionary containing query parameters.
+        next_token (str): The token for the next page of results, or None when no page token is available. Required: this parameter has no default.
+    Returns:
+        Tuple[List[Dict[str, Any]], str, int]: A tuple containing a list of search results, the next page token, and the total number of results.
     """
     try:
         if next_token is None:
@@ -134,9 +184,21 @@ def search_videos(client, query_dict, next_token) :
         total_results = 0
     return results, next_token, total_results
-def process_search_videos(client, query_dict, limit, query_id, json_path, next_token = None):
+def process_search_videos(client : Resource, query_dict: dict, limit :  int, query_id : str, json_path : str, next_token : str = None) -> tuple:
     """
-    process to iterate over pages of video search results and store JSON response in case of quota limit
+    Process to iterate over pages of video search results and store JSON response in case of quota limit.
+    Args:
+        client (googleapiclient.discovery.Resource): The YouTube Data API client.
+        query_dict (Dict[str, Any]): A dictionary containing query parameters.
+        limit (int): The maximum number of pages to retrieve.
+        query_id (str): An identifier for the query.
+        json_path (str): The directory path where JSONL files will be saved.
+        next_token (str, optional): The token for the next page of results. Defaults to None.
+    Returns:
+        Tuple[List[Dict[str, Any]], int, str, int]: A tuple containing the list of results, total results count,
+                                                    next page token, and the counter of processed pages.
     """
     counter=0
     results =[]
@@ -171,9 +233,16 @@ def process_search_videos(client, query_dict, limit, query_id, json_path, next_t
         print(e)
     return results, total_results, next_token, counter
-def get_video_details(client, lst):
+def get_video_details(client : Resource, lst : list) -> dict:
     """
-    Query to get video details
+    Query to get video details.
+    Args:
+        client (googleapiclient.discovery.Resource): The YouTube Data API client.
+        lst (List[str]): A list of video IDs to fetch details for.
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing video details.
     """
     try:
@@ -195,7 +264,16 @@ def get_video_details(client, lst):
 # Parsing functions
 #########################################################################################
-def parse_video_details(lst_items):
+def parse_video_details(lst_items : list) -> pd.DataFrame:
+    """
+    Parse video details from a list of video items.
+    Args:
+        lst_items (List[Dict[str, Any]]): List of dictionaries containing video details.
+    Returns:
+        pd.DataFrame: DataFrame containing the parsed video details.
+    """
     all_records =[]
     for item in lst_items:
         video_id = item.get("id", None)
@@ -251,7 +329,16 @@ def parse_video_details(lst_items):
                   "license", "embeddable", "madeForKids"])
     return df
-def parse_search_results(jsonl_data):
+def parse_search_results(jsonl_data : list) -> pd.DataFrame:
+    """
+    Parse search results from JSONL data to extract video details.
+    Args:
+        jsonl_data (List[Dict[str, Any]]): List of dictionaries containing video search results.
+    Returns:
+        pd.DataFrame: DataFrame containing the parsed video details.
+    """
     all_records =[]
     for json in jsonl_data:
         video_id = json.get("id", {}).get("videoId", "")

opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl