opsci-toolbox 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,30 @@ from datetime import datetime,timedelta
  from opsci_toolbox.helpers.dates import str_to_datetime
  from opsci_toolbox.helpers.common import write_jsonl

- def create_queries_per_period(query, publishedAfter, publishedBefore, col_publishedAfter = "start_date", col_publishedBefore = "end_date", date_format = '%Y-%m-%d', rolling_days = 7 ):
+ def create_queries_per_period(
+ query: dict,
+ publishedAfter: str,
+ publishedBefore: str,
+ col_publishedAfter: str = "start_date",
+ col_publishedBefore: str = "end_date",
+ date_format: str = '%Y-%m-%d',
+ rolling_days: int = 7
+ ) -> list:
+ """
+ Generates a list of query dictionaries with date ranges for a rolling period.
+
+ Args:
+ query (dict): The base query dictionary to be modified with date ranges.
+ publishedAfter (str): The start date in string format.
+ publishedBefore (str): The end date in string format.
+ col_publishedAfter (str, optional): The key name for the start date in the query dictionary. Defaults to "start_date".
+ col_publishedBefore (str, optional): The key name for the end date in the query dictionary. Defaults to "end_date".
+ date_format (str, optional): The format of the input date strings. Defaults to '%Y-%m-%d'.
+ rolling_days (int, optional): The number of days for each rolling period. Defaults to 7.
+
+ Returns:
+ list: A list of query dictionaries with updated date ranges.
+ """
  datetime_publishedAfter = datetime.strptime(publishedAfter, date_format)
  datetime_publishedBefore = datetime.strptime(publishedBefore, date_format)

@@ -34,17 +57,31 @@ def create_queries_per_period(query, publishedAfter, publishedBefore, col_publis
  return queries


- def remove_extra_spaces(text):
+ def remove_extra_spaces(text: str) -> str:
  """
- Remove extra spaces
+ Removes extra spaces from the input text, including leading and trailing spaces.
+
+ Args:
+ text (str): The input text from which extra spaces should be removed.
+
+ Returns:
+ str: The cleaned text with extra spaces removed.
  """
  cleaned_text = re.sub(r'\s+', ' ', text)
  return cleaned_text.strip()


- def query_rapidAPI(url: str, query_dict: dict, host: str):
+ def query_rapidAPI(url: str, query_dict: dict, host: str)-> requests.Response:
  """
- Function to query RapidAPI
+ Function to query RapidAPI.
+
+ Args:
+ url (str): The URL for the RapidAPI endpoint.
+ query_dict (dict): A dictionary containing query parameters.
+ host (str): The RapidAPI host.
+
+ Returns:
+ requests.Response: The response object from the RapidAPI request, or None if an error occurs.
  """

  headers = {
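
A minimal sketch of how the two helpers above combine, for orientation only (not part of the diff; the endpoint URL, host, and query payload below are hypothetical):

    # Build one query per 7-day window, then send each window to a RapidAPI endpoint.
    base_query = {"query": "#opensource", "section": "latest"}              # hypothetical payload
    queries = create_queries_per_period(base_query, "2024-01-01", "2024-01-31", rolling_days=7)
    for q in queries:
        resp = query_rapidAPI(
            url="https://twitter154.p.rapidapi.com/search/search",          # hypothetical endpoint
            query_dict=q,
            host="twitter154.p.rapidapi.com",                               # hypothetical host
        )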
@@ -62,7 +99,16 @@ def query_rapidAPI(url: str, query_dict: dict, host: str):
  return response


- def response_header(response):
+ def response_header(response: requests.Response) -> dict:
+ """
+ Retrieves the headers from an HTTP response object.
+
+ Args:
+ response: The HTTP response object from which headers are to be retrieved.
+
+ Returns:
+ dict: The headers of the HTTP response.
+ """
  return response.headers

  ##################################################################################################
@@ -70,9 +116,15 @@ def response_header(response):
  # https://rapidapi.com/omarmhaimdat/api/twitter154
  ##################################################################################################

- def parse_user(user : dict):
+ def parse_user(user: dict) -> tuple:
  """
- Parse the subdict related to user data
+ Parse the subdict related to user data.
+
+ Args:
+ user (dict): Dictionary containing user data.
+
+ Returns:
+ tuple: A tuple containing parsed user data fields.
  """
  if user:
  user_creation_date=user.get("creation_date","")
@@ -109,9 +161,15 @@ def parse_user(user : dict):
  record = (user_creation_date, user_id, user_username, user_name, user_follower_count, user_following_count, user_favourites_count, user_is_private, user_is_verified, user_is_blue_verified, user_location, user_profile_pic_url, user_profile_banner_url, user_description, user_external_url, user_number_of_tweets, user_bot, user_timestamp, user_has_nft_avatar,user_category, user_default_profile, user_default_profile_image, user_listed_count, user_verified_type)
  return record

- def parse_retweet(data):
+ def parse_retweet(data: dict) -> tuple:
  """
- Parse subdict related to original tweet if the captured tweet is RT
+ Parse subdict related to original tweet if the captured tweet is RT.
+
+ Args:
+ data (dict): Dictionary containing tweet data.
+
+ Returns:
+ tuple: A tuple containing parsed tweet data fields.
  """
  if data:
  tweet_id=data.get("tweet_id", "")
@@ -150,9 +208,15 @@ def parse_retweet(data):
  record=(tweet_id, creation_date, text,media_url, video_url, language, favorite_count, retweet_count, reply_count, quote_count, retweet, views, timestamp, video_view_count,in_reply_to_status_id, quoted_status_id, expanded_url, retweet_tweet_id,conversation_id,bookmark_count, source,community_note)
  return record

- def parse_entities(extended_entities):
+ def parse_entities(extended_entities: dict) -> tuple:
  """
- Parse the subdict related to extended entities (image, video, tags...)
+ Parse the subdict related to extended entities (image, video, tags...).
+
+ Args:
+ extended_entities (dict): Dictionary containing extended entities data.
+
+ Returns:
+ tuple: A tuple containing parsed extended entities data fields.
  """
  id_str, indices, media_key, media_url, media_type, original_info, height, width, ext_alt_text, monetizable, aspect_ratio, duration_millis = [], [], [], [], [], [], [], [], [], [], [], []
  all_x, all_y, all_h, all_w =[], [], [], []
@@ -222,9 +286,15 @@ def parse_entities(extended_entities):
  record = (id_str, indices, media_key, media_url, media_type, all_x, all_y, all_h, all_w, height, width, ext_alt_text, all_tag_user_id, all_tag_user_screenname, all_tag_user_type, monetizable, aspect_ratio, duration_millis, all_variants_url, all_variants_bitrate, all_variants_content_type)
  return record

- def parse_tweet(json_data):
+ def parse_tweet(json_data: list) -> pd.DataFrame:
  """
- Parse a batch of tweets
+ Parse a batch of tweets.
+
+ Args:
+ json_data (list): List of dictionaries containing tweet data.
+
+ Returns:
+ pd.DataFrame: A pandas DataFrame containing parsed tweet data.
  """
  all_records=[]
  for data in json_data:
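
A hypothetical end-to-end call, assuming the search endpoint returns its tweets under a "results" key (the exact response shape is not shown in this diff):

    resp = query_rapidAPI(url, query_dict, host)     # url/host as sketched earlier, illustrative
    tweets = resp.json().get("results", [])          # "results" key is an assumption
    df_tweets = parse_tweet(tweets)                  # one row per tweet, with flattened user/media fields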
@@ -279,9 +349,15 @@ def parse_tweet(json_data):
  df = pd.DataFrame.from_records(all_records, columns = all_cols)
  return df

- def parse_twitter_list_details(json_data):
+ def parse_twitter_list_details(json_data : dict) -> pd.DataFrame:
  """
- Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154
+ Parse list results from https://rapidapi.com/omarmhaimdat/api/twitter154.
+
+ Args:
+ json_data (dict): Dictionary containing list details data.
+
+ Returns:
+ pd.DataFrame: A pandas DataFrame containing parsed list details.
  """
  list_id = json_data.get("list_id", "")
  list_id_str = json_data.get("list_id_str", "")
@@ -304,7 +380,16 @@ def parse_twitter_list_details(json_data):
  # https://instagram-scraper2.p.rapidapi.com/hash_tag_medias_v2
  ######################################################################################

- def instagram_parse_hashtag_data(hashtag_data):
+ def instagram_parse_hashtag_data(hashtag_data: dict)-> pd.DataFrame:
+ """
+ Parse Instagram hashtag data into a DataFrame.
+
+ Args:
+ hashtag_data (dict): Dictionary containing Instagram hashtag data.
+
+ Returns:
+ pd.DataFrame: A pandas DataFrame containing parsed hashtag data.
+ """
  hashtag_id = hashtag_data.get("id")
  hashtag_name = hashtag_data.get("name")
  allow_following = hashtag_data.get("allow_following")
@@ -354,9 +439,17 @@ def instagram_parse_hashtag_data(hashtag_data):
  # function to parse Twitter data
  # https://rapidapi.com/twttrapi-twttrapi-default/api/twttrapi
  ######################################################################################
- def compile_list_entries(json_data, path_json, filename):
+ def compile_list_entries(json_data: dict, path_json: str, filename: str)-> tuple:
  """
- Function to return next cursor and list details from https://twttrapi.p.rapidapi.com/list-members
+ Function to process list entries from Twitter API response and write to JSONL file. https://twttrapi.p.rapidapi.com/list-members
+
+ Args:
+ json_data (dict): JSON response data from Twitter API.
+ path_json (str): Path to directory where JSONL file will be saved.
+ filename (str): Name of the JSONL file.
+
+ Returns:
+ tuple: A tuple containing a list of results (user legacy data) and next cursor (str or None).
  """
  results = []
  next_cursor = None
@@ -377,9 +470,15 @@ def compile_list_entries(json_data, path_json, filename):
  return results, next_cursor


- def parse_list_entries(jsonl_data):
+ def parse_list_entries(jsonl_data: list)-> pd.DataFrame:
  """
- Function to parse list details from https://twttrapi.p.rapidapi.com/list-members
+ Parse list details from JSONL data obtained from the Twitter API.
+
+ Args:
+ jsonl_data (list): List of dictionaries containing JSON data.
+
+ Returns:
+ pd.DataFrame: DataFrame containing parsed list details.
  """
  all_records=[]
  for data in jsonl_data:
@@ -12,18 +12,30 @@ import pandas as pd
  from tqdm import tqdm


- def url_get_domain(url):
+ def url_get_domain(url: str) -> str:
  """
- Return the domain name from a url
+ Extracts and returns the domain name from a given URL.
+
+ Args:
+ url (str): The URL string from which the domain name is to be extracted.
+
+ Returns:
+ str: The domain name extracted from the URL.
  """
  parsed_url = urlparse(url)
  domain = parsed_url.hostname if parsed_url.hostname else parsed_url.netloc
  return domain


- def url_get_extension(url):
+ def url_get_extension(url: str) -> str:
  """
- Return the extension of the domain name from a url
+ Extracts and returns the extension (TLD) of the domain name from a given URL.
+
+ Args:
+ url (str): The URL string from which the domain extension is to be extracted.
+
+ Returns:
+ str: The extension (TLD) of the domain name extracted from the URL.
  """
  # Parse the URL using urlparse
  parsed_url = urlparse(url)
@@ -34,15 +46,21 @@ def url_get_extension(url):
  # Split the netloc by '.' to get the domain and TLD
  domain_parts = netloc.split(".")

- # Get the last two parts, which represent the domain and TLD
+ # Get the last part, which represents the TLD
  extension = ".".join(domain_parts[-1:])

  return extension


- def url_clean_parameters(url):
+ def url_clean_parameters(url: str) -> str:
  """
- Return a URL without any parameters or utm tags
+ Removes query parameters and UTM tags from a given URL and returns the cleaned URL.
+
+ Args:
+ url (str): The URL string from which parameters and UTM tags are to be removed.
+
+ Returns:
+ str: The cleaned URL without any parameters or UTM tags.
  """
  parsed_url = urlparse(url)
  netloc = parsed_url.netloc if parsed_url.netloc else ""
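
A few expected input/output pairs for the URL helpers above, based on the code visible in this diff (values are illustrative; parts of the function bodies fall outside the shown hunks):

    url_get_domain("https://www.example.co.uk/article?id=1")        # -> "www.example.co.uk"
    url_get_extension("https://www.example.co.uk/article?id=1")     # -> "uk" (last netloc segment)
    url_clean_parameters("https://example.com/page?utm_source=x")   # expected: "example.com/page"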
@@ -50,9 +68,15 @@ def url_clean_parameters(url):
  return netloc + path


- def url_clean_protocol(url):
+ def url_clean_protocol(url: str) -> str:
  """
- Remove https / http from a url
+ Removes the 'http://' or 'https://' prefix from a given URL.
+
+ Args:
+ url (str): The URL string from which the protocol is to be removed.
+
+ Returns:
+ str: The URL without the 'http://' or 'https://' prefix.
  """
  prefixes_to_remove = ["https://", "http://"]

@@ -64,9 +88,15 @@ def url_clean_protocol(url):
  return url


- def url_remove_www(url):
+ def url_remove_www(url: str) -> str:
  """
- Remove www from a url
+ Removes the 'www.' prefix from a given URL, along with any protocol prefix.
+
+ Args:
+ url (str): The URL string from which the 'www.' prefix is to be removed.
+
+ Returns:
+ str: The URL without the 'www.' prefix.
  """
  prefixes_to_remove = ["https://www.", "http://www.", "https://", "http://", "www."]

@@ -78,9 +108,15 @@ def url_remove_www(url):
  return url


- def url_add_protocol(url):
+ def url_add_protocol(url: str) -> str:
  """
- Return a formatted url with protocol and www. if necessary
+ Ensures the given URL has a protocol ('https://') and 'www.' prefix if necessary.
+
+ Args:
+ url (str): The URL string to be formatted with protocol and 'www.' prefix if required.
+
+ Returns:
+ str: The formatted URL with protocol and 'www.' prefix if it was missing.
  """
  parsed_url = urlparse(url)

@@ -96,9 +132,15 @@ def url_add_protocol(url):
  return url


- def url_is_valid(url):
+ def url_is_valid(url: str) -> bool:
  """
- Checks if a URL is valid
+ Checks if a given URL is valid.
+
+ Args:
+ url (str): The URL string to be validated.
+
+ Returns:
+ bool: True if the URL is valid, False otherwise.
  """
  try:
  parsed_url = urlparse(url)
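
Expected behaviour of the normalisation helpers above, with illustrative values (url_add_protocol's exact output depends on code not fully shown in this diff):

    url_clean_protocol("https://www.example.com/page")   # -> "www.example.com/page"
    url_remove_www("https://www.example.com/page")        # -> "example.com/page"
    url_add_protocol("example.com/page")                  # expected: a URL prefixed with "https://" (and "www." if missing)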
@@ -108,9 +150,15 @@ def url_is_valid(url):
  return False


- def url_is_reachable(url):
+ def url_is_reachable(url: str) -> bool:
  """
- Checks if url is reachable (no 404 error...)
+ Checks if a given URL is reachable (i.e., does not return a 404 error or other HTTP errors).
+
+ Args:
+ url (str): The URL string to be checked for reachability.
+
+ Returns:
+ bool: True if the URL is reachable, False otherwise.
  """
  try:
  response = requests.get(url)
@@ -121,9 +169,15 @@ def url_is_reachable(url):
  return False # HTTP error occurred, URL is not reachable


- def scrape(url):
+ def scrape(url: str) -> requests.Response:
  """
- Get requests and return full response
+ Sends a GET request to the given URL and returns the full response.
+
+ Args:
+ url (str): The URL to be requested.
+
+ Returns:
+ requests.Response: The full response from the GET request.
  """
  try:
  response = requests.get(url)
@@ -133,11 +187,17 @@ def scrape(url):
  return response


- def justext_parse_content(response, languages=["English", "French"]):
- """
- Return main content from a HTML response
+ def justext_parse_content(response: requests.Response, languages: list = ["English", "French"]) -> str:
  """
+ Extracts and returns the main content from an HTML response using jusText.
+
+ Args:
+ response (requests.Response): The HTTP response object containing the HTML content.
+ languages (list): A list of languages to use for stopword lists in jusText. Default is ["English", "French"].

+ Returns:
+ str: The extracted main content from the HTML response.
+ """
  stoplist = frozenset()
  for lang in languages:
@@ -158,9 +218,15 @@ def justext_parse_content(response, languages=["English", "French"]):
  return concatenated_text


- def trafilatura_parse_content(response):
+ def trafilatura_parse_content(response: requests.Response) -> str:
  """
- Return main content from a HTML response
+ Extracts and returns the main content from an HTML response using Trafilatura.
+
+ Args:
+ response (requests.Response): The HTTP response object containing the HTML content.
+
+ Returns:
+ str: The extracted main content from the HTML response.
  """
  try:
  text = extract(response.content)
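
A minimal sketch chaining the fetch and extraction helpers above (the URL is hypothetical; both extractors accept the same response object):

    resp = scrape("https://example.com/some-article")    # hypothetical URL
    if resp is not None:
        main_text = justext_parse_content(resp)           # or: trafilatura_parse_content(resp)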
@@ -171,13 +237,13 @@ def trafilatura_parse_content(response):


  def process_scraping(
- url,
- path,
- method="justext",
- languages=["English", "French"],
- title=True,
- meta=True,
- lst_properties=[
+ url: str,
+ path: str,
+ method: str = "justext",
+ languages: list = ["English", "French"],
+ title: bool = True,
+ meta: bool = True,
+ lst_properties: list = [
  "og:site_name",
  "og:url",
  "og:title",
@@ -204,7 +270,23 @@ def process_scraping(
  "al:android:package",
  "al:android:app_name",
  ],
- ):
+ ) -> dict:
+ """
+ Process scraping of a URL, extract main content, title, and meta properties,
+ and store the results in a JSON file.
+
+ Args:
+ url (str): The URL to scrape.
+ path (str): The directory path where the scraped data JSON file will be saved.
+ method (str, optional): The method to use for content extraction ('justext' or 'trafilatura'). Defaults to 'justext'.
+ languages (list, optional): A list of languages for stopword lists in jusText. Defaults to ["English", "French"].
+ title (bool, optional): Whether to extract the title from the HTML. Defaults to True.
+ meta (bool, optional): Whether to extract meta properties from the HTML. Defaults to True.
+ lst_properties (list, optional): List of specific meta properties to extract. Defaults to a comprehensive list.
+
+ Returns:
+ dict or None: A dictionary containing the extracted data and file path if successful, or None if an error occurs.
+ """
  try:

  # We name the files
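
A hedged usage sketch of process_scraping (the output directory is hypothetical; per its docstring the result is also written to a JSON file under path):

    result = process_scraping(
        url="https://example.com/some-article",   # hypothetical URL
        path="data/scraped",                      # hypothetical output directory
        method="trafilatura",
        title=True,
        meta=True,
    )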
@@ -257,9 +339,15 @@ def process_scraping(
  return None


- def parse_title(response):
+ def parse_title(response: requests.Response) -> str:
  """
- Return webpage title
+ Extracts and returns the webpage title from an HTML response.
+
+ Parameters:
+ response (requests.Response): The HTTP response object containing the HTML content.
+
+ Returns:
+ str or None: The extracted title text if found, None if no title tag is found or an error occurs.
  """
  try:
  soup = BeautifulSoup(response.content, "html.parser")
@@ -277,9 +365,7 @@ def parse_title(response):
  return None


- def get_meta_properties(
- response,
- lst_properties=[
+ def get_meta_properties(response: requests.Response, lst_properties: list = [
  "og:site_name",
  "og:url",
  "og:title",
@@ -305,10 +391,16 @@ def get_meta_properties(
  "al:ios:app_name",
  "al:android:package",
  "al:android:app_name",
- ],
- ):
+ ]) -> dict:
  """
- Parse a list of meta tags from a webpage and returns a dict
+ Extracts specified meta properties from a webpage and returns them as a dictionary.
+
+ Args:
+ response (requests.Response): The HTTP response object containing the HTML content.
+ lst_properties (list, optional): A list of meta property names to extract. Defaults to a comprehensive list.
+
+ Returns:
+ dict or None: A dictionary mapping meta property names to their content values if found, or None if an error occurs.
  """
  try:

@@ -338,14 +430,14 @@ def get_meta_properties(


  def parallel_scraping(
- urls,
- path,
- max_workers=8,
- method="justext",
- languages=["English", "French"],
- title=True,
- meta=True,
- lst_properties=[
+ urls: list,
+ path: str,
+ max_workers: int = 8,
+ method: str = "justext",
+ languages: list = ["English", "French"],
+ title: bool = True,
+ meta: bool = True,
+ lst_properties: list = [
  "og:site_name",
  "og:url",
  "og:title",
@@ -373,9 +465,19 @@ def parallel_scraping(
  "al:android:app_name",
  ],
  ):
-
  """
- Execute concurrent threads to scrape multiple webpages
+ Execute concurrent threads to scrape multiple webpages.
+
+ Args:
+ urls (list): List of URLs to scrape.
+ path (str): The directory path where scraped data will be saved.
+ max_workers (int, optional): Maximum number of concurrent threads. Defaults to 8.
+ method (str, optional): Method to use for content extraction ('justext' or 'trafilatura'). Defaults to 'justext'.
+ languages (list, optional): Languages for stopword lists in jusText. Defaults to ['English', 'French'].
+ title (bool, optional): Whether to extract title from HTML. Defaults to True.
+ meta (bool, optional): Whether to extract meta properties from HTML. Defaults to True.
+ lst_properties (list, optional): List of specific meta properties to extract. Defaults to a comprehensive list.
+
  """
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
  # Submit scraping tasks for each URL and add tqdm progress bar
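
An illustrative parallel_scraping call (URLs and output directory are hypothetical; each page is processed via process_scraping as described above):

    urls = ["https://example.com/a", "https://example.com/b"]   # hypothetical URLs
    parallel_scraping(urls, path="data/scraped", max_workers=4, method="justext")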
@@ -403,9 +505,17 @@ def parallel_scraping(
  print(f"Error scraping : {e}")


- def parse_scraped_webpages(path_json_files, output_path, name):
+ def parse_scraped_webpages(path_json_files: str, output_path: str, name: str) -> pd.DataFrame:
  """
- Parse JSON files captured by scraper
+ Parse JSON files containing scraped data and save the extracted data into a CSV file.
+
+ Args:
+ path_json_files (str): Directory path containing JSON files.
+ output_path (str): Directory path where the CSV file will be saved.
+ name (str): Name of the CSV file.
+
+ Returns:
+ pd.DataFrame: DataFrame containing the parsed data from JSON files.
  """
  extracted_data = []

@@ -421,10 +531,20 @@ def parse_scraped_webpages(path_json_files, output_path, name):
  save_dataframe_csv(df, output_path, name)
  return df

- def download_file(url:str, path:str):
- '''
- Download a file using a URL and write in a local file
- '''
+ def download_file(url: str, path: str) -> None:
+ """
+ Download a file from a URL and save it locally.
+
+ Args:
+ url (str): The URL of the file to download.
+ path (str): The local path where the file will be saved.
+
+ Raises:
+ requests.exceptions.RequestException: If an HTTP error occurs during the request.
+
+ Returns:
+ None
+ """
  try:
  response = requests.get(url)
  response.raise_for_status() # Raise an HTTPError for bad responses
@@ -435,10 +555,17 @@ def download_file(url:str, path:str):
  except requests.exceptions.RequestException as e:
  print(f"Error downloading file: {url, e}")

- def parallel_dl(urls, paths, max_workers=8):
-
+ def parallel_dl(urls: list, paths: list, max_workers: int = 8) -> None:
  """
- Execute concurrent threads to scrape multiple webpages
+ Execute concurrent threads to download multiple files from URLs and save them locally.
+
+ Args:
+ urls (list): List of URLs to download files from.
+ paths (list): List of local paths where downloaded files will be saved.
+ max_workers (int, optional): Maximum number of concurrent threads. Defaults to 8.
+
+ Returns:
+ None
  """
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
  # Submit scraping tasks for each URL and add tqdm progress bar
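
Finally, a hedged sketch of the download helpers (URLs and local paths are hypothetical; urls and paths are presumably consumed pairwise, so they should be the same length):

    urls = ["https://example.com/img1.jpg", "https://example.com/img2.jpg"]   # hypothetical
    paths = ["downloads/img1.jpg", "downloads/img2.jpg"]                      # hypothetical
    download_file(urls[0], paths[0])            # single download
    parallel_dl(urls, paths, max_workers=4)     # concurrent downloads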