opsci-toolbox 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@ from datetime import datetime
  import hashlib
  import ast
  import subprocess
+ import chardet

  ####################################################################################################
  # FILE LOADERS
@@ -25,16 +26,16 @@ def load_file(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFr
  """
  Load a file into a Pandas DataFrame based on the file extension.

- Parameters:
- path (str): The file path to load.
- delimiter (str, optional): The delimiter used in CSV/TSV files. Default is ";".
- decimal (str, optional): The character used for decimal points in CSV/TSV files. Default is ".".
+ Args:
+ path (str): The file path to load.
+ delimiter (str, optional): The delimiter used in CSV/TSV files. Default is ";".
+ decimal (str, optional): The character used for decimal points in CSV/TSV files. Default is ".".

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pandas.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- ValueError: If the file extension is not supported.
+ ValueError: If the file extension is not supported.
  """
  extension = os.path.splitext(os.path.basename(path))[1]
  if extension == ".parquet":
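As context for the hunk above, a minimal sketch of how the load_file dispatcher is typically called (the module path and file names are assumptions, not taken from the diff):

```python
from opsci_toolbox.helpers.common import load_file  # module path assumed

# The loader is picked from the file extension; delimiter and decimal are only
# used by the CSV/TSV branch.
df_parquet = load_file("data/posts.parquet")
df_csv = load_file("data/posts.csv", delimiter=";", decimal=",")
print(df_csv.shape)
```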
@@ -57,14 +58,14 @@ def load_parquet(path: str) -> pd.DataFrame:
  """
  Load a parquet file into a DataFrame.

- Parameters:
- path (str): The file path to the parquet file.
+ Args:
+ path (str): The file path to the parquet file.

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pandas.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- Exception: If there is an error reading the parquet file.
+ Exception: If there is an error reading the parquet file.
  """
  try:
  table = pq.read_table(path)
@@ -74,15 +75,37 @@ def load_parquet(path: str) -> pd.DataFrame:
  print(e)
  return df

+ def load_excel(path : str, sheet_name : str = ""):
+ """
+ Loads an Excel sheet into a Pandas DataFrame.
+
+ Args:
+ file_path (str): Path to the Excel file.
+ sheet_name (str, int, list, or None): Name of sheet or sheet number to load.
+ 0 (default) - Load first sheet.
+ str - Load sheet with specified name.
+ list - Load multiple sheets, returns a dictionary of DataFrames.
+ None - Load all sheets, returns a dictionary of DataFrames.
+
+ Returns:
+ DataFrame or dict of DataFrames.
+ """
+ try:
+ df = pd.read_excel(path, sheet_name=sheet_name)
+ return df
+ except Exception as e:
+ print(f"Error loading Excel file: {e}")
+ return None
+
  def load_pickle(path: str) -> pd.DataFrame:
  """
  Load a pickle file into a DataFrame.

- Parameters:
- path (str): The file path to the pickle file.
+ Args:
+ path (str): The file path to the pickle file.

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pandas.DataFrame: The loaded data as a Pandas DataFrame.
  """
  return pd.read_pickle(path)
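The new load_excel helper is a thin wrapper around pd.read_excel. A minimal usage sketch (module path and file names are assumptions, not taken from the diff); note that the docstring describes the pandas-style default of 0 for sheet_name while the signature actually defaults to an empty string, so passing the sheet explicitly is the safer call:

```python
from opsci_toolbox.helpers.common import load_excel  # module path assumed

# sheet_name is forwarded to pd.read_excel; with the "" default pandas would
# look for a sheet literally named "", so name the sheet (or pass 0) yourself.
df = load_excel("data/report.xlsx", sheet_name="Sheet1")
if df is not None:  # the helper returns None and prints the error on failure
    print(df.shape)
```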
@@ -91,14 +114,14 @@ def load_json(path: str) -> pd.DataFrame:
  """
  Load a JSON file into a DataFrame.

- Parameters:
- path (str): The file path to the JSON file.
+ Args:
+ path (str): The file path to the JSON file.

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pd.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- Exception: If there is an error reading the JSON file.
+ Exception: If there is an error reading the JSON file.
  """
  df = pd.DataFrame()
  try:
@@ -114,14 +137,14 @@ def load_jsonl(path: str) -> pd.DataFrame:
  """
  Load a JSON Lines (jsonl) file into a DataFrame.

- Parameters:
- path (str): The file path to the jsonl file.
+ Args:
+ path (str): The file path to the jsonl file.

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pd.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- Exception: If there is an error reading the jsonl file.
+ Exception: If there is an error reading the jsonl file.
  """
  df = pd.DataFrame()
  try:
@@ -144,16 +167,16 @@ def load_csv(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFram
  """
  Load a CSV file into a DataFrame.

- Parameters:
- path (str): The file path to the CSV file.
- delimiter (str, optional): The delimiter used in the CSV file. Default is ";".
- decimal (str, optional): The character used for decimal points in the CSV file. Default is ".".
+ Args:
+ path (str): The file path to the CSV file.
+ delimiter (str, optional): The delimiter used in the CSV file. Default is ";".
+ decimal (str, optional): The character used for decimal points in the CSV file. Default is ".".

  Returns:
- pd.DataFrame: The loaded data as a Pandas DataFrame.
+ pd.DataFrame: The loaded data as a Pandas DataFrame.

  Raises:
- Exception: If there is an error reading the CSV file.
+ Exception: If there is an error reading the CSV file.
  """
  df = pd.DataFrame()
  try:
@@ -167,15 +190,15 @@ def read_txt_to_list(file_path: str) -> list[str]:
  """
  Read a text file line by line and append to a Python list.

- Parameters:
- file_path (str): The file path to the text file.
+ Args:
+ file_path (str): The file path to the text file.

  Returns:
- list[str]: A list of lines read from the text file.
+ list[str]: A list of lines read from the text file.

  Raises:
- FileNotFoundError: If the file does not exist.
- Exception: If any other error occurs during file reading.
+ FileNotFoundError: If the file does not exist.
+ Exception: If any other error occurs during file reading.
  """

  # Initialize an empty list to store the lines
@@ -197,15 +220,15 @@ def read_json(path: str) -> dict:
  """
  Read a JSON file and return a dictionary.

- Parameters:
- path (str): The file path to the JSON file.
+ Args:
+ path (str): The file path to the JSON file.

  Returns:
- dict: The data read from the JSON file as a dictionary.
+ dict: The data read from the JSON file as a dictionary.

  Raises:
- FileNotFoundError: If the file does not exist.
- Exception: If there is an error reading the JSON file.
+ FileNotFoundError: If the file does not exist.
+ Exception: If there is an error reading the JSON file.
  """
  with open(path, 'r') as json_file:
  data = json.load(json_file)
@@ -215,15 +238,15 @@ def read_txt_file(file_path: str) -> str:
  """
  Read the content of a text file and return it as a string.

- Parameters:
- file_path (str): The file path to the text file.
+ Args:
+ file_path (str): The file path to the text file.

  Returns:
- str: The content of the text file as a string.
+ str: The content of the text file as a string.

  Raises:
- FileNotFoundError: If the file does not exist.
- Exception: If there is an error reading the text file.
+ FileNotFoundError: If the file does not exist.
+ Exception: If there is an error reading the text file.
  """
  try:
  with open(file_path, 'r') as file:
@@ -240,15 +263,15 @@ def read_jsonl(path: str) -> list[dict]:
  """
  Load a JSON Lines (jsonl) file into a list of dictionaries.

- Parameters:
- path (str): The file path to the jsonl file.
+ Args:
+ path (str): The file path to the jsonl file.

  Returns:
- list[dict]: A list of dictionaries containing the data read from the JSON Lines file.
+ list[dict]: A list of dictionaries containing the data read from the JSON Lines file.

  Raises:
- FileNotFoundError: If the file does not exist.
- Exception: If there is an error reading the jsonl file.
+ FileNotFoundError: If the file does not exist.
+ Exception: If there is an error reading the jsonl file.
  """
  json_data = []
  try:
@@ -274,13 +297,13 @@ def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
  """
  Write a DataFrame into a pickle file.

- Parameters:
- data (pd.DataFrame): The DataFrame to be written to the pickle file.
- path (str): The directory where the pickle file will be saved.
- filename (str): The name of the pickle file (without the extension).
+ Args:
+ data (pd.DataFrame): The DataFrame to be written to the pickle file.
+ path (str): The directory where the pickle file will be saved.
+ filename (str): The name of the pickle file (without the extension).

  Returns:
- str: The full path to the saved pickle file.
+ str: The full path to the saved pickle file.
  """
  file_path = os.path.join(path, filename + '.pickle')
  with open(file_path, 'wb') as f:
@@ -292,13 +315,13 @@ def write_list_to_txt(input_list: list, path: str, name: str) -> str:
  """
  Write a list to a text file, with each item on a new line.

- Parameters:
- - input_list (list): The list to be written to the text file.
- - path (str): The directory path where the text file will be saved.
- - name (str): The name of the text file (without the extension).
+ Args:
+ input_list (list): The list to be written to the text file.
+ path (str): The directory path where the text file will be saved.
+ name (str): The name of the text file (without the extension).

  Returns:
- str: The full path to the saved text file.
+ str: The full path to the saved text file.
  """
  file_path = os.path.join(path, name + '.txt')
  with open(file_path, 'w') as file:
@@ -310,13 +333,13 @@ def write_jsonl(data: list[dict], path: str, name: str) -> str:
  """
  Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.

- Parameters:
- - data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
- - path (str): The directory path where the JSON Lines file will be saved.
- - name (str): The name of the JSON Lines file (without the extension).
+ Args:
+ data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+ path (str): The directory path where the JSON Lines file will be saved.
+ name (str): The name of the JSON Lines file (without the extension).

  Returns:
- str: The full path to the saved JSON Lines file.
+ str: The full path to the saved JSON Lines file.
  """
  file_path = os.path.join(path, name + '.jsonl')
  with open(file_path, 'w') as file:
@@ -330,13 +353,13 @@ def write_json(json_dict: dict, path: str, name: str) -> str:
  """
  Write a dictionary to a JSON file.

- Parameters:
- - json_dict (dict): The dictionary to be written to the JSON file.
- - path (str): The directory path where the JSON file will be saved.
- - name (str): The name of the JSON file (without the extension).
+ Args:
+ json_dict (dict): The dictionary to be written to the JSON file.
+ path (str): The directory path where the JSON file will be saved.
+ name (str): The name of the JSON file (without the extension).

  Returns:
- str: The full path to the saved JSON file.
+ str: The full path to the saved JSON file.
  """
  file_path = os.path.join(path, name + '.json')
  with open(file_path, 'w') as outfile:
@@ -348,14 +371,14 @@ def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str
  """
  Write a DataFrame to a JSON file.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be written to the JSON file.
- - path (str): The directory path where the JSON file will be saved.
- - name (str): The name of the JSON file (without the extension).
- - orient (str, optional): The format of the JSON file. Default is 'records'.
+ Args:
+ df (pd.DataFrame): The DataFrame to be written to the JSON file.
+ path (str): The directory path where the JSON file will be saved.
+ name (str): The name of the JSON file (without the extension).
+ orient (str, optional): The format of the JSON file. Default is 'records'.

  Returns:
- str: The full path to the saved JSON file.
+ str: The full path to the saved JSON file.
  """
  file_path = os.path.join(path, name + ".json")
  df.to_json(file_path, orient=orient, lines=True)
@@ -366,14 +389,14 @@ def save_dataframe_excel(df: pd.DataFrame, path: str, name: str, sheet_name: str
  """
  Write a DataFrame to an Excel file.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be written to the Excel file.
- - path (str): The directory path where the Excel file will be saved.
- - name (str): The name of the Excel file (without the extension).
- - sheet_name (str): The name of the Excel sheet.
+ Args:
+ df (pd.DataFrame): The DataFrame to be written to the Excel file.
+ path (str): The directory path where the Excel file will be saved.
+ name (str): The name of the Excel file (without the extension).
+ sheet_name (str): The name of the Excel sheet.

  Returns:
- str: The full path to the saved Excel file.
+ str: The full path to the saved Excel file.
  """
  file_path = os.path.join(path, f"{name}.xlsx")
  df.to_excel(file_path, sheet_name=sheet_name, index=False)
@@ -384,13 +407,13 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
  """
  Adds a DataFrame to an existing Excel file as a new sheet.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be added.
- - existing_file_path (str): Path to the existing Excel file.
- - new_sheet_name (str): Name of the new sheet in the Excel file.
+ Args:
+ df (pd.DataFrame): The DataFrame to be added.
+ existing_file_path (str): Path to the existing Excel file.
+ new_sheet_name (str): Name of the new sheet in the Excel file.

  Returns:
- - None
+ None
  """
  # Read existing Excel file into a dictionary of DataFrames
  excel_file = pd.read_excel(existing_file_path, sheet_name=None)
@@ -407,13 +430,13 @@ def save_dataframe_csv(df: pd.DataFrame, path: str, name: str) -> str:
  """
  Save a DataFrame to a CSV file within a specified directory.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be saved.
- - path (str): The directory where the CSV file will be saved.
- - name (str): The desired name for the CSV file (without extension).
+ Args:
+ df (pd.DataFrame): The DataFrame to be saved.
+ path (str): The directory where the CSV file will be saved.
+ name (str): The desired name for the CSV file (without extension).

  Returns:
- str: The full path to the saved CSV file.
+ str: The full path to the saved CSV file.
  """
  file_path = os.path.join(path, f"{name}.csv")
  df.to_csv(
@@ -430,31 +453,31 @@ def write_txt_file(data: str, path: str, name: str) -> str:
  """
  Write a string to a text file.

- Parameters:
- - data (str): The string to be written to the text file.
- - path (str): The directory path where the text file will be saved.
- - name (str): The name of the text file (without the extension).
+ Args:
+ data (str): The string to be written to the text file.
+ path (str): The directory path where the text file will be saved.
+ name (str): The name of the text file (without the extension).

  Returns:
- str: The full path to the saved text file.
+ str: The full path to the saved text file.
  """
  file_path = os.path.join(path, name + '.txt')
  with open(file_path, "w") as file:
  file.write(data)
  return file_path

- def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list[str]:
+ def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list:
  """
  Split a DataFrame into multiple pickle files with a specified chunk size.

- Parameters:
- - df (pd.DataFrame): The DataFrame to be split.
- - path (str): The directory path where the pickle files will be saved.
- - name (str): The base name for the pickle files.
- - chunk_size (int, optional): The size of each chunk. Default is 10000.
+ Args:
+ df (pd.DataFrame): The DataFrame to be split.
+ path (str): The directory path where the pickle files will be saved.
+ name (str): The base name for the pickle files.
+ chunk_size (int, optional): The size of each chunk. Default is 10000.

  Returns:
- list[str]: A list of file paths to the saved pickle files.
+ list[str]: A list of file paths to the saved pickle files.
  """
  num_chunks = -(-len(df) // chunk_size) # Calculate the number of chunks using ceil division
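This hunk also relaxes the return annotation of split_df_into_chunks from list[str] to plain list. A hedged usage sketch (module path, directory and base name are assumptions, not values from the diff):

```python
import pandas as pd
from opsci_toolbox.helpers.common import split_df_into_chunks  # module path assumed

df = pd.DataFrame({"id": range(25_000)})
# ceil(25000 / 10000) = 3, so this should write three pickle chunks into an
# existing "chunks" directory and return their file paths.
paths = split_df_into_chunks(df, path="chunks", name="posts")
print(paths)
```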
@@ -479,11 +502,11 @@ def create_dir(path: str) -> str:
  """
  Create a local directory if it doesn't exist.

- Parameters:
- - path (str): The directory path to be created.
+ Args:
+ path (str): The directory path to be created.

  Returns:
- str: The path of the created directory.
+ str: The path of the created directory.
  """
  if not os.path.exists(path):
  os.makedirs(path)
@@ -491,31 +514,31 @@ def create_dir(path: str) -> str:
  return path


- def list_files_in_dir(path: str, filetype: str = '*.json') -> list[str]:
+ def list_files_in_dir(path: str, filetype: str = '*.json') -> list:
  """
  List files of a specific format in a directory.

- Parameters:
- - path (str): The directory path to search for files.
- - filetype (str, optional): The file type pattern to search for. Default is '*.json'.
+ Args:
+ path (str): The directory path to search for files.
+ filetype (str, optional): The file type pattern to search for.

  Returns:
- list[str]: A list of file paths matching the specified file type pattern.
+ list: A list of file paths matching the specified file type pattern.
  """
  pattern = os.path.join(path, filetype)
  files = glob.glob(pattern)
  return files


- def list_subdirectories(root_directory: str) -> list[str]:
+ def list_subdirectories(root_directory: str) -> list:
  """
  List subdirectories in a root directory.

- Parameters:
- - root_directory (str): The root directory path.
+ Args:
+ root_directory (str): The root directory path.

  Returns:
- list[str]: A list of subdirectory names.
+ list[str]: A list of subdirectory names.
  """
  subdirectories = []
  for entry in os.scandir(root_directory):
@@ -524,15 +547,15 @@ def list_subdirectories(root_directory: str) -> list[str]:
  return subdirectories


- def list_recursive_subdirectories(root_directory: str) -> list[str]:
+ def list_recursive_subdirectories(root_directory: str) -> list:
  """
  List recursively all subdirectories from a root directory.

- Parameters:
- - root_directory (str): The root directory path.
+ Args:
+ root_directory (str): The root directory path.

  Returns:
- list[str]: A list of subdirectory paths.
+ list[str]: A list of subdirectory paths.
  """
  subdirectories = []
  for root, dirs, files in os.walk(root_directory):
@@ -540,16 +563,16 @@ def list_recursive_subdirectories(root_directory: str) -> list[str]:
  return subdirectories


- def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list[str]:
+ def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list:
  """
  Walk through subdirectories of a root directory to list files of a specific format.

- Parameters:
- - path (str): The root directory path.
- - filetype (str, optional): The file type pattern to search for. Default is '*.json'.
+ Args:
+ path (str): The root directory path.
+ filetype (str, optional): The file type pattern to search for.

  Returns:
- list[str]: A list of file paths matching the specified file type pattern in subdirectories.
+ list[str]: A list of file paths matching the specified file type pattern in subdirectories.
  """
  files = []

@@ -568,13 +591,13 @@ def copy_file(source_path: str, destination_path: str, new_filename: str = '') -
  """
  Copy a file from a source path to a destination path.

- Parameters:
- - source_path (str): The path of the source file.
- - destination_path (str): The path of the destination directory.
- - new_filename (str, optional): The new filename. If not provided, the original filename is used.
+ Args:
+ source_path (str): The path of the source file.
+ destination_path (str): The path of the destination directory.
+ new_filename (str, optional): The new filename. If not provided, the original filename is used.

  Returns:
- str: The path of the copied file.
+ str: The path of the copied file.
  """
  if new_filename:
  file_path = os.path.join(destination_path, new_filename)
@@ -589,11 +612,11 @@ def remove_file(file_path: str) -> None:
  """
  Remove a single file.

- Parameters:
- - file_path (str): The path of the file to be removed.
+ Args:
+ file_path (str): The path of the file to be removed.

  Returns:
- None
+ None
  """
  try:
  os.remove(file_path)
@@ -605,11 +628,11 @@ def remove_folder(folder_path: str) -> None:
  """
  Remove a folder and all its contents.

- Parameters:
- - folder_path (str): The path of the folder to be removed.
+ Args:
+ folder_path (str): The path of the folder to be removed.

  Returns:
- None
+ None
  """
  try:
  shutil.rmtree(folder_path)
@@ -622,12 +645,11 @@ def get_file_size(file_path: str) -> tuple[int, str]:
  """
  Get the size of a single file in a readable format (KB, MB, GB).

- Parameters:
- - file_path (str): The path of the file.
+ Args:
+ file_path (str): The path of the file.

  Returns:
- tuple[int, str]: A tuple containing the size of the file in bytes and its formatted size.
- If the file is not found, returns None.
+ tuple[int, str]: A tuple containing the size of the file in bytes and its formatted size. If the file is not found, returns None.
  """
  try:
  size = os.path.getsize(file_path)
@@ -654,12 +676,12 @@ def get_folder_size(folder_path: str) -> tuple[int, str]:
  """
  Get the size of all files contained in a folder in a readable format (KB, MB, GB).

- Parameters:
- - folder_path (str): The path of the folder.
+ Args:
+ folder_path (str): The path of the folder.

  Returns:
- tuple[int, str]: A tuple containing the total size of all files in bytes and its formatted size.
- If the folder is not found, returns None.
+ tuple[int, str]: A tuple containing the total size of all files in bytes and its formatted size.
+ If the folder is not found, returns None.
  """
  total_size = 0

@@ -691,12 +713,12 @@ def file_creation_date(file_path: str) -> datetime:
  """
  Return the last update timestamp of a file.

- Parameters:
- - file_path (str): The path of the file.
+ Args:
+ file_path (str): The path of the file.

  Returns:
- datetime: The last update timestamp as a datetime object.
- If the file does not exist, returns None.
+ datetime: The last update timestamp as a datetime object.
+ If the file does not exist, returns None.
  """
  # Check if the file exists
  if os.path.exists(file_path):
@@ -717,12 +739,12 @@ def transform_to_n_items_list(lst: list, n: int) -> list[list]:
  """
  Transform a list into a list of n-items sublists.

- Parameters:
- - lst (list): The input list to be transformed.
- - n (int): The number of items in each sublist.
+ Args:
+ lst (list): The input list to be transformed.
+ n (int): The number of items in each sublist.

  Returns:
- list[list]: A list of n-items sublists.
+ list[list]: A list of n-items sublists.
  """
  return [lst[i:i + n] for i in range(0, len(lst), n)]

@@ -731,11 +753,11 @@ def unduplicate_list(lst: list) -> list:
  """
  Remove duplicate elements from a list.

- Parameters:
- - lst (list): The input list with possible duplicate elements.
+ Args:
+ lst (list): The input list with possible duplicate elements.

  Returns:
- list: A list with duplicate elements removed.
+ list: A list with duplicate elements removed.
  """
  return list(set(lst))

@@ -744,13 +766,13 @@ def sort_list(lst: list, reverse: bool = False) -> list:
  """
  Sort the list in ascending or descending order.

- Parameters:
- - lst (list): The input list.
- - reverse (bool): If True, sort the list in descending order.
+ Args:
+ lst (list): The input list.
+ reverse (bool): If True, sort the list in descending order.
  If False (default), sort the list in ascending order.

  Returns:
- list: A new list sorted based on the specified order.
+ list: A new list sorted based on the specified order.
  """
  return sorted(lst, reverse=reverse)

@@ -759,12 +781,12 @@ def map_list(lst: list, function: callable) -> list:
  """
  Apply a function to each element of the list.

- Parameters:
- - lst (list): The input list.
- - function (callable): The function to apply to each element.
+ Args:
+ lst (list): The input list.
+ function (callable): The function to apply to each element.

  Returns:
- list: A new list with the function applied to each element.
+ list: A new list with the function applied to each element.
  """
  return [function(element) for element in lst]

@@ -773,11 +795,11 @@ def flatten_list(lst: list) -> list:
  """
  Flatten a nested list into a single list.

- Parameters:
- - lst (list): The input nested list.
+ Args:
+ lst (list): The input nested list.

  Returns:
- list: A new list with all nested elements flattened.
+ list: A new list with all nested elements flattened.
  """
  flattened_list = []

@@ -796,12 +818,12 @@ def find_occurrences(lst: list, element) -> int:
  """
  Find the occurrences of a specific element in the list.

- Parameters:
- - lst (list): The input list.
- - element: The element to find occurrences of.
+ Args:
+ lst (list): The input list.
+ element: The element to find occurrences of.

  Returns:
- int: The number of occurrences of the specified element in the list.
+ int: The number of occurrences of the specified element in the list.
  """
  return lst.count(element)

@@ -810,12 +832,12 @@ def is_subset(subset: list, superset: list) -> bool:
  """
  Check if one list is a subset of another.

- Parameters:
- - subset (list): The potential subset list.
- - superset (list): The superset list.
+ Args:
+ subset (list): The potential subset list.
+ superset (list): The superset list.

  Returns:
- bool: True if the subset is a subset of the superset, False otherwise.
+ bool: True if the subset is a subset of the superset, False otherwise.
  """
  return all(element in superset for element in subset)

@@ -823,12 +845,12 @@ def common_elements(list1: list, list2: list) -> list:
  """
  Find the common elements between two lists.

- Parameters:
- - list1 (list): The first list.
- - list2 (list): The second list.
+ Args:
+ list1 (list): The first list.
+ list2 (list): The second list.

  Returns:
- list: A new list containing the common elements between list1 and list2.
+ list: A new list containing the common elements between list1 and list2.
  """
  return list(set(list1) & set(list2))

@@ -837,11 +859,11 @@ def shuffle_list(lst: list) -> list:
  """
  Shuffle the elements of the list randomly.

- Parameters:
- - lst (list): The input list.
+ Args:
+ lst (list): The input list.

  Returns:
- list: A new list with the elements shuffled randomly.
+ list: A new list with the elements shuffled randomly.
  """
  shuffled_list = lst.copy()
  random.shuffle(shuffled_list)
@@ -852,17 +874,17 @@ def sample_list(lst: list, sample_size) -> list:
  """
  Sample a list based on an integer or a float representing the sample size.

- Parameters:
- - lst (list): The input list.
- - sample_size (int or float): If an integer, the number of elements to keep.
+ Args:
+ lst (list): The input list.
+ sample_size (int or float): If an integer, the number of elements to keep.
  If a float, the percentage of elements to keep.

  Returns:
- list: A new list containing the sampled elements.
+ list: A new list containing the sampled elements.

  Raises:
- - ValueError: If the sample size is invalid (negative integer or float outside [0, 1]).
- - TypeError: If the sample size is neither an integer nor a float.
+ ValueError: If the sample size is invalid (negative integer or float outside [0, 1]).
+ TypeError: If the sample size is neither an integer nor a float.
  """
  if isinstance(sample_size, int):
  if sample_size < 0:
@@ -880,11 +902,11 @@ def count_elements(lst: list) -> dict:
  """
  Count the occurrences of each element in the list.

- Parameters:
- - lst (list): The input list.
+ Args:
+ lst (list): The input list.

  Returns:
- dict: A dictionary where keys are unique elements from the list, and values are their counts.
+ dict: A dictionary where keys are unique elements from the list, and values are their counts.
  """
  return dict(Counter(lst))

@@ -892,13 +914,13 @@ def scale_list(lst: list, min_val: float = 1, max_val: float = 5) -> list:
  """
  Scale the values of a list to a specified range.

- Parameters:
- - lst (list): The input list of values to be scaled.
- - min_val (float): The minimum value of the output range (default is 1).
- - max_val (float): The maximum value of the output range (default is 5).
+ Args:
+ lst (list): The input list of values to be scaled.
+ min_val (float): The minimum value of the output range (default is 1).
+ max_val (float): The maximum value of the output range (default is 5).

  Returns:
- - list: A new list with values scaled to the specified range.
+ list: A new list with values scaled to the specified range.
  """
  min_w = min(lst)
  max_w = max(lst)
@@ -916,15 +938,15 @@ def df_scale_column(df: pd.DataFrame, col_to_scale: str, col_out: str, min_val:
  """
  Scale values in a DataFrame column to a specified range.

- Parameters:
- - df (pd.DataFrame): The input DataFrame.
- - col_to_scale (str): The name of the column to be scaled.
- - col_out (str): The name of the new column to store scaled values.
- - min_val (float): The minimum value of the output range.
- - max_val (float): The maximum value of the output range.
+ Args:
+ df (pd.DataFrame): The input DataFrame.
+ col_to_scale (str): The name of the column to be scaled.
+ col_out (str): The name of the new column to store scaled values.
+ min_val (float): The minimum value of the output range.
+ max_val (float): The maximum value of the output range.

  Returns:
- - pd.DataFrame: The DataFrame with a new column containing scaled values.
+ pd.DataFrame: The DataFrame with a new column containing scaled values.
  """
  min_freq = df[col_to_scale].min()
  max_freq = df[col_to_scale].max()
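Both scale_list and df_scale_column describe standard min-max scaling: each value x is mapped to min_val + (x - min(x)) * (max_val - min_val) / (max(x) - min(x)). A small sketch of the documented behaviour (module path assumed; the expected values follow from that formula, not from running the package):

```python
import pandas as pd
from opsci_toolbox.helpers.common import df_scale_column  # module path assumed

df = pd.DataFrame({"freq": [2, 4, 6, 10]})
# Min-max scaling of "freq" into [1, 5]: 2 -> 1.0, 4 -> 2.0, 6 -> 3.0, 10 -> 5.0.
df = df_scale_column(df, col_to_scale="freq", col_out="freq_scaled", min_val=1, max_val=5)
print(df)
```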
@@ -939,13 +961,13 @@ def zip_file(source_file_path: str, zip_file_path: str, name: str) -> str:
  """
  Zip a single file.

- Parameters:
- - source_file_path (str): Path to the file to be zipped.
- - zip_file_path (str): Path for the resulting zip file.
- - name (str): Name for the resulting zip file (without extension).
+ Args:
+ source_file_path (str): Path to the file to be zipped.
+ zip_file_path (str): Path for the resulting zip file.
+ name (str): Name for the resulting zip file (without extension).

  Returns:
- str: Path to the resulting zip file.
+ str: Path to the resulting zip file.
  """
  file_path = os.path.join(zip_file_path, f"{name}.zip")

@@ -959,13 +981,13 @@ def zip_folder(source_folder_path: str, zip_file_path: str, name: str) -> str:
  """
  Zip an entire folder.

- Parameters:
- - source_folder_path (str): Path to the folder to be zipped.
- - zip_file_path (str): Path for the resulting zip file.
- - name (str): Name for the resulting zip file (without extension).
+ Args:
+ source_folder_path (str): Path to the folder to be zipped.
+ zip_file_path (str): Path for the resulting zip file.
+ name (str): Name for the resulting zip file (without extension).

  Returns:
- str: Path to the resulting zip file.
+ str: Path to the resulting zip file.
  """
  file_path = os.path.join(zip_file_path, f"{name}.zip")

@@ -982,12 +1004,12 @@ def unzip_file(zip_file_path: str, destination_path: str) -> None:
  """
  Unzip a zip file.

- Parameters:
- - zip_file_path (str): Path to the zip file to be unzipped.
- - destination_path (str): Path where the contents of the zip file will be extracted.
+ Args:
+ zip_file_path (str): Path to the zip file to be unzipped.
+ destination_path (str): Path where the contents of the zip file will be extracted.

  Returns:
- None
+ None
  """
  with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
  zip_ref.extractall(destination_path)
@@ -1002,11 +1024,11 @@ def create_google_spreadsheet_client(credentials: str):
  """
  Create a Gspread client to interact with Google Sheets.

- Parameters:
- - credentials (str): Path to the JSON file containing Google Service Account credentials.
+ Args:
+ credentials (str): Path to the JSON file containing Google Service Account credentials.

  Returns:
- gspread.Client: A client object for interacting with Google Sheets.
+ gspread.Client: A client object for interacting with Google Sheets.
  """
  return gspread.service_account(filename=credentials)

@@ -1014,13 +1036,13 @@ def read_google_spreadsheet(client: gspread.Client, sheet_id: str, worksheet_nam
  """
  Read data from a Google spreadsheet and return it as a DataFrame.

- Parameters:
- - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
- - sheet_id (str): The ID of the Google spreadsheet.
- - worksheet_name (str): The name of the worksheet within the spreadsheet.
+ Args:
+ client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+ sheet_id (str): The ID of the Google spreadsheet.
+ worksheet_name (str): The name of the worksheet within the spreadsheet.

  Returns:
- pd.DataFrame: A DataFrame containing the data from the specified worksheet.
+ pd.DataFrame: A DataFrame containing the data from the specified worksheet.
  """
  try:
  # Open the Google Spreadsheet by ID
@@ -1047,12 +1069,12 @@ def list_google_worksheets(client: gspread.Client, sheet_id: str) -> list:
  """
  Return a list of worksheet names for a spreadsheet ID.

- Parameters:
- - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
- - sheet_id (str): The ID of the Google spreadsheet.
+ Args:
+ client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+ sheet_id (str): The ID of the Google spreadsheet.

  Returns:
- list: A list of worksheet names.
+ list: A list of worksheet names.
  """
  sheet = client.open_by_key(sheet_id)
  worksheet_obj = sheet.worksheets()
@@ -1063,12 +1085,12 @@ def get_spreadsheet_permissions(client: gspread.Client, sheet_id: str) -> pd.Dat
  """
  Return a DataFrame with the list of user email and type that can access the document.

- Parameters:
- - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
- - sheet_id (str): The ID of the Google spreadsheet.
+ Args:
+ client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+ sheet_id (str): The ID of the Google spreadsheet.

  Returns:
- pd.DataFrame: A DataFrame containing the list of user email addresses and their access types.
+ pd.DataFrame: A DataFrame containing the list of user email addresses and their access types.
  """
  sheet = client.open_by_key(sheet_id)
  permissions = sheet.list_permissions()
@@ -1081,14 +1103,14 @@ def create_google_spreadsheet(client: gspread.Client, df: pd.DataFrame, filename
  """
  Create a new Google spreadsheet and load a DataFrame into it.

- Parameters:
- - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
- - df (pd.DataFrame): The DataFrame to be loaded into the spreadsheet.
- - filename (str): The desired filename for the new spreadsheet.
- - worksheet_name (str, optional): The name of the worksheet within the spreadsheet. Defaults to "Sheet1".
+ Args:
+ client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+ df (pd.DataFrame): The DataFrame to be loaded into the spreadsheet.
+ filename (str): The desired filename for the new spreadsheet.
+ worksheet_name (str, optional): The name of the worksheet within the spreadsheet. Defaults to "Sheet1".

  Returns:
- gspread.Spreadsheet: The created spreadsheet object.
+ gspread.Spreadsheet: The created spreadsheet object.
  """
  spreadsheet = client.create(filename)
  worksheet = spreadsheet.sheet1
@@ -1102,17 +1124,17 @@ def share_google_spreadsheet(spreadsheet: gspread.Spreadsheet, email: str, user_
  """
  Share a spreadsheet with a user.

- Parameters:
- - spreadsheet (gspread.Spreadsheet): The Google spreadsheet object to be shared.
- - email (str): The email address of the user with whom the spreadsheet will be shared.
- - user_type (str, optional): The permission type for the user. Defaults to "user".
- - user_role (str, optional): The role assigned to the user. Defaults to "writer".
- - notify (bool, optional): Whether to notify the user about the sharing. Defaults to False.
- - email_message (str, optional): The message to include in the notification email.
- - with_link (bool, optional): Whether to include a link to the shared document in the notification email. Defaults to False.
+ Args:
+ spreadsheet (gspread.Spreadsheet): The Google spreadsheet object to be shared.
+ email (str): The email address of the user with whom the spreadsheet will be shared.
+ user_type (str, optional): The permission type for the user. Defaults to "user".
+ user_role (str, optional): The role assigned to the user. Defaults to "writer".
+ notify (bool, optional): Whether to notify the user about the sharing. Defaults to False.
+ email_message (str, optional): The message to include in the notification email.
+ with_link (bool, optional): Whether to include a link to the shared document in the notification email. Defaults to False.

  Returns:
- gspread.Spreadsheet: The updated spreadsheet object.
+ gspread.Spreadsheet: The updated spreadsheet object.
  """
  spreadsheet.share(email, perm_type=user_type, role=user_role, notify=notify, email_message=email_message, with_link=with_link)
  return spreadsheet
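The Google Sheets helpers in the hunks above chain together naturally. A hedged end-to-end sketch (module path, credentials file, sheet ID and email address are placeholders, not values from the diff):

```python
from opsci_toolbox.helpers.common import (  # module path assumed
    create_google_spreadsheet_client,
    read_google_spreadsheet,
    create_google_spreadsheet,
    share_google_spreadsheet,
)

client = create_google_spreadsheet_client("service_account.json")  # hypothetical credentials file
df = read_google_spreadsheet(client, sheet_id="SPREADSHEET_ID", worksheet_name="Sheet1")

# Push the DataFrame into a new spreadsheet and grant read access to a colleague.
spreadsheet = create_google_spreadsheet(client, df, filename="export")
share_google_spreadsheet(spreadsheet, "colleague@example.com", user_role="reader")
```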
@@ -1121,11 +1143,11 @@ def generate_short_id(variables: dict) -> tuple[str, str]:
  """
  Generate an 8-character ID using a dictionary as input.

- Parameters:
- - variables (dict): A dictionary containing the variables to be serialized.
+ Args:
+ variables (dict): A dictionary containing the variables to be serialized.

  Returns:
- tuple: A tuple containing the generated short ID and the serialized variables.
+ tuple: A tuple containing the generated short ID and the serialized variables.
  """
  # Serialize variables into JSON string
  serialized_variables = json.dumps(variables, sort_keys=True)
@@ -1136,6 +1158,16 @@ def generate_short_id(variables: dict) -> tuple[str, str]:
  return short_id, serialized_variables

  def df_transform_column_as_list(column: pd.Series) -> pd.Series:
+ """
+ Transform a pandas Series where each cell is a string representation of a list,
+ a single value, or already a list into a pandas Series with each cell as a list.
+
+ Args:
+ column (pd.Series): The input pandas Series to transform.
+
+ Returns:
+ pd.Series: A pandas Series with each cell as a list.
+ """
  def transform(cell):
  if isinstance(cell, str):
  # Check if it's a list formatted as string, and convert to list
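The docstring added above is the only change to df_transform_column_as_list. A short sketch of the behaviour it describes (module path assumed; output not asserted, since the function body is only partially shown in this hunk):

```python
import pandas as pd
from opsci_toolbox.helpers.common import df_transform_column_as_list  # module path assumed

s = pd.Series(["['a', 'b']", "c", ["d", "e"]])
# Per the docstring, every cell comes back as a list: parsed from the string
# representation, wrapped for a single value, or kept as-is for real lists.
print(df_transform_column_as_list(s).tolist())
```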
@@ -1168,7 +1200,17 @@ def top_rows_per_category(df: pd.DataFrame,
  cols_to_keep: list[str],
  top_rows: int) -> pd.DataFrame:
  """
- Select top rows for each category in a dataframe
+ Select the top rows for each category in a dataframe.
+
+ Args:
+ df (pd.DataFrame): The input dataframe.
+ col_to_sort (str): The column name by which to sort the rows.
+ col_to_gb (str): The column name to group by.
+ cols_to_keep (List[str]): The list of columns to keep in the final output.
+ top_rows (int): The number of top rows to select for each group.
+
+ Returns:
+ pd.DataFrame: A dataframe containing the top rows for each category.
  """
  df_gb = (df.sort_values(by=col_to_sort, ascending=False)
  .groupby(col_to_gb)
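A hedged usage sketch for top_rows_per_category, following the argument names documented above (module path and data are assumptions):

```python
import pandas as pd
from opsci_toolbox.helpers.common import top_rows_per_category  # module path assumed

df = pd.DataFrame({
    "topic": ["a", "a", "a", "b", "b"],
    "score": [3, 9, 5, 2, 7],
    "text": ["t1", "t2", "t3", "t4", "t5"],
})
# Keep the two highest-scoring rows of each topic.
top = top_rows_per_category(df, col_to_sort="score", col_to_gb="topic",
                            cols_to_keep=["topic", "score", "text"], top_rows=2)
print(top)
```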
@@ -1179,7 +1221,13 @@ def top_rows_per_category(df: pd.DataFrame,

  def format_number(number: int) -> str:
  """
- Function to format a number in K, M or B
+ Format a number into a human-readable string with K, M, or B suffixes.
+
+ Args:
+ number (int): The number to format.
+
+ Returns:
+ str: The formatted number as a string with an appropriate suffix.
  """
  if number < 1000:
  return str(number)
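For illustration, the kind of output the rewritten docstring describes (module path assumed; exact rounding depends on the rest of the function body, which this hunk only shows the start of):

```python
from opsci_toolbox.helpers.common import format_number  # module path assumed

# Values below 1000 pass through unchanged; larger values are abbreviated
# with a K, M or B suffix.
for n in (950, 12_300, 4_500_000, 7_200_000_000):
    print(n, "->", format_number(n))
```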
@@ -1196,12 +1244,12 @@ def unrar_file(rar_file_path : str, output_dir : str) -> None:
  """
  Extracts a .rar file to the specified output directory using the unrar command.

- Parameters:
- rar_file_path (str): The path to the .rar file.
- output_dir (str): The directory where the contents should be extracted.
+ Args:
+ rar_file_path (str): The path to the .rar file.
+ output_dir (str): The directory where the contents should be extracted.

  Returns:
- None
+ None
  """
  try:
  # Ensure the output directory exists
@@ -1216,4 +1264,70 @@ def unrar_file(rar_file_path : str, output_dir : str) -> None:
  print(f"Extraction failed. Error: {result.stderr}")

  except Exception as e:
- print(f"An error occurred: {e}")
+ print(f"An error occurred: {e}")
+
+
+ def fill_nan(df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Fill missing values in a DataFrame with appropriate defaults based on the column data type.
+
+ For string columns, missing values are replaced with an empty string.
+ For numeric columns, missing values are replaced with zero.
+ For datetime columns, missing values are replaced with the default date '1970-01-01'.
+ For other types, missing values are filled with NaN.
+
+ Args:
+ df (DataFrame): The DataFrame in which missing values will be filled.
+
+ Returns:
+ DataFrame: The DataFrame with missing values filled.
+ """
+ mixed_columns = df.columns[df.isna().any()]
+
+ for col in mixed_columns:
+ if df[col].dtype == 'object':
+ # For string columns, replace NaN with an empty string
+ df[col] = df[col].fillna('')
+ elif pd.api.types.is_numeric_dtype(df[col]):
+ # For numeric columns, replace NaN with the column mean
+ df[col] = df[col].fillna(0)
+ elif pd.api.types.is_datetime64_any_dtype(df[col]):
+ # For datetime columns, replace NaN with a default date
+ default_date = pd.Timestamp('1970-01-01')
+ df[col] = df[col].fillna(default_date)
+ else:
+ # For other types, we can use a general approach, such as fill with None or NaN
+ df[col] = df[col].fillna(None)
+
+ return df
+
+ def detect_encoding(file_path : str) -> str:
+ """
+ Detect the encoding of a file.
+
+ Args:
+ file_path (str): The path to the file whose encoding needs to be detected.
+
+ Returns:
+ str: The detected encoding of the file.
+ """
+ with open(file_path, 'rb') as file:
+ raw_data = file.read()
+ result = chardet.detect(raw_data)
+ return result['encoding']
+
+ def remove_empty_folders(path: str):
+ """
+ Recursively remove empty folders from the specified directory.
+
+ Parameters:
+ - path (str): Path to the directory to scan for empty folders.
+ """
+ # Iterate over the directory tree
+ for root, dirs, files in os.walk(path, topdown=False):
+ for dir_name in dirs:
+ dir_path = os.path.join(root, dir_name)
+ # If the directory is empty, remove it
+ if not os.listdir(dir_path):
+ os.rmdir(dir_path)
+ print(f"Removed empty folder: {dir_path}")