PyPI - opsci-toolbox - Versions diffs - 0.0.2__py3-none-any.whl → 0.0.6__py3-none-any.whl - Mend

opsci-toolbox: 0.0.2 (py3-none-any.whl) → 0.0.6 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

opsci_toolbox/apis/rapidapi_helpers.py +82 -0
opsci_toolbox/helpers/common.py +566 -191
opsci_toolbox/helpers/cv.py +298 -123
opsci_toolbox/helpers/dataviz.py +1005 -216
opsci_toolbox/helpers/dates.py +55 -8
opsci_toolbox/helpers/nlp.py +768 -110
opsci_toolbox/helpers/nlp_cuml.py +280 -0
opsci_toolbox/helpers/sna.py +101 -10
opsci_toolbox/helpers/surreaction.py +156 -0
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/METADATA +9 -11
opsci_toolbox-0.0.6.dist-info/RECORD +21 -0
opsci_toolbox-0.0.2.dist-info/RECORD +0 -19
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/WHEEL +0 -0
{opsci_toolbox-0.0.2.dist-info → opsci_toolbox-0.0.6.dist-info}/top_level.txt +0 -0

opsci_toolbox/helpers/common.py CHANGED Viewed

@@ -15,12 +15,27 @@ import pyarrow.parquet as pq
 from datetime import datetime
 import hashlib
 import ast
+import subprocess
 ####################################################################################################
 # FILE LOADERS
 ####################################################################################################
-def load_file(path, delimiter = ";", decimal ="."):
+def load_file(path: str, delimiter: str = ";", decimal: str = ".")  -> pd.DataFrame:
+    """
+    Load a file into a Pandas DataFrame based on the file extension.
+    Parameters:
+    path (str): The file path to load.
+    delimiter (str, optional): The delimiter used in CSV/TSV files. Default is ";".
+    decimal (str, optional): The character used for decimal points in CSV/TSV files. Default is ".".
+    Returns:
+    pd.DataFrame: The loaded data as a Pandas DataFrame.
+    Raises:
+    ValueError: If the file extension is not supported.
+    """
     extension = os.path.splitext(os.path.basename(path))[1]
     if extension == ".parquet":
         df = load_parquet(path)
@@ -38,9 +53,18 @@ def load_file(path, delimiter = ";", decimal ="."):
         print("Check your input file. Extension isn't supported : .parquet, .pickle, .json, .jsonl, .csv, .tsv")
     return df
-def load_parquet(path):
+def load_parquet(path: str) -> pd.DataFrame:
     """
-    Load a parquet file into a DataFrame
+    Load a parquet file into a DataFrame.
+    Parameters:
+    path (str): The file path to the parquet file.
+    Returns:
+    pd.DataFrame: The loaded data as a Pandas DataFrame.
+    Raises:
+    Exception: If there is an error reading the parquet file.
     """
     try:
         table = pq.read_table(path)
@@ -50,66 +74,108 @@ def load_parquet(path):
         print(e)
     return df
-def load_pickle(path: str):
-    """
-    Load a pickle file into a dataframe
+def load_pickle(path: str) -> pd.DataFrame:
     """
+    Load a pickle file into a DataFrame.
-    with open(path, 'rb') as f:
-        df=pickle.load(f)
-    return df
+    Parameters:
+    path (str): The file path to the pickle file.
+    Returns:
+    pd.DataFrame: The loaded data as a Pandas DataFrame.
+    """
+    return pd.read_pickle(path)
-def load_json(path: str):
+def load_json(path: str) -> pd.DataFrame:
     """
-    Load a json file into a DataFrame
+    Load a JSON file into a DataFrame.
+    Parameters:
+    path (str): The file path to the JSON file.
+    Returns:
+    pd.DataFrame: The loaded data as a Pandas DataFrame.
+    Raises:
+    Exception: If there is an error reading the JSON file.
     """
-    df=pd.DataFrame()
+    df = pd.DataFrame()
     try:
         with open(path, 'r') as json_file:
             data = json.load(json_file)
-        df=pd.json_normalize(data)
+        df = pd.json_normalize(data)
     except Exception as e:
-        pass
-        print(e)
+        print(f"Error reading the JSON file: {e}")
+        raise
     return df
-def load_jsonl(path: str):
+def load_jsonl(path: str) -> pd.DataFrame:
     """
-    Load a jsonl file into a dataframe
+    Load a JSON Lines (jsonl) file into a DataFrame.
+    Parameters:
+    path (str): The file path to the jsonl file.
+    Returns:
+    pd.DataFrame: The loaded data as a Pandas DataFrame.
+    Raises:
+    Exception: If there is an error reading the jsonl file.
     """
     df = pd.DataFrame()
     try:
         data = []
         with open(path, 'r') as json_file:
-            for line in tqdm(json_file):
+            for line in tqdm(json_file, desc="Loading JSON Lines"):
                 try:
                     data.append(json.loads(line))
-                except:
-                    pass
+                except json.JSONDecodeError as line_error:
+                    print(f"Error decoding line: {line_error}")
         df = pd.json_normalize(data)
     except Exception as e:
-        pass
-        print(e)
+        print(f"Error reading the jsonl file: {e}")
+        raise
     return df
-def load_csv(path: str, delimiter: str =";", decimal:str ="."):
+def load_csv(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFrame:
     """
-    Load a csv file into a dataframe
+    Load a CSV file into a DataFrame.
+    Parameters:
+    path (str): The file path to the CSV file.
+    delimiter (str, optional): The delimiter used in the CSV file. Default is ";".
+    decimal (str, optional): The character used for decimal points in the CSV file. Default is ".".
+    Returns:
+    pd.DataFrame: The loaded data as a Pandas DataFrame.
+    Raises:
+    Exception: If there is an error reading the CSV file.
     """
-    df= pd.DataFrame()
+    df = pd.DataFrame()
     try:
         df = pd.read_csv(path, delimiter=delimiter, encoding="utf-8", decimal=decimal)
     except Exception as e:
-        pass
-        print(e)
+        print(f"Error reading the CSV file: {e}")
+        raise
     return df
-def read_txt_to_list(file_path: str):
+def read_txt_to_list(file_path: str) -> list[str]:
     """
-    Read a text file line by line and append to a Python list
+    Read a text file line by line and append to a Python list.
+    Parameters:
+    file_path (str): The file path to the text file.
+    Returns:
+    list[str]: A list of lines read from the text file.
+    Raises:
+    FileNotFoundError: If the file does not exist.
+    Exception: If any other error occurs during file reading.
     """
     # Initialize an empty list to store the lines
@@ -124,12 +190,22 @@ def read_txt_to_list(file_path: str):
         print(f"File not found: {file_path}")
     except Exception as e:
         print(f"An error occurred: {e}")
+        raise
     return lines
-def read_json(path: str):
+def read_json(path: str) -> dict:
     """
-    Read a json file and return a dict
+    Read a JSON file and return a dictionary.
+    Parameters:
+    path (str): The file path to the JSON file.
+    Returns:
+    dict: The data read from the JSON file as a dictionary.
+    Raises:
+    FileNotFoundError: If the file does not exist.
+    Exception: If there is an error reading the JSON file.
     """
     with open(path, 'r') as json_file:
         data = json.load(json_file)
@@ -137,25 +213,55 @@ def read_json(path: str):
 def read_txt_file(file_path: str) -> str:
     """
-    Read a text file
+    Read the content of a text file and return it as a string.
+    Parameters:
+    file_path (str): The file path to the text file.
+    Returns:
+    str: The content of the text file as a string.
+    Raises:
+    FileNotFoundError: If the file does not exist.
+    Exception: If there is an error reading the text file.
     """
-    with open(file_path, 'r') as file:
-        content = file.read()
+    try:
+        with open(file_path, 'r') as file:
+            content = file.read()
+    except FileNotFoundError:
+        print(f"File not found: {file_path}")
+        raise
+    except Exception as e:
+        print(f"An error occurred while reading the file: {e}")
+        raise
     return content
-def read_jsonl(path: str):
+def read_jsonl(path: str) -> list[dict]:
     """
-    Load a jsonl file into a dataframe
+    Load a JSON Lines (jsonl) file into a list of dictionaries.
+    Parameters:
+    path (str): The file path to the jsonl file.
+    Returns:
+    list[dict]: A list of dictionaries containing the data read from the JSON Lines file.
+    Raises:
+    FileNotFoundError: If the file does not exist.
+    Exception: If there is an error reading the jsonl file.
     """
     json_data = []
-    with open(path, 'r') as json_file:
-        for line in tqdm(json_file):
-            try:
-                json_data.append(json.loads(line))
-            except Exception as e:
-                pass
-                print(e)
+    try:
+        with open(path, 'r') as json_file:
+            for line in tqdm(json_file, desc="Reading JSON Lines"):
+                try:
+                    json_data.append(json.loads(line))
+                except Exception as e:
+                    print(f"Error decoding line: {e}")
+                    raise
+    except FileNotFoundError:
+        print(f"File not found: {path}")
+        raise
     return json_data
@@ -164,37 +270,55 @@ def read_jsonl(path: str):
 #########################################################################################
-def write_pickle(df: pd.DataFrame, path: str, name: str):
+def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
     """
-    Write a dataframe into a pickle file
-    """
-    file_path=os.path.join(path, name+'.pickle')
+    Write a DataFrame into a pickle file.
+    Parameters:
+    data (pd.DataFrame): The DataFrame to be written to the pickle file.
+    path (str): The directory where the pickle file will be saved.
+    filename (str): The name of the pickle file (without the extension).
+    Returns:
+    str: The full path to the saved pickle file.
+    """
+    file_path = os.path.join(path, filename + '.pickle')
     with open(file_path, 'wb') as f:
-        pickle.dump(df, f)
+        pickle.dump(data, f)
     return file_path
-def write_list_to_txt(input_list: list, path: str, name: str):
+def write_list_to_txt(input_list: list, path: str, name: str) -> str:
     """
     Write a list to a text file, with each item on a new line.
     Parameters:
-    - file_path (str): The path to the text file.
     - input_list (list): The list to be written to the text file.
+    - path (str): The directory path where the text file will be saved.
+    - name (str): The name of the text file (without the extension).
+    Returns:
+    str: The full path to the saved text file.
     """
-    file_path=os.path.join(path, name+'.txt')
+    file_path = os.path.join(path, name + '.txt')
     with open(file_path, 'w') as file:
         for item in input_list:
             file.write(str(item) + '\n')
     return file_path
-def write_jsonl(data: list,  path: str, name: str):
+def write_jsonl(data: list[dict], path: str, name: str) -> str:
     """
-    Write a jsonl file. Function takes as input a list of dict.
+    Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
+    Parameters:
+    - data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
+    - path (str): The directory path where the JSON Lines file will be saved.
+    - name (str): The name of the JSON Lines file (without the extension).
+    Returns:
+    str: The full path to the saved JSON Lines file.
     """
-    file_path=os.path.join(path, name+'.jsonl')
+    file_path = os.path.join(path, name + '.jsonl')
     with open(file_path, 'w') as file:
         for entry in data:
             json.dump(entry, file)
@@ -202,41 +326,67 @@ def write_jsonl(data: list,  path: str, name: str):
     return file_path
-def write_json(json_dict: dict, path: str, name: str):
+def write_json(json_dict: dict, path: str, name: str) -> str:
     """
-    Write a dict into a json file
+    Write a dictionary to a JSON file.
+    Parameters:
+    - json_dict (dict): The dictionary to be written to the JSON file.
+    - path (str): The directory path where the JSON file will be saved.
+    - name (str): The name of the JSON file (without the extension).
+    Returns:
+    str: The full path to the saved JSON file.
     """
-    file_path=os.path.join(path, name+'.json')
+    file_path = os.path.join(path, name + '.json')
     with open(file_path, 'w') as outfile:
         json.dump(json_dict, outfile)
     return file_path
-def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient='records'):
+def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str = 'records') -> str:
     """
-    Write a dataframe into a json file
+    Write a DataFrame to a JSON file.
+    Parameters:
+    - df (pd.DataFrame): The DataFrame to be written to the JSON file.
+    - path (str): The directory path where the JSON file will be saved.
+    - name (str): The name of the JSON file (without the extension).
+    - orient (str, optional): The format of the JSON file. Default is 'records'.
+    Returns:
+    str: The full path to the saved JSON file.
     """
-    file_path=os.path.join(path, name+".json")
+    file_path = os.path.join(path, name + ".json")
     df.to_json(file_path, orient=orient, lines=True)
+    return file_path
-def save_dataframe_excel(df: pd.DataFrame, path: str, name :str, sheet_name:str):
+def save_dataframe_excel(df: pd.DataFrame, path: str, name: str, sheet_name: str) -> str:
     """
-    Write a dataframe into a XLSX file
+    Write a DataFrame to an Excel file.
+    Parameters:
+    - df (pd.DataFrame): The DataFrame to be written to the Excel file.
+    - path (str): The directory path where the Excel file will be saved.
+    - name (str): The name of the Excel file (without the extension).
+    - sheet_name (str): The name of the Excel sheet.
+    Returns:
+    str: The full path to the saved Excel file.
     """
-    file_path=os.path.join(path,  f"{name}.xlsx")
+    file_path = os.path.join(path, f"{name}.xlsx")
     df.to_excel(file_path, sheet_name=sheet_name, index=False)
     print(file_path, "- File created")
     return file_path
-def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_name: str):
+def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_name: str) -> None:
     """
     Adds a DataFrame to an existing Excel file as a new sheet.
     Parameters:
+    - df (pd.DataFrame): The DataFrame to be added.
     - existing_file_path (str): Path to the existing Excel file.
-    - dataframe (pd.DataFrame): The DataFrame to be added.
     - new_sheet_name (str): Name of the new sheet in the Excel file.
     Returns:
@@ -245,7 +395,7 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
     # Read existing Excel file into a dictionary of DataFrames
     excel_file = pd.read_excel(existing_file_path, sheet_name=None)
-    # Add the new DataFrame to the dictionary with the specified sheet aname
+    # Add the new DataFrame to the dictionary with the specified sheet name
     excel_file[new_sheet_name] = df
     # Write the updated dictionary of DataFrames back to the Excel file
@@ -253,46 +403,62 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
         for sheet_name, df in excel_file.items():
             df.to_excel(writer, sheet_name=sheet_name, index=False)
-def save_dataframe_csv(df: pd.DataFrame, path: str, name: str):
+def save_dataframe_csv(df: pd.DataFrame, path: str, name: str) -> str:
     """
-    This function saves a DataFrame to a CSV file within a project directory.
-    :param df: The DataFrame to be saved.
-    :type df: pandas.DataFrame
-    :param dir_csv: The directory where the CSV file will be saved.
-    :type dir_csv: str
-    :param name: The desired name for the CSV file (without extension).
-    :type name: str
+    Save a DataFrame to a CSV file within a specified directory.
+    Parameters:
+    - df (pd.DataFrame): The DataFrame to be saved.
+    - path (str): The directory where the CSV file will be saved.
+    - name (str): The desired name for the CSV file (without extension).
+    Returns:
+    str: The full path to the saved CSV file.
     """
-    names = df.columns
+    file_path = os.path.join(path, f"{name}.csv")
     df.to_csv(
-        os.path.join(path, f"{name}.csv"),
-        header=names,
+        file_path,
         sep=";",
         encoding="utf-8",
         index=False,
         decimal=",",
     )
-    print("FILE SAVED: ", os.path.join(path, f"{name}.csv"))
+    print("File saved:", file_path)
+    return file_path
-def write_txt_file(data: str,  path: str, name: str):
+def write_txt_file(data: str, path: str, name: str) -> str:
     """
-    Write a text file
+    Write a string to a text file.
+    Parameters:
+    - data (str): The string to be written to the text file.
+    - path (str): The directory path where the text file will be saved.
+    - name (str): The name of the text file (without the extension).
+    Returns:
+    str: The full path to the saved text file.
     """
-    file_path=os.path.join(path, name+'.txt')
+    file_path = os.path.join(path, name + '.txt')
     with open(file_path, "w") as file:
         file.write(data)
     return file_path
-def split_df_into_chunks(df, path, name, chunk_size = 10000):
+def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list[str]:
     """
-    Split a dataframe into n pickle files
+    Split a DataFrame into multiple pickle files with a specified chunk size.
+    Parameters:
+    - df (pd.DataFrame): The DataFrame to be split.
+    - path (str): The directory path where the pickle files will be saved.
+    - name (str): The base name for the pickle files.
+    - chunk_size (int, optional): The size of each chunk. Default is 10000.
+    Returns:
+    list[str]: A list of file paths to the saved pickle files.
     """
     num_chunks = -(-len(df) // chunk_size)  # Calculate the number of chunks using ceil division
-    file_paths=[]
+    file_paths = []
     # create smaller datasets of chunk_size each
     for i in range(num_chunks):
@@ -305,16 +471,19 @@ def split_df_into_chunks(df, path, name, chunk_size = 10000):
     return file_paths
 ###################################################################################################
 # FOLDERS / FILES HELPERS
 ###################################################################################################
-def create_dir(path:str):
+def create_dir(path: str) -> str:
     """
-    Create a local directory
+    Create a local directory if it doesn't exist.
+    Parameters:
+    - path (str): The directory path to be created.
+    Returns:
+    str: The path of the created directory.
     """
     if not os.path.exists(path):
         os.makedirs(path)
@@ -322,18 +491,31 @@ def create_dir(path:str):
     return path
-def list_files_in_dir(path: str, filetype:str ='*.json'):
+def list_files_in_dir(path: str, filetype: str = '*.json') -> list[str]:
     """
-    List files of a specific format in a directory
+    List files of a specific format in a directory.
+    Parameters:
+    - path (str): The directory path to search for files.
+    - filetype (str, optional): The file type pattern to search for. Default is '*.json'.
+    Returns:
+    list[str]: A list of file paths matching the specified file type pattern.
     """
     pattern = os.path.join(path, filetype)
     files = glob.glob(pattern)
     return files
-def list_subdirectories(root_directory: str):
+def list_subdirectories(root_directory: str) -> list[str]:
     """
-    List subdirectories in a root directory
+    List subdirectories in a root directory.
+    Parameters:
+    - root_directory (str): The root directory path.
+    Returns:
+    list[str]: A list of subdirectory names.
     """
     subdirectories = []
     for entry in os.scandir(root_directory):
@@ -342,9 +524,15 @@ def list_subdirectories(root_directory: str):
     return subdirectories
-def list_recursive_subdirectories(root_directory: str):
+def list_recursive_subdirectories(root_directory: str) -> list[str]:
     """
-    List recursively all subdirectories from a root directory
+    List recursively all subdirectories from a root directory.
+    Parameters:
+    - root_directory (str): The root directory path.
+    Returns:
+    list[str]: A list of subdirectory paths.
     """
     subdirectories = []
     for root, dirs, files in os.walk(root_directory):
@@ -352,9 +540,16 @@ def list_recursive_subdirectories(root_directory: str):
     return subdirectories
-def list_files_in_subdirectories(path:str, filetype:str='*.json'):
+def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list[str]:
     """
-    Walk through subdirectories of a root directory to list files of a specific format
+    Walk through subdirectories of a root directory to list files of a specific format.
+    Parameters:
+    - path (str): The root directory path.
+    - filetype (str, optional): The file type pattern to search for. Default is '*.json'.
+    Returns:
+    list[str]: A list of file paths matching the specified file type pattern in subdirectories.
     """
     files = []
@@ -369,21 +564,36 @@ def list_files_in_subdirectories(path:str, filetype:str='*.json'):
     return files
-def copy_file(source_path: str, destination_path: str, new_filename:str):
+def copy_file(source_path: str, destination_path: str, new_filename: str = '') -> str:
     """
-    Function to copy a file to another path
+    Copy a file from a source path to a destination path.
+    Parameters:
+    - source_path (str): The path of the source file.
+    - destination_path (str): The path of the destination directory.
+    - new_filename (str, optional): The new filename. If not provided, the original filename is used.
+    Returns:
+    str: The path of the copied file.
     """
     if new_filename:
-        file_path=os.path.join(destination_path, new_filename)
+        file_path = os.path.join(destination_path, new_filename)
     else:
-        filename=os.path.basename(source_path)
-        file_path=os.path.join(destination_path,filename)
+        filename = os.path.basename(source_path)
+        file_path = os.path.join(destination_path, filename)
     shutil.copy(source_path, file_path)
     return file_path
-def remove_file(file_path):
+def remove_file(file_path: str) -> None:
     """
-    Remove a single file
+    Remove a single file.
+    Parameters:
+    - file_path (str): The path of the file to be removed.
+    Returns:
+    None
     """
     try:
         os.remove(file_path)
@@ -391,20 +601,33 @@ def remove_file(file_path):
     except OSError as e:
         print(f"Error removing file {file_path}: {e}")
-def remove_folder(folder_path):
+def remove_folder(folder_path: str) -> None:
     """
-    Remove a folder and all the files inside
+    Remove a folder and all its contents.
+    Parameters:
+    - folder_path (str): The path of the folder to be removed.
+    Returns:
+    None
     """
     try:
         shutil.rmtree(folder_path)
         print(f"Folder {folder_path} and its contents removed successfully.")
     except OSError as e:
-        print(f"Error removing folder {folder_path}: {e}")
+        print(f"Error removing folder {folder_path}: {e}")
-def get_file_size(file_path):
+def get_file_size(file_path: str) -> tuple[int, str]:
     """
-    Get a single file size in a readable format (KB, MB, GB)
+    Get the size of a single file in a readable format (KB, MB, GB).
+    Parameters:
+    - file_path (str): The path of the file.
+    Returns:
+    tuple[int, str]: A tuple containing the size of the file in bytes and its formatted size.
+    If the file is not found, returns None.
     """
     try:
         size = os.path.getsize(file_path)
@@ -427,9 +650,16 @@ def get_file_size(file_path):
         print(f"File not found: {file_path}")
         return None
-def get_folder_size(folder_path):
+def get_folder_size(folder_path: str) -> tuple[int, str]:
     """
-    Get size of all files contained in a folder in a readable format (KB, MB, GB)
+    Get the size of all files contained in a folder in a readable format (KB, MB, GB).
+    Parameters:
+    - folder_path (str): The path of the folder.
+    Returns:
+    tuple[int, str]: A tuple containing the total size of all files in bytes and its formatted size.
+    If the folder is not found, returns None.
     """
     total_size = 0
@@ -457,9 +687,16 @@ def get_folder_size(folder_path):
         print(f"Folder not found: {folder_path}")
         return None
-def file_creation_date(file_path):
+def file_creation_date(file_path: str) -> datetime:
     """
-    Return the last update timestamp
+    Return the last update timestamp of a file.
+    Parameters:
+    - file_path (str): The path of the file.
+    Returns:
+    datetime: The last update timestamp as a datetime object.
+    If the file does not exist, returns None.
     """
     # Check if the file exists
     if os.path.exists(file_path):
@@ -476,27 +713,34 @@ def file_creation_date(file_path):
 ############################################################################
-def transform_to_n_items_list(input_list : list, n: int):
+def transform_to_n_items_list(lst: list, n: int) -> list[list]:
     """
     Transform a list into a list of n-items sublists.
     Parameters:
-    - input_list: The input list to be transformed.
-    - n: The number of items in each sublist.
+    - lst (list): The input list to be transformed.
+    - n (int): The number of items in each sublist.
     Returns:
-    A list of n-items sublists.
+    list[list]: A list of n-items sublists.
     """
-    return [input_list[i:i + n] for i in range(0, len(input_list), n)]
+    return [lst[i:i + n] for i in range(0, len(lst), n)]
-def unduplicate_list(lst):
+def unduplicate_list(lst: list) -> list:
     """
-    Unduplicate elements of a list
+    Remove duplicate elements from a list.
+    Parameters:
+    - lst (list): The input list with possible duplicate elements.
+    Returns:
+    list: A list with duplicate elements removed.
     """
     return list(set(lst))
-def sort_list(lst, reverse=False):
+def sort_list(lst: list, reverse: bool = False) -> list:
     """
     Sort the list in ascending or descending order.
@@ -506,12 +750,12 @@ def sort_list(lst, reverse=False):
                      If False (default), sort the list in ascending order.
     Returns:
-    - list: A new list sorted based on the specified order.
+    list: A new list sorted based on the specified order.
     """
     return sorted(lst, reverse=reverse)
-def map_list(lst, function):
+def map_list(lst: list, function: callable) -> list:
     """
     Apply a function to each element of the list.
@@ -520,12 +764,12 @@ def map_list(lst, function):
     - function (callable): The function to apply to each element.
     Returns:
-    - list: A new list with the function applied to each element.
+    list: A new list with the function applied to each element.
     """
     return [function(element) for element in lst]
-def flatten_list(lst):
+def flatten_list(lst: list) -> list:
     """
     Flatten a nested list into a single list.
@@ -533,7 +777,7 @@ def flatten_list(lst):
     - lst (list): The input nested list.
     Returns:
-    - list: A new list with all nested elements flattened.
+    list: A new list with all nested elements flattened.
     """
     flattened_list = []
@@ -548,7 +792,7 @@ def flatten_list(lst):
     return flattened_list
-def find_occurrences(lst, element):
+def find_occurrences(lst: list, element) -> int:
     """
     Find the occurrences of a specific element in the list.
@@ -557,12 +801,12 @@ def find_occurrences(lst, element):
     - element: The element to find occurrences of.
     Returns:
-    - int: The number of occurrences of the specified element in the list.
+    int: The number of occurrences of the specified element in the list.
     """
     return lst.count(element)
-def is_subset(subset, superset):
+def is_subset(subset: list, superset: list) -> bool:
     """
     Check if one list is a subset of another.
@@ -571,11 +815,11 @@ def is_subset(subset, superset):
     - superset (list): The superset list.
     Returns:
-    - bool: True if the subset is a subset of the superset, False otherwise.
+    bool: True if the subset is a subset of the superset, False otherwise.
     """
     return all(element in superset for element in subset)
-def common_elements(list1, list2):
+def common_elements(list1: list, list2: list) -> list:
     """
     Find the common elements between two lists.
@@ -584,12 +828,12 @@ def common_elements(list1, list2):
     - list2 (list): The second list.
     Returns:
-    - list: A new list containing the common elements between list1 and list2.
+    list: A new list containing the common elements between list1 and list2.
     """
     return list(set(list1) & set(list2))
-def shuffle_list(lst):
+def shuffle_list(lst: list) -> list:
     """
     Shuffle the elements of the list randomly.
@@ -597,14 +841,14 @@ def shuffle_list(lst):
     - lst (list): The input list.
     Returns:
-    - list: A new list with the elements shuffled randomly.
+    list: A new list with the elements shuffled randomly.
     """
     shuffled_list = lst.copy()
     random.shuffle(shuffled_list)
     return shuffled_list
-def sample_list(lst, sample_size):
+def sample_list(lst: list, sample_size) -> list:
     """
     Sample a list based on an integer or a float representing the sample size.
@@ -614,7 +858,11 @@ def sample_list(lst, sample_size):
                                  If a float, the percentage of elements to keep.
     Returns:
-    - list: A new list containing the sampled elements.
+    list: A new list containing the sampled elements.
+    Raises:
+    - ValueError: If the sample size is invalid (negative integer or float outside [0, 1]).
+    - TypeError: If the sample size is neither an integer nor a float.
     """
     if isinstance(sample_size, int):
         if sample_size < 0:
@@ -628,7 +876,7 @@ def sample_list(lst, sample_size):
     else:
         raise TypeError("Sample size must be an integer or a float.")
-def count_elements(lst):
+def count_elements(lst: list) -> dict:
     """
     Count the occurrences of each element in the list.
@@ -636,46 +884,70 @@ def count_elements(lst):
     - lst (list): The input list.
     Returns:
-    - dict: A dictionary where keys are unique elements from the list, and values are their counts.
+    dict: A dictionary where keys are unique elements from the list, and values are their counts.
     """
     return dict(Counter(lst))
-def scale_list(lst, min_val=1, max_val=5):
+def scale_list(lst: list, min_val: float = 1, max_val: float = 5) -> list:
+    """
+    Scale the values of a list to a specified range.
+    Parameters:
+    - lst (list): The input list of values to be scaled.
+    - min_val (float): The minimum value of the output range (default is 1).
+    - max_val (float): The maximum value of the output range (default is 5).
+    Returns:
+    - list: A new list with values scaled to the specified range.
+    """
     min_w = min(lst)
     max_w = max(lst)
-    scaled_w = [ ]
+    scaled_w = []
     for x in lst:
         try:
             scaled_value = (x - min_w) / (max_w - min_w) * (max_val - min_val) + min_val
-        except :
-            pass
+        except ZeroDivisionError:
             scaled_value = min_val
         scaled_w.append(scaled_value)
     return scaled_w
-def df_scale_column(df, col_to_scale, col_out, min_val, max_val):
+def df_scale_column(df: pd.DataFrame, col_to_scale: str, col_out: str, min_val: float, max_val: float) -> pd.DataFrame:
+    """
+    Scale values in a DataFrame column to a specified range.
+    Parameters:
+    - df (pd.DataFrame): The input DataFrame.
+    - col_to_scale (str): The name of the column to be scaled.
+    - col_out (str): The name of the new column to store scaled values.
+    - min_val (float): The minimum value of the output range.
+    - max_val (float): The maximum value of the output range.
+    Returns:
+    - pd.DataFrame: The DataFrame with a new column containing scaled values.
+    """
     min_freq = df[col_to_scale].min()
     max_freq = df[col_to_scale].max()
-    df[col_out] = df[col_to_scale].apply(lambda x : ((x - min_freq) / (max_freq - min_freq)) * (max_val - min_val) + min_val)
+    df[col_out] = df[col_to_scale].apply(lambda x: ((x - min_freq) / (max_freq - min_freq)) * (max_val - min_val) + min_val)
     return df
 ############################################################################
 # ZIP HELPERS
 ############################################################################
-def zip_file(source_file_path, zip_file_path, name):
+def zip_file(source_file_path: str, zip_file_path: str, name: str) -> str:
     """
     Zip a single file.
-    Args:
-        source_file_path (str): Path to the file to be zipped.
-        zip_file_path (str): Path for the resulting zip file.
+    Parameters:
+    - source_file_path (str): Path to the file to be zipped.
+    - zip_file_path (str): Path for the resulting zip file.
+    - name (str): Name for the resulting zip file (without extension).
     Returns:
-        None
+    str: Path to the resulting zip file.
     """
-    file_path=os.path.join(zip_file_path, name+".zip")
+    file_path = os.path.join(zip_file_path, f"{name}.zip")
     with zipfile.ZipFile(file_path, 'w') as zip_file:
         # The second argument to `arcname` is used to set the name of the file inside the zip
@@ -683,18 +955,19 @@ def zip_file(source_file_path, zip_file_path, name):
     return file_path
-def zip_folder(source_folder_path, zip_file_path, name):
+def zip_folder(source_folder_path: str, zip_file_path: str, name: str) -> str:
     """
     Zip an entire folder.
-    Args:
-        source_folder_path (str): Path to the folder to be zipped.
-        zip_file_path (str): Path for the resulting zip file.
+    Parameters:
+    - source_folder_path (str): Path to the folder to be zipped.
+    - zip_file_path (str): Path for the resulting zip file.
+    - name (str): Name for the resulting zip file (without extension).
     Returns:
-        None
+    str: Path to the resulting zip file.
     """
-    file_path=os.path.join(zip_file_path, name+".zip")
+    file_path = os.path.join(zip_file_path, f"{name}.zip")
     with zipfile.ZipFile(file_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
         for foldername, subfolders, filenames in os.walk(source_folder_path):
@@ -705,13 +978,19 @@ def zip_folder(source_folder_path, zip_file_path, name):
     return file_path
-def unzip_file(zip_file_path, destination_path):
+def unzip_file(zip_file_path: str, destination_path: str) -> None:
     """
-    unzip a zip file
+    Unzip a zip file.
+    Parameters:
+    - zip_file_path (str): Path to the zip file to be unzipped.
+    - destination_path (str): Path where the contents of the zip file will be extracted.
+    Returns:
+    None
     """
     with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
         zip_ref.extractall(destination_path)
 ############################################################################
@@ -719,19 +998,32 @@ def unzip_file(zip_file_path, destination_path):
 ############################################################################
-def create_google_spreadsheet_client(credentials:str):
+def create_google_spreadsheet_client(credentials: str):
     """
-    Create a Gspread client to interact with Google Sheets
+    Create a Gspread client to interact with Google Sheets.
+    Parameters:
+    - credentials (str): Path to the JSON file containing Google Service Account credentials.
+    Returns:
+    gspread.Client: A client object for interacting with Google Sheets.
     """
     return gspread.service_account(filename=credentials)
-def read_google_spreadsheet(client, sheet_id: str, worksheet_name: str):
+def read_google_spreadsheet(client: gspread.Client, sheet_id: str, worksheet_name: str) -> pd.DataFrame:
     """
-    Function to read a Google spreadsheet in a DataFrame
+    Read data from a Google spreadsheet and return it as a DataFrame.
+    Parameters:
+    - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+    - sheet_id (str): The ID of the Google spreadsheet.
+    - worksheet_name (str): The name of the worksheet within the spreadsheet.
+    Returns:
+    pd.DataFrame: A DataFrame containing the data from the specified worksheet.
     """
     try:
-        # Open the Google Spreadsheet by name
+        # Open the Google Spreadsheet by ID
         sheet = client.open_by_key(sheet_id)
         # Select a specific worksheet by name
@@ -751,29 +1043,52 @@ def read_google_spreadsheet(client, sheet_id: str, worksheet_name: str):
         print(f"An error occurred: {e}")
-def list_google_worksheets(client, sheet_id:str):
+def list_google_worksheets(client: gspread.Client, sheet_id: str) -> list:
     """
-    Return a list of worksheet names for a spreadsheet ID
+    Return a list of worksheet names for a spreadsheet ID.
+    Parameters:
+    - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+    - sheet_id (str): The ID of the Google spreadsheet.
+    Returns:
+    list: A list of worksheet names.
     """
     sheet = client.open_by_key(sheet_id)
     worksheet_obj = sheet.worksheets()
     worksheet_list = [sheet.title for sheet in worksheet_obj]
     return worksheet_list
-def get_spreadsheet_permissions(client, sheet_id:str):
+def get_spreadsheet_permissions(client: gspread.Client, sheet_id: str) -> pd.DataFrame:
     """
-    Return a DataFrame with the list of user email and type that can access the document
+    Return a DataFrame with the list of user email and type that can access the document.
+    Parameters:
+    - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+    - sheet_id (str): The ID of the Google spreadsheet.
+    Returns:
+    pd.DataFrame: A DataFrame containing the list of user email addresses and their access types.
     """
     sheet = client.open_by_key(sheet_id)
-    permissions=sheet.list_permissions()
-    user_list=[(user.get("emailAddress"),user.get("type")) for user in permissions if user.get("emailAddress") is not None]
+    permissions = sheet.list_permissions()
+    user_list = [(user.get("emailAddress"), user.get("type")) for user in permissions if user.get("emailAddress") is not None]
     df = pd.DataFrame(user_list, columns=['email', 'type'])
     return df
-def create_google_spreadsheet(client, df, filename:str, worksheet_name:str = "Sheet1"):
+def create_google_spreadsheet(client: gspread.Client, df: pd.DataFrame, filename: str, worksheet_name: str = "Sheet1") -> gspread.Spreadsheet:
     """
-    Load a dataframe in a new spreadsheet
+    Create a new Google spreadsheet and load a DataFrame into it.
+    Parameters:
+    - client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
+    - df (pd.DataFrame): The DataFrame to be loaded into the spreadsheet.
+    - filename (str): The desired filename for the new spreadsheet.
+    - worksheet_name (str, optional): The name of the worksheet within the spreadsheet. Defaults to "Sheet1".
+    Returns:
+    gspread.Spreadsheet: The created spreadsheet object.
     """
     spreadsheet = client.create(filename)
     worksheet = spreadsheet.sheet1
@@ -783,17 +1098,34 @@ def create_google_spreadsheet(client, df, filename:str, worksheet_name:str = "Sh
     return spreadsheet
-def share_google_spreadsheet(spreadsheet, email, user_type="user", user_role="writer", notify=False, email_message=None, with_link=False):
+def share_google_spreadsheet(spreadsheet: gspread.Spreadsheet, email: str, user_type: str = "user", user_role: str = "writer", notify: bool = False, email_message: str = None, with_link: bool = False) -> gspread.Spreadsheet:
     """
-    Share a spreadsheet with a user
+    Share a spreadsheet with a user.
+    Parameters:
+    - spreadsheet (gspread.Spreadsheet): The Google spreadsheet object to be shared.
+    - email (str): The email address of the user with whom the spreadsheet will be shared.
+    - user_type (str, optional): The permission type for the user. Defaults to "user".
+    - user_role (str, optional): The role assigned to the user. Defaults to "writer".
+    - notify (bool, optional): Whether to notify the user about the sharing. Defaults to False.
+    - email_message (str, optional): The message to include in the notification email.
+    - with_link (bool, optional): Whether to include a link to the shared document in the notification email. Defaults to False.
+    Returns:
+    gspread.Spreadsheet: The updated spreadsheet object.
     """
-    spreadsheet.share(email, perm_type=user_type, role=user_role, notify = notify, email_message=email_message, with_link=with_link)
+    spreadsheet.share(email, perm_type=user_type, role=user_role, notify=notify, email_message=email_message, with_link=with_link)
     return spreadsheet
-def generate_short_id(variables : dict):
+def generate_short_id(variables: dict) -> tuple[str, str]:
     """
-    Generate a 8 characters ID using a dict as input
+    Generate an 8-character ID using a dictionary as input.
+    Parameters:
+    - variables (dict): A dictionary containing the variables to be serialized.
+    Returns:
+    tuple: A tuple containing the generated short ID and the serialized variables.
     """
     # Serialize variables into JSON string
     serialized_variables = json.dumps(variables, sort_keys=True)
@@ -803,7 +1135,7 @@ def generate_short_id(variables : dict):
     short_id = hash_value[:8]
     return short_id, serialized_variables
-def df_transform_column_as_list(column):
+def df_transform_column_as_list(column: pd.Series) -> pd.Series:
     def transform(cell):
         if isinstance(cell, str):
             # Check if it's a list formatted as string, and convert to list
@@ -812,9 +1144,7 @@ def df_transform_column_as_list(column):
             else:
                 try:
                     values = ast.literal_eval(cell)
                 except Exception as e:
-                    pass
                     # If it's a single URL as string, make it a list
                     values = [cell]
         elif isinstance(cell, (int, float, bool)):
@@ -832,7 +1162,11 @@ def df_transform_column_as_list(column):
     return column.apply(transform)
-def top_rows_per_category(df, col_to_sort, col_to_gb, cols_to_keep, top_rows) :
+def top_rows_per_category(df: pd.DataFrame,
+                          col_to_sort: str,
+                          col_to_gb: str,
+                          cols_to_keep: list[str],
+                          top_rows: int) -> pd.DataFrame:
     """
     Select top rows for each category in a dataframe
     """
@@ -842,3 +1176,44 @@ def top_rows_per_category(df, col_to_sort, col_to_gb, cols_to_keep, top_rows) :
                  .reset_index(drop=True)
                 )[cols_to_keep]
     return df_gb
+def format_number(number: int) -> str:
+    """
+    Function to format a number in K, M or B
+    """
+    if number < 1000:
+        return str(number)
+    elif number < 1000000:
+        return f"{number / 1000:.1f}K"
+    elif number < 1000000000:
+        return f"{number / 1000000:.1f}M"
+    else:
+        return f"{number / 1000000000:.1f}B"
+def unrar_file(rar_file_path : str, output_dir : str) -> None:
+    """
+    Extracts a .rar file to the specified output directory using the unrar command.
+    Parameters:
+    rar_file_path (str): The path to the .rar file.
+    output_dir (str): The directory where the contents should be extracted.
+    Returns:
+    None
+    """
+    try:
+        # Ensure the output directory exists
+        subprocess.run(['mkdir', '-p', output_dir], check=True)
+        # Run the unrar command
+        result = subprocess.run(['unrar', 'x', '-y', rar_file_path, output_dir],
+                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        # Check if the extraction was successful
+        if result.returncode != 0:
+            print(f"Extraction failed. Error: {result.stderr}")
+    except Exception as e:
+        print(f"An error occurred: {e}")

opsci-toolbox 0.0.2__py3-none-any.whl → 0.0.6__py3-none-any.whl

opsci-toolbox 0.0.2__py3-none-any.whl → 0.0.6__py3-none-any.whl