rgwfuncs 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rgwfuncs/df_lib.py +730 -217
- rgwfuncs-0.0.4.dist-info/METADATA +999 -0
- rgwfuncs-0.0.4.dist-info/RECORD +8 -0
- rgwfuncs-0.0.2.dist-info/METADATA +0 -325
- rgwfuncs-0.0.2.dist-info/RECORD +0 -8
- {rgwfuncs-0.0.2.dist-info → rgwfuncs-0.0.4.dist-info}/LICENSE +0 -0
- {rgwfuncs-0.0.2.dist-info → rgwfuncs-0.0.4.dist-info}/WHEEL +0 -0
- {rgwfuncs-0.0.2.dist-info → rgwfuncs-0.0.4.dist-info}/entry_points.txt +0 -0
- {rgwfuncs-0.0.2.dist-info → rgwfuncs-0.0.4.dist-info}/top_level.txt +0 -0
rgwfuncs/df_lib.py
CHANGED
@@ -18,22 +18,30 @@ import sqlite3
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from email.mime.base import MIMEBase
+from email import encoders
 from googleapiclient.discovery import build
 import base64
-
+import inspect
+from typing import Optional, Callable, Dict, List, Tuple, Any


 def docs(method_type_filter: Optional[str] = None) -> None:
     """
-    Print a list of function names in alphabetical order. If
-    print the docstrings of the functions
+    Print a list of function names in alphabetical order. If
+    method_type_filter is specified, print the docstrings of the functions
+    that match the filter. Using '*' as a filter will print the docstrings for
+    all functions.

     Parameters:
-        method_type_filter: Optional filter string, comma-separated
+        method_type_filter: Optional filter string, comma-separated to select
+            docstring types, or '*' for all.
     """
     # Get the current module's namespace
+    current_module = __name__
+
     local_functions: Dict[str, Callable] = {
-        name: obj for name, obj in globals().items()
+        name: obj for name, obj in globals().items()
+        if inspect.isfunction(obj) and obj.__module__ == current_module
     }

     # List of function names sorted alphabetically
@@ -44,23 +52,28 @@ def docs(method_type_filter: Optional[str] = None) -> None:
     for name in function_names:
         print(name)

-    # If a filter is provided, print the docstrings of functions
+    # If a filter is provided or '*', print the docstrings of functions
     if method_type_filter:
-        function_type_list: List[str] = [mt.strip() for mt in method_type_filter.split(',')]
         print("\nFiltered function documentation:")
-
        for name, func in local_functions.items():
             docstring: Optional[str] = func.__doc__
             if docstring:
-
-
-
-
-
-
-                if
-
-
+                if method_type_filter == '*':
+                    # Print the entire docstring for each function
+                    print(f"\n{name}:\n{docstring}")
+                else:
+                    # Extract only the first line of the docstring
+                    first_line: str = docstring.split('\n')[0]
+                    if "::" in first_line:
+                        # Find the first occurrence of "::" and split there
+                        split_index: int = first_line.find("::")
+                        function_type: str = first_line[:split_index].strip()
+                        function_type_list: List[str] = [
+                            mt.strip() for mt in method_type_filter.split(',')]
+                        if function_type in function_type_list:
+                            # Print the entire docstring if the filter matches
+                            print(f"\n{name}:\n{docstring}")
+

 def numeric_clean(
     df: pd.DataFrame,
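For reference, a minimal usage sketch of the reworked docs() filter (illustrative only; it assumes the module is importable as rgwfuncs.df_lib, and the 'LOAD, CLEAN' tags are hypothetical docstring type prefixes of the form "TYPE:: ..."):

    from rgwfuncs import df_lib

    # Print every function name plus its full docstring.
    df_lib.docs(method_type_filter='*')

    # Print docstrings only for functions whose first docstring line starts
    # with one of these (made-up) type tags before "::".
    df_lib.docs(method_type_filter='LOAD, CLEAN')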
@@ -70,34 +83,45 @@ def numeric_clean(
 ) -> pd.DataFrame:
     """
     Cleans the numeric columns based on specified treatments.
-
+
     Parameters:
         df: The DataFrame to clean.
-        column_names: A comma-separated string containing the names of the
-
-
+        column_names: A comma-separated string containing the names of the
+            columns to clean.
+        column_type: The type to convert the column to ('INTEGER' or
+            'FLOAT').
+        irregular_value_treatment: How to treat irregular values ('NAN',
+            'TO_ZERO', 'MEAN').

     Returns:
         A new DataFrame with cleaned numeric columns.
     """
     df_copy = df.copy()  # Avoid mutating the original DataFrame
-    columns_list: List[str] = [name.strip()
+    columns_list: List[str] = [name.strip()
+                               for name in column_names.split(',')]

     for column_name in columns_list:
         if column_name not in df_copy.columns:
-            raise ValueError(
+            raise ValueError(
+                f"Column '{column_name}' does not exist in the DataFrame.")

         if column_type not in ['INTEGER', 'FLOAT']:
             raise ValueError("column_type must be 'INTEGER' or 'FLOAT'.")

         if irregular_value_treatment not in ['NAN', 'TO_ZERO', 'MEAN']:
-            raise ValueError(
+            raise ValueError(
+                "irregular_value_treatment must be 'NAN', 'TO_ZERO', or"
+                + "'MEAN'.")

         # Convert column type
         if column_type == 'INTEGER':
-            df_copy[column_name] = pd.to_numeric(
+            df_copy[column_name] = pd.to_numeric(
+                df_copy[column_name],
+                errors='coerce').astype(
+                pd.Int64Dtype())
         elif column_type == 'FLOAT':
-            df_copy[column_name] = pd.to_numeric(
+            df_copy[column_name] = pd.to_numeric(
+                df_copy[column_name], errors='coerce').astype(float)

         # Handle irregular values
         if irregular_value_treatment == 'NAN':
@@ -110,6 +134,7 @@ def numeric_clean(

     return df_copy

+
 def limit_dataframe(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:
     """
     Limit the DataFrame to a specified number of rows.
@@ -120,15 +145,16 @@ def limit_dataframe(df: pd.DataFrame, num_rows: int) -> pd.DataFrame:

     Returns:
         A new DataFrame limited to the specified number of rows.
-
+
     Raises:
         ValueError: If num_rows is not an integer.
     """
     if not isinstance(num_rows, int):
         raise ValueError("The number of rows should be an integer.")
-
+
     return df.head(num_rows)

+
 def from_raw_data(headers: List[str], data: List[List[int]]) -> pd.DataFrame:
     """
     Create a DataFrame from raw data.
@@ -150,13 +176,15 @@ def from_raw_data(headers: List[str], data: List[List[int]]) -> pd.DataFrame:

     return df

+
 def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
     """
     Append rows to the DataFrame.

     Parameters:
         df: The original DataFrame.
-        rows: A list of lists, where each inner list represents a row to be
+        rows: A list of lists, where each inner list represents a row to be
+            appended.

     Returns:
         A new DataFrame with the appended rows.
@@ -164,7 +192,12 @@ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
     Raises:
         ValueError: If rows are not in the correct format.
     """
-    if not isinstance(
+    if not isinstance(
+            rows,
+            list) or not all(
+            isinstance(
+                row,
+                list) for row in rows):
         raise ValueError("Rows should be provided as a list of lists.")

     if df.empty:
@@ -175,6 +208,7 @@ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:

     return new_df

+
 def append_columns(df: pd.DataFrame, *col_names: str) -> pd.DataFrame:
     """
     Append columns to the DataFrame with None values.
@@ -198,6 +232,7 @@ def append_columns(df: pd.DataFrame, *col_names: str) -> pd.DataFrame:

     return new_df

+
 def update_rows(
     df: pd.DataFrame,
     condition: str,
@@ -209,7 +244,8 @@ def update_rows(
     Parameters:
         df: The original DataFrame.
         condition: A query condition to identify rows for updating.
-        updates: A dictionary with column names as keys and new values as
+        updates: A dictionary with column names as keys and new values as
+            values.

     Returns:
         A new DataFrame with the updated rows.
@@ -227,7 +263,9 @@ def update_rows(

     invalid_cols = [col for col in updates if col not in df.columns]
     if invalid_cols:
-        raise ValueError(
+        raise ValueError(
+            f"Columns {
+                ', '.join(invalid_cols)} do not exist in the DataFrame.")

     new_df = df.copy()
     for col_name, new_value in updates.items():
@@ -235,6 +273,7 @@ def update_rows(

     return new_df

+
 def delete_rows(df: pd.DataFrame, condition: str) -> pd.DataFrame:
     """
     Delete rows from the DataFrame based on a condition.
@@ -258,6 +297,7 @@ def delete_rows(df: pd.DataFrame, condition: str) -> pd.DataFrame:

     return new_df

+
 def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
     """
     Drop duplicate rows in the DataFrame, retaining the first occurrence.
@@ -267,7 +307,7 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:

     Returns:
         A new DataFrame with duplicates removed.
-
+
     Raises:
         ValueError: If the DataFrame is None.
     """
@@ -275,54 +315,73 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
         raise ValueError("DataFrame is not initialized.")
     return df.drop_duplicates(keep='first')

-
+
+def drop_duplicates_retain_first(
+        df: pd.DataFrame,
+        columns: Optional[str] = None) -> pd.DataFrame:
     """
-    Drop duplicate rows in the DataFrame based on specified columns, retaining
+    Drop duplicate rows in the DataFrame based on specified columns, retaining
+    the first occurrence.

     Parameters:
         df: The DataFrame from which duplicates will be dropped.
-        columns: A comma-separated string with the column names used to
+        columns: A comma-separated string with the column names used to
+            identify duplicates.

     Returns:
         A new DataFrame with duplicates removed.
-
+
     Raises:
         ValueError: If the DataFrame is None.
     """
     if df is None:
         raise ValueError("DataFrame is not initialized.")
-
-    columns_list = [col.strip()
+
+    columns_list = [col.strip()
+                    for col in columns.split(',')] if columns else None
     return df.drop_duplicates(subset=columns_list, keep='first')

-
+
+def drop_duplicates_retain_last(
+        df: pd.DataFrame,
+        columns: Optional[str] = None) -> pd.DataFrame:
     """
-    Drop duplicate rows in the DataFrame based on specified columns, retaining
+    Drop duplicate rows in the DataFrame based on specified columns, retaining
+    the last occurrence.

     Parameters:
         df: The DataFrame from which duplicates will be dropped.
-        columns: A comma-separated string with the column names used to
+        columns: A comma-separated string with the column names used to
+            identify duplicates.

     Returns:
         A new DataFrame with duplicates removed.
-
+
     Raises:
         ValueError: If the DataFrame is None.
     """
     if df is None:
         raise ValueError("DataFrame is not initialized.")
-
-    columns_list = [col.strip()
+
+    columns_list = [col.strip()
+                    for col in columns.split(',')] if columns else None
     return df.drop_duplicates(subset=columns_list, keep='last')

-
+
+def load_data_from_query(
+        db_preset_name: str,
+        query: str,
+        config_file_name: str = "rgwml.config") -> pd.DataFrame:
     """
-    Load data from a database query into a DataFrame based on a configuration
+    Load data from a database query into a DataFrame based on a configuration
+    preset.

     Parameters:
-        db_preset_name: The name of the database preset in the configuration
+        db_preset_name: The name of the database preset in the configuration
+            file.
         query: The SQL query to execute.
-        config_file_name: Name of the configuration file
+        config_file_name: Name of the configuration file
+            (default: 'rgwml.config').

     Returns:
         A DataFrame containing the query result.
@@ -344,45 +403,56 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
         for root, dirs, files in os.walk(path):
             if filename in files:
                 return os.path.join(root, filename)
-        raise FileNotFoundError(
+        raise FileNotFoundError(
+            f"{filename} not found in Desktop, Documents, or Downloads"
+            + "folders")

     def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
-        """Execute a query on an MSSQL database and return the result as a DataFrame."""
         server = db_preset['host']
         user = db_preset['username']
         password = db_preset['password']
         database = db_preset.get('database', '')

-        with pymssql.connect(server=server, user=user, password=password,
+        with pymssql.connect(server=server, user=user, password=password,
+                             database=database) as conn:
             with conn.cursor() as cursor:
                 cursor.execute(query)
                 rows = cursor.fetchall()
                 columns = [desc[0] for desc in cursor.description]
-
+
         return pd.DataFrame(rows, columns=columns)

     def query_mysql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
-        """Execute a query on a MySQL database and return the result as a DataFrame."""
         host = db_preset['host']
         user = db_preset['username']
         password = db_preset['password']
         database = db_preset.get('database', '')

-        with mysql.connector.connect(
+        with mysql.connector.connect(
+                host=host,
+                user=user,
+                password=password,
+                database=database
+        ) as conn:
             with conn.cursor() as cursor:
                 cursor.execute(query)
                 rows = cursor.fetchall()
-                columns =
-
+                columns = (
+                    [desc[0] for desc in cursor.description]
+                    if cursor.description
+                    else []
+                )
+
         return pd.DataFrame(rows, columns=columns)

-    def query_clickhouse(
-
+    def query_clickhouse(
+            db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
+
         host = db_preset['host']
         user = db_preset['username']
         password = db_preset['password']
         database = db_preset['database']
-
+
         max_retries = 5
         retry_delay = 5

@@ -405,21 +475,23 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
                 print(f"Retrying in {retry_delay} seconds...")
                 time.sleep(retry_delay)
             else:
-                raise ConnectionError(
+                raise ConnectionError(
+                    "All attempts to connect to ClickHouse failed.")

-    def query_google_big_query(
-
+    def query_google_big_query(
+            db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
         json_file_path = db_preset['json_file_path']
         project_id = db_preset['project_id']

-        credentials = service_account.Credentials.from_service_account_file(
+        credentials = service_account.Credentials.from_service_account_file(
+            json_file_path)
         client = bigquery.Client(credentials=credentials, project=project_id)

         query_job = client.query(query)
         results = query_job.result()
         rows = [list(row.values()) for row in results]
         columns = [field.name for field in results.schema]
-
+
         return pd.DataFrame(rows, columns=columns)

     # Read the configuration file to get the database preset
@@ -428,7 +500,9 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
         config = json.load(f)

     db_presets = config.get('db_presets', [])
-    db_preset = next(
+    db_preset = next(
+        (preset for preset in db_presets if preset['name'] == db_preset_name),
+        None)
    if not db_preset:
         raise ValueError(f"No matching db_preset found for {db_preset_name}")

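For context, load_data_from_query resolves db_preset_name against the 'db_presets' list in rgwml.config. A minimal sketch of writing one such preset follows; the key names mirror those read by the function above, while the db_type value and the credentials are placeholders, not values documented by the package:

    import json

    preset = {
        "name": "analytics_mysql",   # looked up via preset['name']
        "db_type": "mysql",          # assumed value; dispatch key shown in the diff
        "host": "localhost",
        "username": "user",
        "password": "secret",
        "database": "analytics",
    }
    with open("rgwml.config", "w") as f:
        json.dump({"db_presets": [preset]}, f, indent=2)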
@@ -446,7 +520,6 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
         raise ValueError(f"Unsupported db_type: {db_type}")


-
 def load_data_from_path(file_path: str) -> pd.DataFrame:
     """
     Load data from a file into a DataFrame based on the file extension.
@@ -460,7 +533,7 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:
     Raises:
         ValueError: If the file extension is unsupported.
     """
-
+
     def load_hdf5(file_path: str) -> pd.DataFrame:
         """Helper function to load HDF5 files and select a key if necessary."""
         with pd.HDFStore(file_path, mode='r') as store:
@@ -476,7 +549,8 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:
                 df = pd.read_hdf(file_path, key=key)
                 break
             else:
-                print(
+                print(
+                    f"Key '{key}' is not in the available keys.")
         return df

     # Ensure the file path is absolute
@@ -510,7 +584,8 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:

 def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
     """
-    Execute a query on a SQLite database specified by its path and return the
+    Execute a query on a SQLite database specified by its path and return the
+    results as a DataFrame.

     Parameters:
         sqlite_path: The absolute path to the SQLite database file.
@@ -522,7 +597,7 @@ def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
     Raises:
         ValueError: If there is a problem executing the query.
     """
-
+
     # Ensure the file path is absolute
     sqlite_path = os.path.abspath(sqlite_path)

@@ -535,68 +610,168 @@ def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
     gc.collect()
     return df

+
 def first_n_rows(df: pd.DataFrame, n: int) -> None:
-    """
+    """
+    Display the first n rows of the DataFrame.
+
+    This function prints out the first `n` rows of a given DataFrame. Each row
+    is formatted for clarity and
+    printed as a dictionary. If the DataFrame is empty or `None`, it raises a
+    ValueError.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame to display rows from.
+    - n (int): The number of rows to display from the start of the DataFrame.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         first_n_rows = df.head(n).to_dict(orient="records")
         for row in first_n_rows:
             pprint(row, indent=4)
             print()
     else:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to display. Please provide a DataFrame.")

     gc.collect()

+
 def last_n_rows(df: pd.DataFrame, n: int) -> None:
-    """
+    """
+    Display the last n rows of the DataFrame.
+
+    Prints the last `n` rows of a given DataFrame, formatted as dictionaries.
+    Useful for end-segment analysis and verifying data continuity.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame from which to display rows.
+    - n (int): The number of rows to display from the end of the DataFrame.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         last_n_rows = df.tail(n).to_dict(orient="records")
         for row in last_n_rows:
             pprint(row, indent=4)
             print()
     else:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to display. Please provide a DataFrame.")

     gc.collect()

+
 def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
-    """
+    """
+    Print the top `n` unique values for specified columns in the DataFrame.
+
+    This method calculates and prints the top `n` unique frequency values for
+    specified columns in a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame from which to calculate top unique
+      values.
+    - n (int): Number of top values to display.
+    - columns (List[str]): List of column names for which to display top
+      unique values.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         report = {}
         for column in columns:
             if column in df.columns:
                 frequency = df[column].astype(str).value_counts(dropna=False)
-                frequency = frequency.rename(
+                frequency = frequency.rename(
+                    index={
+                        'nan': 'NaN',
+                        'NaT': 'NaT',
+                        'None': 'None',
+                        '': 'Empty'})
                 top_n_values = frequency.nlargest(n)
-                report[column] = {str(value): str(count)
-
+                report[column] = {str(value): str(count)
+                                  for value, count in top_n_values.items()}
+                print(
+                    f"Top {n} unique values for column '{column}':\n{
+                        json.dumps(
+                            report[column],
+                            indent=2)}\n")
             else:
                 print(f"Column '{column}' does not exist in the DataFrame.")
     else:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to display. Please provide a DataFrame.")

     gc.collect()

-
-
+
+def bottom_n_unique_values(
+        df: pd.DataFrame,
+        n: int,
+        columns: List[str]) -> None:
+    """
+    Print the bottom `n` unique values for specified columns in the DataFrame.
+
+    This method calculates and prints the bottom `n` unique frequency values
+    for specified columns in a DataFrame.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame from which to calculate bottom unique
+      values.
+    - n (int): Number of bottom unique frequency values to display.
+    - columns (List[str]): List of column names for which to display bottom
+      unique values.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         report = {}
         for column in columns:
             if column in df.columns:
                 frequency = df[column].astype(str).value_counts(dropna=False)
-                frequency = frequency.rename(
+                frequency = frequency.rename(
+                    index={
+                        'nan': 'NaN',
+                        'NaT': 'NaT',
+                        'None': 'None',
+                        '': 'Empty'})
                 bottom_n_values = frequency.nsmallest(n)
-                report[column] = {
-
+                report[column] = {
+                    str(value): str(count) for value,
+                    count in bottom_n_values.items()}
+                print(
+                    f"Bottom {n} unique values for column '{column}':\n{
+                        json.dumps(
+                            report[column],
+                            indent=2)}\n")
             else:
                 print(f"Column '{column}' does not exist in the DataFrame.")
     else:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to display. Please provide a DataFrame.")

     gc.collect()

-
-
+
+def print_correlation(
+        df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
+    """
+    Print correlation for multiple pairs of columns in the DataFrame.
+
+    This function computes and displays the correlation coefficients for
+    specified pairs of columns.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame containing the columns to analyze.
+    - column_pairs (List[Tuple[str, str]]): List of column pairs for which to
+      compute correlations.
+    """
     if df is not None:
         for col1, col2 in column_pairs:
             if col1 in df.columns and col2 in df.columns:
@@ -606,30 +781,68 @@ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) ->

                     correlation = numeric_col1.corr(numeric_col2)
                     if pd.notnull(correlation):
-                        print(
+                        print(
+                            f"The correlation between '{col1}' and '{col2}'"
+                            + f" is {correlation}.")
                     else:
-                        print(
+                        print(
+                            f"Cannot calculate correlation between '{col1}'"
+                            + f" and '{col2}' due to insufficient numeric"
+                            + " data.")
                 except Exception as e:
-                    print(
+                    print(
+                        f"Error processing cols '{col1}' and '{col2}': {e}")
             else:
-                print(
+                print(
+                    f"One or both of the specified cols ('{col1}', '{col2}')"
+                    + " do not exist in the DataFrame.")
     else:
         print("The DataFrame is empty.")

     gc.collect()

+
 def print_memory_usage(df: pd.DataFrame) -> None:
-    """
+    """
+    Prints the memory usage of the DataFrame.
+
+    This function computes the memory footprint of a DataFrame in megabytes
+    and displays it, rounding to two decimal places for clarity.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame for which the memory usage is computed.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
-        memory_usage = df.memory_usage(deep=True).sum(
+        memory_usage = df.memory_usage(deep=True).sum(
+        ) / (1024 * 1024)  # Convert bytes to MB
         print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
     else:
         raise ValueError("No DataFrame to print. Please provide a DataFrame.")

     gc.collect()

+
 def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:
-    """
+    """
+    Return a filtered DataFrame according to the given expression.
+
+    This function filters rows of a DataFrame using a specified query
+    expression, returning a new DataFrame containing only the rows that
+    match the criteria.
+
+    Parameters:
+    - df (pd.DataFrame): The original DataFrame to be filtered.
+    - filter_expr (str): A query string to be evaluated against the DataFrame.
+
+    Returns:
+    - pd.DataFrame: A new DataFrame containing the filtered rows.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         try:
             filtered_df = df.query(filter_expr)
@@ -642,14 +855,34 @@ def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:

     return filtered_df

+
 def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
-    """
+    """
+    Filter and return DataFrame rows containing valid Indian mobile numbers.
+
+    This function processes a DataFrame to extract and retain rows where the
+    specified column matches the typical format for Indian mobile numbers.
+    An Indian mobile number is expected to be a digit-only string starting
+    with 6, 7, 8, or 9, and should have at least 4 distinct digits.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame to filter.
+    - mobile_col (str): The name of the column in the DataFrame that contains
+      mobile number data.
+
+    Returns:
+    - pd.DataFrame: A new DataFrame containing only rows with valid Indian
+      mobile numbers.
+
+    Raises:
+    - ValueError: If the DataFrame is `None`.
+    """
     if df is not None:
         filtered_df = df[
             df[mobile_col].apply(
                 lambda x: (
-                    str(x).isdigit() and
-                    str(x).startswith(('6', '7', '8', '9')) and
+                    str(x).isdigit() and
+                    str(x).startswith(('6', '7', '8', '9')) and
                     len(set(str(x))) >= 4
                 )
             )
@@ -661,17 +894,21 @@ def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:

     return filtered_df

+
 def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
     """
-    Print the DataFrame and its column types. If a source path is provided,
+    Print the DataFrame and its column types. If a source path is provided,
+    print it as well.

     Parameters:
         df: The DataFrame to print.
-        source: Optional; The source path of the DataFrame for logging
+        source: Optional; The source path of the DataFrame for logging
+            purposes.
     """
     if df is not None:
         print(df)
-        columns_with_types = [
+        columns_with_types = [
+            f"{col} ({df[col].dtypes})" for col in df.columns]
         print("Columns:", columns_with_types)
         if source:
             print(f"Source: {source}")
@@ -680,28 +917,43 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:

     gc.collect()

-
+
+def send_dataframe_via_telegram(
+        df: pd.DataFrame,
+        bot_name: str,
+        message: Optional[str] = None,
+        as_file: bool = True,
+        remove_after_send: bool = True) -> None:
     """
     Send a DataFrame via Telegram using a specified bot configuration.

     Parameters:
         df: The DataFrame to send.
-        bot_name: The name of the Telegram bot as specified in the
+        bot_name: The name of the Telegram bot as specified in the
+            configuration.
         message: Custom message to send along with the DataFrame or file.
-        as_file: Boolean flag to decide whether to send the DataFrame as a
+        as_file: Boolean flag to decide whether to send the DataFrame as a
+            file or as text.
         remove_after_send: If True, removes the file after sending.
     """

     def locate_config_file(filename: str = "rgwml.config") -> str:
         """Retrieve the configuration file path."""
         home_dir = os.path.expanduser("~")
-        search_paths = [
+        search_paths = [
+            os.path.join(
+                home_dir,
+                folder) for folder in [
+                "Desktop",
+                "Documents",
+                "Downloads"]]

         for path in search_paths:
             for root, _, files in os.walk(path):
                 if filename in files:
                     return os.path.join(root, filename)
-        raise FileNotFoundError(
+        raise FileNotFoundError(
+            f"{filename} not found in Desktop, Documents, or Downloads")

     def get_config(config_path: str) -> dict:
         """Load configuration from a json file."""
@@ -710,8 +962,14 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option

     config_path = locate_config_file()
     config = get_config(config_path)
+    bot_config = next(
+        (
+            bot for bot in config['telegram_bot_presets']
+            if bot['name'] == bot_name
+        ),
+        None
+    )

-    bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
     if not bot_config:
         raise ValueError(f"No bot found with the name {bot_name}")

@@ -724,9 +982,15 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
         df.to_csv(file_name, index=False)
         try:
             with open(file_name, 'rb') as file:
-                payload = {
+                payload = {
+                    'chat_id': bot_config['chat_id'],
+                    'caption': message or ''}
                 files = {'document': file}
-                response = requests.post(
+                response = requests.post(
+                    f"https://api.telegram.org/bot{
+                        bot_config['bot_token']}/sendDocument",
+                    data=payload,
+                    files=files)
             if remove_after_send and os.path.exists(file_name):
                 os.remove(file_name)
         except Exception as e:
@@ -734,14 +998,20 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
             raise
     else:
         df_str = df.to_string()
-        payload = {
-
+        payload = {
+            'chat_id': bot_config['chat_id'],
+            'text': message + "\n\n" + df_str if message else df_str,
+            'parse_mode': 'HTML'}
+        response = requests.post(
+            f"https://api.telegram.org/bot{bot_config['bot_token']}"
+            + "/sendMessage", data=payload)

     if not response.ok:
         raise Exception(f"Error sending message: {response.text}")

     print("Message sent successfully.")

+
 def send_data_to_email(
     df: pd.DataFrame,
     preset_name: str,
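For reference, the Telegram sender looks the bot up in the 'telegram_bot_presets' list of rgwml.config and reads its 'chat_id' and 'bot_token'. A minimal, illustrative sketch follows; the preset name, chat id, and token are placeholders, and importing the module as rgwfuncs.df_lib is an assumption:

    import json
    import pandas as pd
    from rgwfuncs import df_lib

    preset = {"name": "alerts_bot", "chat_id": "-1000000000000", "bot_token": "123456:ABC..."}
    with open("rgwml.config", "w") as f:
        json.dump({"telegram_bot_presets": [preset]}, f, indent=2)

    df = pd.DataFrame({"a": [1, 2, 3]})
    df_lib.send_dataframe_via_telegram(df, bot_name="alerts_bot",
                                       message="Daily extract", as_file=True)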
@@ -752,39 +1022,50 @@ def send_data_to_email(
     remove_after_send: bool = True
 ) -> None:
     """
-    Send an email with optional DataFrame attachment using Gmail API via a
+    Send an email with optional DataFrame attachment using Gmail API via a
+    specified preset.

     Parameters:
         df: The DataFrame to send.
-        preset_name: The configuration preset name to use for sending the
+        preset_name: The configuration preset name to use for sending the
+            email.
         to_email: The recipient email address.
         subject: Optional subject of the email.
         body: Optional message body of the email.
-        as_file: Boolean flag to decide whether to send the DataFrame as a
+        as_file: Boolean flag to decide whether to send the DataFrame as a
+            file.
        remove_after_send: If True, removes the CSV file after sending.
     """

     def locate_config_file(filename: str = "rgwml.config") -> str:
         """Locate config file in common user directories."""
         home_dir = os.path.expanduser("~")
-        search_paths = [
+        search_paths = [
+            os.path.join(
+                home_dir,
+                folder) for folder in [
+                "Desktop",
+                "Documents",
+                "Downloads"]]

         for path in search_paths:
             for root, _, files in os.walk(path):
                 if filename in files:
                     return os.path.join(root, filename)
-        raise FileNotFoundError(
+        raise FileNotFoundError(
+            f"{filename} not found in Desktop, Documents, or Downloads"
+            + " folders")

     def get_config(config_path: str) -> dict:
-        """Load configuration from a json file."""
         with open(config_path, 'r') as file:
             try:
                 return json.load(file)
             except json.JSONDecodeError as e:
                 raise ValueError(f"Invalid JSON format in config file: {e}")

-    def authenticate_service_account(
-
+    def authenticate_service_account(
+            service_account_credentials_path: str,
+            sender_email_id: str) -> Any:
         credentials = service_account.Credentials.from_service_account_file(
             service_account_credentials_path,
             scopes=['https://mail.google.com/'],
@@ -797,7 +1078,14 @@ def send_data_to_email(
     config = get_config(config_path)

     # Retrieve Gmail preset configuration
-    gmail_config = next(
+    gmail_config = next(
+        (
+            preset for preset in config['gmail_bot_presets']
+            if preset['name'] == preset_name
+        ),
+        None
+    )
+
     if not gmail_config:
         raise ValueError(f"No preset found with the name {preset_name}")

@@ -809,7 +1097,9 @@ def send_data_to_email(

     if as_file:
         # Create a temporary file for the DataFrame as CSV
-        with tempfile.NamedTemporaryFile(
+        with tempfile.NamedTemporaryFile(
+                delete=False, suffix=".csv"
+        ) as tmp_file:
             tmp_file_name = tmp_file.name
             df.to_csv(tmp_file_name, index=False)

@@ -819,13 +1109,18 @@ def send_data_to_email(
         message['to'] = to_email
         message['from'] = sender_email
         message['subject'] = subject if subject else 'DataFrame CSV File'
-        message.attach(
+        message.attach(
+            MIMEText(
+                body if body else 'Please find the CSV file attached.'))

         with open(tmp_file_name, 'rb') as file:
             part = MIMEBase('application', 'octet-stream')
             part.set_payload(file.read())
             encoders.encode_base64(part)
-            part.add_header(
+            part.add_header(
+                'Content-Disposition',
+                f'attachment; filename={
+                    os.path.basename(tmp_file_name)}')
             message.attach(part)

         if remove_after_send and os.path.exists(tmp_file_name):
@@ -847,11 +1142,13 @@ def send_data_to_email(
     try:
         raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
         email_body = {'raw': raw}
-        sent_message = service.users().messages().send(
+        sent_message = service.users().messages().send(
+            userId="me", body=email_body).execute()
         print(f"Email with Message Id {sent_message['id']} successfully sent.")
     except Exception as error:
         raise Exception(f"Error sending email: {error}")

+
 def send_data_to_slack(
     df: pd.DataFrame,
     bot_name: str,
@@ -866,20 +1163,29 @@ def send_data_to_slack(
         df: The DataFrame to send.
         bot_name: The Slack bot configuration preset name.
         message: Custom message to send along with the DataFrame or file.
-        as_file: Boolean flag to decide whether to send the DataFrame as a
+        as_file: Boolean flag to decide whether to send the DataFrame as a
+            file.
         remove_after_send: If True, removes the CSV file after sending.
     """

     def locate_config_file(filename: str = "rgwml.config") -> str:
         """Locate config file in common user directories."""
         home_dir = os.path.expanduser("~")
-        search_paths = [
+        search_paths = [
+            os.path.join(
+                home_dir,
+                folder) for folder in [
+                "Desktop",
+                "Documents",
+                "Downloads"]]

         for path in search_paths:
             for root, _, files in os.walk(path):
                 if filename in files:
                     return os.path.join(root, filename)
-        raise FileNotFoundError(
+        raise FileNotFoundError(
+            f"{filename} not found in Desktop, Documents, or Downloads"
+            + " folders")

     def get_config(config_path: str) -> dict:
         """Load configuration from a JSON file."""
@@ -890,7 +1196,14 @@ def send_data_to_slack(
     config_path = locate_config_file()
     config = get_config(config_path)

-    bot_config = next(
+    bot_config = next(
+        (
+            bot for bot in config['slack_bot_presets']
+            if bot['name'] == bot_name
+        ),
+        None
+    )
+
     if not bot_config:
         raise ValueError(f"No bot found with the name {bot_name}")

@@ -898,7 +1211,9 @@ def send_data_to_slack(

     if as_file:
         # Create a temporary file for the DataFrame as CSV
-        with tempfile.NamedTemporaryFile(
+        with tempfile.NamedTemporaryFile(
+                delete=False, suffix=".csv"
+        ) as tmp_file:
             file_name = tmp_file.name
             df.to_csv(file_name, index=False)

@@ -927,13 +1242,15 @@ def send_data_to_slack(

     print("Message sent successfully.")

+
 def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
     """
     Reorder the columns of the DataFrame based on a string input.

     Parameters:
         df: The DataFrame whose columns will be reordered.
-        column_order_str: A string specifying the desired order of columns,
+        column_order_str: A string specifying the desired order of columns,
+            using ',' to separate columns.

     Returns:
         A new DataFrame with reordered columns.
@@ -942,7 +1259,8 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
         ValueError: If a specified column does not exist in the DataFrame.
     """
     if df is None:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to reorder. Please provide a valid DataFrame.")

     columns = df.columns.tolist()
     parts = [part.strip() for part in column_order_str.split(',')]
@@ -972,10 +1290,11 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:

     return df[new_order]

+
 def append_ranged_classification_column(
-    df: pd.DataFrame,
-    ranges: str,
-    target_col: str,
+        df: pd.DataFrame,
+        ranges: str,
+        target_col: str,
     new_col_name: str
 ) -> pd.DataFrame:
     """
@@ -992,7 +1311,6 @@ def append_ranged_classification_column(
     """

     def pad_number(number, integer_length, decimal_length=0, decimal=False):
-        """Pad number to have a consistent length for integer and decimal parts."""
         if decimal:
             str_number = f"{number:.{decimal_length}f}"
             integer_part, decimal_part = str_number.split('.')
@@ -1006,25 +1324,70 @@ def append_ranged_classification_column(

     if has_decimals:
         range_list = [float(r) for r in range_list]
-
-
-
+
+        max_decimal_length = max(
+            len(str(r).split('.')[1])
+            for r in range_list
+            if '.' in str(r)
+        )
+
+        max_integer_length = max(
+            len(str(int(float(r))))
+            for r in range_list
+        )
+
+        labels = []
+
+        for i in range(len(range_list) - 1):
+            start = pad_number(
+                range_list[i],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True
+            )
+
+            end = pad_number(
+                range_list[i + 1],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True
+            )
+
+            label = f"{start} to {end}"
+            labels.append(label)
+
     else:
         range_list = [int(r) for r in range_list]
-
-
+
+        max_integer_length = max(
+            len(str(r))
+            for r in range_list
+        )
+
+        labels = [
+            f"{pad_number(range_list[i], max_integer_length)}"
+            f" to "
+            f"{pad_number(range_list[i + 1], max_integer_length)}"
+            for i in range(len(range_list) - 1)
+        ]

     # Ensure the target column is numeric
     df[target_col] = pd.to_numeric(df[target_col], errors='coerce')

-    df[new_col_name] = pd.cut(
+    df[new_col_name] = pd.cut(
+        df[target_col],
+        bins=range_list,
+        labels=labels,
+        right=False,
+        include_lowest=True)

     return df

+
 def append_percentile_classification_column(
-    df: pd.DataFrame,
-    percentiles: str,
-    target_col: str,
+        df: pd.DataFrame,
+        percentiles: str,
+        target_col: str,
     new_col_name: str
 ) -> pd.DataFrame:
     """
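For reference, a minimal usage sketch of the ranged classification helper (illustrative only; it assumes the module is importable as rgwfuncs.df_lib, and the column names are placeholders):

    import pandas as pd
    from rgwfuncs import df_lib

    df = pd.DataFrame({'score': [3, 12, 25, 47]})
    # Bin edges come from the comma-separated ranges string; the new column
    # holds zero-padded "start to end" labels built as in the diff above.
    df = df_lib.append_ranged_classification_column(df, '0,10,20,50', 'score', 'score_band')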
@@ -1032,7 +1395,8 @@ def append_percentile_classification_column(

     Parameters:
         df: The DataFrame to modify.
-        percentiles: A string representation of percentile values separated
+        percentiles: A string representation of percentile values separated
+            by commas.
         target_col: The column to analyze.
         new_col_name: The name of the new classification column.

@@ -1041,7 +1405,6 @@ def append_percentile_classification_column(
     """

     def pad_number(number, integer_length, decimal_length=0, decimal=False):
-        """Pad number to have a consistent length for integer and decimal parts."""
         if decimal:
             str_number = f"{number:.{decimal_length}f}"
             integer_part, decimal_part = str_number.split('.')
@@ -1055,26 +1418,78 @@ def append_percentile_classification_column(

     if has_decimals:
         percentiles_list = [float(p) for p in percentiles_list]
-
-
-
+
+        max_decimal_length = max(
+            len(str(p).split('.')[1])
+            for p in percentiles_list
+            if '.' in str(p)
+        )
+
+        max_integer_length = max(
+            len(str(int(float(p))))
+            for p in percentiles_list
+        )
+
+        labels = []
+
+        for i in range(len(percentiles_list) - 1):
+            start = pad_number(
+                percentiles_list[i],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True
+            )
+
+            end = pad_number(
+                percentiles_list[i + 1],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True
+            )
+
+            label = f"{start} to {end}"
+            labels.append(label)
     else:
         percentiles_list = [int(p) for p in percentiles_list]
-
-
+
+        max_integer_length = max(
+            len(str(p))
+            for p in percentiles_list
+        )
+
+        labels = []
+
+        for i in range(len(percentiles_list) - 1):
+            start = pad_number(
+                percentiles_list[i],
+                max_integer_length
+            )
+
+            end = pad_number(
+                percentiles_list[i + 1],
+                max_integer_length
+            )
+
+            label = f"{start} to {end}"
+            labels.append(label)

     # Ensure the target column is numeric
     df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
     quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]
-
-    df[new_col_name] = pd.cut(
+
+    df[new_col_name] = pd.cut(
+        df[target_col],
+        bins=quantiles,
+        labels=labels,
+        include_lowest=True)

     return df

+
 def append_ranged_date_classification_column(
-    df: pd.DataFrame,
-    date_ranges: str,
-    target_col: str,
+        df: pd.DataFrame,
+        date_ranges: str,
+        target_col: str,
     new_col_name: str
 ) -> pd.DataFrame:
     """
@@ -1082,7 +1497,8 @@ def append_ranged_date_classification_column(

     Parameters:
         df: The DataFrame to modify.
-        date_ranges: A string representation of date ranges separated by
+        date_ranges: A string representation of date ranges separated by
+            commas.
         target_col: The date column to analyze.
         new_col_name: The name of the new date classification column.

@@ -1091,41 +1507,61 @@ def append_ranged_date_classification_column(
     """

     date_list = [pd.to_datetime(date) for date in date_ranges.split(',')]
-    labels = [f"{date_list[i].strftime('%Y-%m-%d')} to {date_list[i + 1].strftime('%Y-%m-%d')}" for i in range(len(date_list) - 1)]

-
+    labels = []
+
+    for i in range(len(date_list) - 1):
+        start_date = date_list[i].strftime('%Y-%m-%d')
+        end_date = date_list[i + 1].strftime('%Y-%m-%d')
+        label = f"{start_date} to {end_date}"
+        labels.append(label)
+
+    df[new_col_name] = pd.cut(
+        pd.to_datetime(df[target_col]),
+        bins=date_list,
+        labels=labels,
+        right=False)

     return df

-
+
+def rename_columns(df: pd.DataFrame,
+                   rename_pairs: Dict[str,
+                                      str]) -> pd.DataFrame:
     """
     Rename columns in the DataFrame.

     Parameters:
         df: The DataFrame to modify.
-        rename_pairs: A dictionary mapping old column names to new column
+        rename_pairs: A dictionary mapping old column names to new column
+            names.

     Returns:
         A new DataFrame with columns renamed.
     """
     if df is None:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to rename columns. Please provide a valid"
+            + " DataFrame.")

     return df.rename(columns=rename_pairs)

+
 def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
     """
     Cascade sort the DataFrame by specified columns and order.

     Parameters:
         df: The DataFrame to sort.
-        columns: A list of column names with sorting order, e.g.,
+        columns: A list of column names with sorting order, e.g.,
+            ['Column1::ASC', 'Column2::DESC'].

     Returns:
         A new DataFrame sorted by specified columns.
     """
     if df is None:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to sort. Please provide a valid DataFrame.")

     col_names = []
     asc_order = []
@@ -1147,19 +1583,22 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:

     return df.sort_values(by=col_names, ascending=asc_order)

+
 def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
     """
     Append XGB training labels based on a ratio string.

     Parameters:
         df: The DataFrame to modify.
-        ratio_str: A string specifying the ratio of TRAIN:TEST or
+        ratio_str: A string specifying the ratio of TRAIN:TEST or
+            TRAIN:VALIDATE:TEST.

     Returns:
         A new DataFrame with XGB_TYPE labels appended.
     """
     if df is None:
-        raise ValueError(
+        raise ValueError(
+            "No DataFrame to add labels. Please provide a valid DataFrame.")

     ratios = list(map(int, ratio_str.split(':')))
     total_ratio = sum(ratios)
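For reference, a minimal cascade_sort usage sketch based on the 'Column::ORDER' convention documented in the docstring above (illustrative only; column names are placeholders and the rgwfuncs.df_lib import is assumed):

    import pandas as pd
    from rgwfuncs import df_lib

    df = pd.DataFrame({'city': ['B', 'A', 'A'], 'sales': [5, 9, 7]})
    # Sort by city ascending, then sales descending within each city.
    sorted_df = df_lib.cascade_sort(df, ['city::ASC', 'sales::DESC'])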
@@ -1173,25 +1612,30 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
         train_rows = (ratios[0] * total_rows) // total_ratio
         validate_rows = (ratios[1] * total_rows) // total_ratio
         test_rows = total_rows - train_rows - validate_rows
-        labels = ['TRAIN'] * train_rows + ['VALIDATE'] *
+        labels = ['TRAIN'] * train_rows + ['VALIDATE'] * \
+            validate_rows + ['TEST'] * test_rows
     else:
-        raise ValueError(
+        raise ValueError(
+            "Invalid ratio string format. Use 'TRAIN:TEST' or"
+            + "'TRAIN:VALIDATE:TEST'.")

     df_with_labels = df.copy()
     df_with_labels['XGB_TYPE'] = labels

     return df_with_labels

+
 def append_xgb_regression_predictions(
-    df: pd.DataFrame,
-    target_col: str,
-    feature_cols: str,
-    pred_col: str,
-    boosting_rounds: int = 100,
+        df: pd.DataFrame,
+        target_col: str,
+        feature_cols: str,
+        pred_col: str,
+        boosting_rounds: int = 100,
     model_path: Optional[str] = None
 ) -> pd.DataFrame:
     """
-    Append XGB regression predictions to DataFrame. Assumes data is labeled
+    Append XGB regression predictions to DataFrame. Assumes data is labeled
+    by an 'XGB_TYPE' column.

     Parameters:
         df: DataFrame to modify.
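For reference, a minimal sketch of labelling rows before training (illustrative only; the 70:20:10 split and column names are placeholders, and the rgwfuncs.df_lib import is assumed):

    import pandas as pd
    from rgwfuncs import df_lib

    df = pd.DataFrame({'feature': range(10), 'target': range(10)})
    # Rows are tagged TRAIN, VALIDATE, TEST in order, per the ratio string.
    labeled = df_lib.append_xgb_labels(df, '70:20:10')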
@@ -1205,7 +1649,8 @@ def append_xgb_regression_predictions(
|
|
1205
1649
|
DataFrame with predictions appended.
|
1206
1650
|
"""
|
1207
1651
|
if df is None or 'XGB_TYPE' not in df.columns:
|
1208
|
-
raise ValueError(
|
1652
|
+
raise ValueError(
|
1653
|
+
"DataFrame is not initialized or 'XGB_TYPE' column is missing.")
|
1209
1654
|
|
1210
1655
|
features = feature_cols.replace(' ', '').split(',')
|
1211
1656
|
|
@@ -1215,13 +1660,23 @@ def append_xgb_regression_predictions(
             df[col] = df[col].astype('category')
 
     train_data = df[df['XGB_TYPE'] == 'TRAIN']
-    validate_data = df[df['XGB_TYPE'] == 'VALIDATE'] if 'VALIDATE' in df['XGB_TYPE'].values else None
 
-    dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+    if 'VALIDATE' in df['XGB_TYPE'].values:
+        validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
+    else:
+        validate_data = None
+
+    dtrain = xgb.DMatrix(
+        train_data[features],
+        label=train_data[target_col],
+        enable_categorical=True)
     evals = [(dtrain, 'train')]
 
     if validate_data is not None:
-        dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+        dvalidate = xgb.DMatrix(
+            validate_data[features],
+            label=validate_data[target_col],
+            enable_categorical=True)
         evals.append((dvalidate, 'validate'))
 
     params = {
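The new branch in the hunk above only builds a validation DMatrix when a VALIDATE partition actually exists. A rough sketch of that pattern with toy data; the column names are hypothetical and xgboost must be installed:

import pandas as pd
import xgboost as xgb

df = pd.DataFrame({
    "feat": [1.0, 2.0, 3.0, 4.0],
    "target": [1.0, 2.0, 3.0, 4.0],
    "XGB_TYPE": ["TRAIN", "TRAIN", "TRAIN", "VALIDATE"],
})

train_data = df[df["XGB_TYPE"] == "TRAIN"]
if "VALIDATE" in df["XGB_TYPE"].values:
    validate_data = df[df["XGB_TYPE"] == "VALIDATE"]
else:
    validate_data = None

# One eval set is always present; the validation set is appended only when it exists.
dtrain = xgb.DMatrix(train_data[["feat"]], label=train_data["target"])
evals = [(dtrain, "train")]
if validate_data is not None:
    dvalidate = xgb.DMatrix(validate_data[["feat"]], label=validate_data["target"])
    evals.append((dvalidate, "validate"))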
@@ -1229,7 +1684,12 @@ def append_xgb_regression_predictions(
         'eval_metric': 'rmse'
     }
 
-    model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=boosting_rounds,
+        evals=evals,
+        early_stopping_rounds=10 if validate_data is not None else None)
 
     # Make predictions for all data
     dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1238,21 +1698,24 @@ def append_xgb_regression_predictions(
     if model_path:
         model.save_model(model_path)
 
-    columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+    columns_order = [col for col in df.columns if col not in [
+        'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
     df = df[columns_order]
 
     return df
 
+
 def append_xgb_logistic_regression_predictions(
-    df: pd.DataFrame,
-    target_col: str,
-    feature_cols: str,
-    pred_col: str,
-    boosting_rounds: int = 100,
+        df: pd.DataFrame,
+        target_col: str,
+        feature_cols: str,
+        pred_col: str,
+        boosting_rounds: int = 100,
         model_path: Optional[str] = None
 ) -> pd.DataFrame:
     """
-    Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
+    Append XGB logistic regression predictions to DataFrame. Assumes data is
+    labeled by an 'XGB_TYPE' column.
 
     Parameters:
         df: DataFrame to modify.
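The columns_order expression in the hunk above moves 'XGB_TYPE', the target column, and the prediction column to the end of the frame. A small pandas-only illustration with made-up column names:

import pandas as pd

# Hypothetical frame; 'y' stands in for the target and 'y_pred' for the prediction column.
df = pd.DataFrame({"y_pred": [0.1], "a": [1], "XGB_TYPE": ["TRAIN"], "y": [0.0], "b": [2]})
tail = ["XGB_TYPE", "y", "y_pred"]
columns_order = [c for c in df.columns if c not in tail] + tail
df = df[columns_order]
print(list(df.columns))   # ['a', 'b', 'XGB_TYPE', 'y', 'y_pred']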
@@ -1266,7 +1729,8 @@ def append_xgb_logistic_regression_predictions(
         DataFrame with predictions appended.
     """
     if df is None or 'XGB_TYPE' not in df.columns:
-        raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
+        raise ValueError(
+            "DataFrame is not initialized or 'XGB_TYPE' column is missing.")
 
     features = feature_cols.replace(' ', '').split(',')
 
@@ -1276,13 +1740,22 @@ def append_xgb_logistic_regression_predictions(
             df[col] = df[col].astype('category')
 
     train_data = df[df['XGB_TYPE'] == 'TRAIN']
-    validate_data = df[df['XGB_TYPE'] == 'VALIDATE'] if 'VALIDATE' in df['XGB_TYPE'].values else None
 
-    dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+    validate_data = None
+    if 'VALIDATE' in df['XGB_TYPE'].values:
+        validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
+
+    dtrain = xgb.DMatrix(
+        train_data[features],
+        label=train_data[target_col],
+        enable_categorical=True)
     evals = [(dtrain, 'train')]
 
     if validate_data is not None:
-        dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+        dvalidate = xgb.DMatrix(
+            validate_data[features],
+            label=validate_data[target_col],
+            enable_categorical=True)
         evals.append((dvalidate, 'validate'))
 
     params = {
@@ -1290,7 +1763,12 @@ def append_xgb_logistic_regression_predictions(
         'eval_metric': 'auc'
     }
 
-    model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=boosting_rounds,
+        evals=evals,
+        early_stopping_rounds=10 if validate_data is not None else None)
 
     # Make predictions for all data
     dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1299,15 +1777,17 @@ def append_xgb_logistic_regression_predictions(
     if model_path:
         model.save_model(model_path)
 
-    columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+    columns_order = [col for col in df.columns if col not in [
+        'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
     df = df[columns_order]
 
     return df
 
+
 def print_n_frequency_cascading(
-    df: pd.DataFrame,
-    n: int,
-    columns: str,
+        df: pd.DataFrame,
+        n: int,
+        columns: str,
         order_by: str = "FREQ_DESC"
 ) -> None:
     """
@@ -1332,7 +1812,12 @@ def print_n_frequency_cascading(
        # Convert the column to string representation
        df[current_col] = df[current_col].astype(str)
        frequency = df[current_col].value_counts(dropna=False)
-        frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+        frequency = frequency.rename(
+            index={
+                'nan': 'NaN',
+                'NaT': 'NaT',
+                'None': 'None',
+                '': 'Empty'})
 
        if limit is not None:
            frequency = frequency.nlargest(limit)
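The rename call in the hunk above only changes how missing and empty values are labelled in the printed report. A pandas-only sketch with made-up data:

import pandas as pd
import numpy as np

s = pd.Series(["a", "a", np.nan, ""]).astype(str)    # NaN becomes the string "nan"
freq = s.value_counts(dropna=False)
freq = freq.rename(index={"nan": "NaN", "NaT": "NaT", "None": "None", "": "Empty"})
print(freq.to_dict())                                # e.g. {'a': 2, 'NaN': 1, 'Empty': 1}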
@@ -1347,11 +1832,11 @@ def print_n_frequency_cascading(
            filtered_df = df[df[current_col] == value]
 
            if len(columns) > 1:
-                sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
+                sub_report = generate_cascade_report(
+                    filtered_df, columns[1:], limit, order_by)
                report[value] = {
-                    "count": str(count),
-                    f"sub_distribution({columns[1]})": sub_report if sub_report else {}
-                }
+                    "count": str(count), f"sub_distribution({
+                        columns[1]})": sub_report if sub_report else {}}
            else:
                report[value] = {
                    "count": str(count)
@@ -1363,19 +1848,28 @@ def print_n_frequency_cascading(
        if order_by == "ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[0]))
        elif order_by == "DESC":
-            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[0],
+                    reverse=True))
        elif order_by == "FREQ_ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[1]))
        else:  # Default to "FREQ_DESC"
-            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[1],
+                    reverse=True))
 
    report = generate_cascade_report(df, columns, n, order_by)
    print(json.dumps(report, indent=2))
 
+
 def print_n_frequency_linear(
-    df: pd.DataFrame,
-    n: int,
-    columns: str,
+        df: pd.DataFrame,
+        n: int,
+        columns: str,
        order_by: str = "FREQ_DESC"
 ) -> None:
    """
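A hedged usage sketch for the two frequency printers. It assumes the module is importable as rgwfuncs.df_lib and that columns is the comma-separated string used elsewhere in this file; both are assumptions, not confirmed by this diff:

import pandas as pd
from rgwfuncs.df_lib import print_n_frequency_cascading, print_n_frequency_linear  # assumed import path

df = pd.DataFrame({"city": ["NY", "NY", "LA"], "tier": ["A", "B", "A"]})

# Cascading: per-'city' counts, each with a nested 'tier' sub-distribution.
print_n_frequency_cascading(df, 5, "city,tier", order_by="FREQ_DESC")

# Linear: independent top-n counts for each listed column.
print_n_frequency_linear(df, 5, "city,tier", order_by="FREQ_DESC")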
@@ -1397,13 +1891,19 @@ def print_n_frequency_linear(
                continue
 
            frequency = df[current_col].astype(str).value_counts(dropna=False)
-            frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+            frequency = frequency.rename(
+                index={
+                    'nan': 'NaN',
+                    'NaT': 'NaT',
+                    'None': 'None',
+                    '': 'Empty'})
 
            if limit is not None:
                frequency = frequency.nlargest(limit)
 
            sorted_frequency = sort_frequency(frequency, order_by)
-            col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
+            col_report = {str(value): str(count)
+                          for value, count in sorted_frequency.items()}
            report[current_col] = col_report
 
        return report
@@ -1412,16 +1912,27 @@ def print_n_frequency_linear(
        if order_by == "ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[0]))
        elif order_by == "DESC":
-            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[0],
+                    reverse=True))
        elif order_by == "FREQ_ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[1]))
        else:  # Default to "FREQ_DESC"
-            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[1],
+                    reverse=True))
 
    report = generate_linear_report(df, columns, n, order_by)
    print(json.dumps(report, indent=2))
 
-def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
+
+def retain_columns(
+        df: pd.DataFrame,
+        columns_to_retain: List[str]) -> pd.DataFrame:
    """
    Retain specified columns in the DataFrame and drop the others.
 
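A possible call to retain_columns, assuming the rgwfuncs.df_lib import path and illustrative data:

import pandas as pd
from rgwfuncs.df_lib import retain_columns  # assumed import path

df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
slim = retain_columns(df, ["a", "c"])   # keeps only 'a' and 'c'; a non-list second argument raises ValueError
print(list(slim.columns))               # ['a', 'c']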
@@ -1436,9 +1947,10 @@ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFra
        raise ValueError("columns_to_retain should be a list of column names.")
    return df[columns_to_retain]
 
+
 def mask_against_dataframe(
-    df: pd.DataFrame,
-    other_df: pd.DataFrame,
+        df: pd.DataFrame,
+        other_df: pd.DataFrame,
        column_name: str
 ) -> pd.DataFrame:
    """
@@ -1456,9 +1968,10 @@ def mask_against_dataframe(
        raise ValueError("The specified column must exist in both DataFrames.")
    return df[df[column_name].isin(other_df[column_name])]
 
+
 def mask_against_dataframe_converse(
-    df: pd.DataFrame,
-    other_df: pd.DataFrame,
+        df: pd.DataFrame,
+        other_df: pd.DataFrame,
        column_name: str
 ) -> pd.DataFrame:
    """
@@ -1470,10 +1983,10 @@ def mask_against_dataframe_converse(
        column_name: The column name to use for comparison.
 
    Returns:
-        A new DataFrame with rows whose column values do not exist in 'other_df'.
+        A new DataFrame with rows whose column values do not exist in
+        'other_df'.
    """
    if column_name not in df.columns or column_name not in other_df.columns:
        raise ValueError("The specified column must exist in both DataFrames.")
-
-    return df[~df[column_name].isin(other_df[column_name])]
 
+    return df[~df[column_name].isin(other_df[column_name])]
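Both mask helpers reduce to pandas isin filtering on the shared column. A pandas-only sketch of the two behaviours with made-up data:

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3, 4]})
other_df = pd.DataFrame({"id": [2, 4]})

kept = df[df["id"].isin(other_df["id"])]        # mask_against_dataframe: rows with ids 2, 4
dropped = df[~df["id"].isin(other_df["id"])]    # mask_against_dataframe_converse: rows with ids 1, 3
print(kept["id"].tolist(), dropped["id"].tolist())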