rgwfuncs 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- rgwfuncs/__init__.py +2 -1
- rgwfuncs/df_lib.py +414 -158
- rgwfuncs/str_lib.py +62 -0
- {rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/METADATA +59 -20
- rgwfuncs-0.0.18.dist-info/RECORD +9 -0
- rgwfuncs-0.0.16.dist-info/RECORD +0 -8
- {rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/LICENSE +0 -0
- {rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/WHEEL +0 -0
- {rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/entry_points.txt +0 -0
- {rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/top_level.txt +0 -0
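The change that recurs throughout the df_lib.py diff below is configuration handling: 0.0.16 walked the Desktop, Documents, and Downloads folders looking for an rgwml.config file, while 0.0.18 reads a fixed ~/.rgwfuncsrc JSON file. The following is a hedged sketch of that file, inferred solely from the preset keys the diffed code reads; every value (and the db_type string) is a placeholder, not a documented default:

    # Hypothetical minimal ~/.rgwfuncsrc. Top-level keys mirror what
    # df_lib.py reads below: config['db_presets'],
    # config['telegram_bot_presets'], config['gmail_bot_presets'] and
    # config['slack_bot_presets']. All field values are placeholders.
    import json
    import os

    config = {
        "db_presets": [
            {"name": "my_db", "db_type": "mssql", "host": "db.example.com",
             "username": "user", "password": "secret", "database": "mydb"},
        ],
        "telegram_bot_presets": [
            {"name": "my_bot", "bot_token": "123:abc", "chat_id": "-1001234"},
        ],
    }

    with open(os.path.expanduser("~/.rgwfuncsrc"), "w") as f:
        json.dump(config, f, indent=2)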
    
rgwfuncs/__init__.py
CHANGED

@@ -1,4 +1,5 @@
 # This file is automatically generated
 # Dynamically importing functions from modules
 
-from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
+from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
+from .str_lib import send_telegram_message
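The functional change in this file is the two new public names. A minimal smoke test, assuming rgwfuncs 0.0.18 is installed (neither signature appears in this diff, so only importability is exercised):

    # Both names are new in 0.0.18; their definitions live in df_lib.py and
    # str_lib.py, which this diff does not show in full.
    from rgwfuncs import insert_dataframe_in_sqlite_database, send_telegram_message

    print(insert_dataframe_in_sqlite_database.__doc__)
    print(send_telegram_message.__doc__)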
    
rgwfuncs/df_lib.py
CHANGED
    
@@ -28,6 +28,7 @@ import warnings
 # Suppress all FutureWarnings
 warnings.filterwarnings("ignore", category=FutureWarning)
 
+
 def docs(method_type_filter: Optional[str] = None) -> None:
     """
     Print a list of function names in alphabetical order. If method_type_filter
@@ -65,7 +66,11 @@ def docs(method_type_filter: Optional[str] = None) -> None:
                 print(f"\n{name}:\n{docstring}")
 
 
-def numeric_clean(df: pd.DataFrame, column_names: str, column_type: str, irregular_value_treatment: str) -> pd.DataFrame:
+def numeric_clean(
+        df: pd.DataFrame,
+        column_names: str,
+        column_type: str,
+        irregular_value_treatment: str) -> pd.DataFrame:
     """
     Cleans the numeric columns based on specified treatments.
 
@@ -296,7 +301,9 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
     return df.drop_duplicates(keep='first')
 
 
-def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
+def drop_duplicates_retain_first(
+        df: pd.DataFrame,
+        columns: Optional[str] = None) -> pd.DataFrame:
     """
     Drop duplicate rows in the DataFrame based on specified columns, retaining the first occurrence.
 
@@ -318,7 +325,9 @@ def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None
     return df.drop_duplicates(subset=columns_list, keep='first')
 
 
-def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
+def drop_duplicates_retain_last(
+        df: pd.DataFrame,
+        columns: Optional[str] = None) -> pd.DataFrame:
     """
     Drop duplicate rows in the DataFrame based on specified columns, retaining the last occurrence.
 
@@ -335,20 +344,18 @@ def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None)
     if df is None:
         raise ValueError("DataFrame is not initialized.")
 
-    columns_list = [col.strip() for col in columns.split(',')] if columns else None
+    columns_list = [col.strip()
+                    for col in columns.split(',')] if columns else None
     return df.drop_duplicates(subset=columns_list, keep='last')
 
 
-def load_data_from_query(db_preset_name: str, query: str, config_file_name: str = 'rgwml.config') -> pd.DataFrame:
+def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
     """
-    Load data from a database query into a DataFrame based on a configuration
-    preset.
+    Load data from a database query into a DataFrame based on a configuration preset.
 
     Parameters:
         db_preset_name: The name of the database preset in the configuration file.
         query: The SQL query to execute.
-        config_file_name: Name of the configuration file
-        (default: 'rgwml.config').
 
     Returns:
         A DataFrame containing the query result.
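A hedged sketch of the new call shape shown above; the preset name and query are hypothetical, and the preset must exist under db_presets in ~/.rgwfuncsrc:

    # 0.0.16 also accepted a config_file_name argument (default
    # 'rgwml.config'); 0.0.18 drops it and always reads ~/.rgwfuncsrc.
    from rgwfuncs import load_data_from_query

    df = load_data_from_query(db_preset_name="my_db", query="SELECT 42 AS answer")
    print(df)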
@@ -358,17 +365,6 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
         ValueError: If the database preset or db_type is invalid.
     """
 
-    def locate_config_file(filename: str = config_file_name) -> str:
-        home_dir = os.path.expanduser("~")
-        search_paths = [os.path.join(home_dir, "Desktop"), os.path.join(home_dir, "Documents"), os.path.join(home_dir, "Downloads"),]
-
-        for path in search_paths:
-            for root, dirs, files in os.walk(path):
-                if filename in files:
-                    return os.path.join(root, filename)
-        raise FileNotFoundError(
-            f"{filename} not found in Desktop, Documents, or Downloads folders")
-
     def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
         server = db_preset['host']
         user = db_preset['username']
@@ -393,12 +389,13 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
         with conn.cursor() as cursor:
             cursor.execute(query)
             rows = cursor.fetchall()
-            columns = ([desc[0] for desc in cursor.description] if cursor.description else [])
+            columns = ([desc[0] for desc in cursor.description]
+                       if cursor.description else [])
 
         return pd.DataFrame(rows, columns=columns)
 
-    def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
-
+    def query_clickhouse(
+            db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
         host = db_preset['host']
         user = db_preset['username']
         password = db_preset['password']
@@ -409,7 +406,8 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
 
         for attempt in range(max_retries):
             try:
-                client = clickhouse_connect.get_client(host=host, port='8123', username=user, password=password, database=database)
+                client = clickhouse_connect.get_client(
+                    host=host, port='8123', username=user, password=password, database=database)
                 data = client.query(query)
                 rows = data.result_rows
                 columns = data.column_names
@@ -423,11 +421,13 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
                 raise ConnectionError(
                     "All attempts to connect to ClickHouse failed.")
 
-    def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
+    def query_google_big_query(
+            db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
         json_file_path = db_preset['json_file_path']
         project_id = db_preset['project_id']
 
-        credentials = service_account.Credentials.from_service_account_file(json_file_path)
+        credentials = service_account.Credentials.from_service_account_file(
+            json_file_path)
         client = bigquery.Client(credentials=credentials, project=project_id)
 
         query_job = client.query(query)
@@ -437,13 +437,15 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
 
         return pd.DataFrame(rows, columns=columns)
 
-    # …
-    config_path = locate_config_file()
+    # Assume the configuration file is located at ~/.rgwfuncsrc
+    config_path = os.path.expanduser('~/.rgwfuncsrc')
     with open(config_path, 'r') as f:
         config = json.load(f)
 
     db_presets = config.get('db_presets', [])
-    db_preset = next((preset for preset in db_presets if preset['name'] == db_preset_name), None)
+    db_preset = next(
+        (preset for preset in db_presets if preset['name'] == db_preset_name),
+        None)
     if not db_preset:
         raise ValueError(f"No matching db_preset found for {db_preset_name}")
 
@@ -621,10 +623,20 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
         for column in columns:
             if column in df.columns:
                 frequency = df[column].astype(str).value_counts(dropna=False)
-                frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+                frequency = frequency.rename(
+                    index={
+                        'nan': 'NaN',
+                        'NaT': 'NaT',
+                        'None': 'None',
+                        '': 'Empty'})
                 top_n_values = frequency.nlargest(n)
-                report[column] = {str(value): str(count) for value, count in top_n_values.items()}
-                print(f"Top {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
+                report[column] = {str(value): str(count)
+                                  for value, count in top_n_values.items()}
+                print(
+                    f"Top {n} unique values for column '{column}':\n{
+                        json.dumps(
+                            report[column],
+                            indent=2)}\n")
             else:
                 print(f"Column '{column}' does not exist in the DataFrame.")
     else:
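The rename call in this hunk is only reflowed, but its intent is easy to miss: casting a column to str turns missing values into the literal strings 'nan', 'NaT', and 'None', which are then relabelled for the printed report. A standalone pandas reproduction (plain pandas, not rgwfuncs code):

    import pandas as pd

    s = pd.Series(["a", "a", None, ""])
    freq = s.astype(str).value_counts(dropna=False)
    # After astype(str), None became the string 'None'; relabel it and ''.
    freq = freq.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
    print(freq.nlargest(2))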
@@ -634,7 +646,10 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
     gc.collect()
 
 
-def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
+def bottom_n_unique_values(
+        df: pd.DataFrame,
+        n: int,
+        columns: List[str]) -> None:
     """
     Print the bottom `n` unique values for specified columns in the DataFrame.
 
@@ -654,12 +669,21 @@ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
         for column in columns:
             if column in df.columns:
                 frequency = df[column].astype(str).value_counts(dropna=False)
-                frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+                frequency = frequency.rename(
+                    index={
+                        'nan': 'NaN',
+                        'NaT': 'NaT',
+                        'None': 'None',
+                        '': 'Empty'})
                 bottom_n_values = frequency.nsmallest(n)
                 report[column] = {
                     str(value): str(count) for value,
                     count in bottom_n_values.items()}
-                print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
+                print(
+                    f"Bottom {n} unique values for column '{column}':\n{
+                        json.dumps(
+                            report[column],
+                            indent=2)}\n")
             else:
                 print(f"Column '{column}' does not exist in the DataFrame.")
     else:
@@ -669,7 +693,8 @@ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
     gc.collect()
 
 
-def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
+def print_correlation(
+        df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
     """
     Print correlation for multiple pairs of columns in the DataFrame.
 
@@ -688,13 +713,16 @@ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) ->
 
                 correlation = numeric_col1.corr(numeric_col2)
                 if pd.notnull(correlation):
-                    print(f"The correlation between '{col1}' and '{col2}' is {correlation}.")
+                    print(
+                        f"The correlation between '{col1}' and '{col2}' is {correlation}.")
                 else:
-                    print(f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
+                    print(
+                        f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
             except Exception as e:
                 print(f"Error processing cols '{col1}' and '{col2}': {e}")
         else:
-            print(f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
+            print(
+                f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
     else:
         print("The DataFrame is empty.")
 
@@ -714,7 +742,8 @@ def print_memory_usage(df: pd.DataFrame) -> None:
     - ValueError: If the DataFrame is `None`.
     """
     if df is not None:
-        memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)  # Convert bytes to MB
+        memory_usage = df.memory_usage(deep=True).sum(
+        ) / (1024 * 1024)  # Convert bytes to MB
         print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
     else:
         raise ValueError("No DataFrame to print. Please provide a DataFrame.")
@@ -795,7 +824,8 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
     """
     if df is not None:
         print(df)
-        columns_with_types = [f"{col} ({df[col].dtypes})" for col in df.columns]
+        columns_with_types = [
+            f"{col} ({df[col].dtypes})" for col in df.columns]
         print("Columns:", columns_with_types)
         if source:
             print(f"Source: {source}")
@@ -811,48 +841,53 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
 
     Parameters:
         df: The DataFrame to send.
-        bot_name: The name of the Telegram bot as specified in the configuration.
-        message: Custom message to send along with the DataFrame or file.
-        as_file: Boolean flag to …
-        remove_after_send: If True, removes the file after sending.
-    """
+        bot_name: The name of the Telegram bot as specified in the configuration file.
+        message: Custom message to send along with the DataFrame or file. Defaults to None.
+        as_file: Boolean flag to indicate whether the DataFrame should be sent as a file (True) or as text (False). Defaults to True.
+        remove_after_send: If True, removes the CSV file after sending. Defaults to True.
 
-    def locate_config_file(filename: str = "rgwml.config") -> str:
-        …
-        home_dir = os.path.expanduser("~")
-        search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+    Raises:
+        ValueError: If the specified bot is not found or if no DataFrame is provided.
+        Exception: If the message sending fails.
 
-        for path in search_paths:
-            for root, dirs, files in os.walk(path):
-                if filename in files:
-                    return os.path.join(root, filename)
-        raise FileNotFoundError(
-            f"{filename} not found in Desktop, Documents, or Downloads")
+    Notes:
+        The configuration file is assumed to be located at `~/.rgwfuncsrc`.
+    """
 
     def get_config(config_path: str) -> dict:
-        """Load configuration from a …"""
+        """Load configuration from a JSON file."""
         with open(config_path, 'r') as file:
            return json.load(file)
 
-    config_path = locate_config_file()
+    # Assume the configuration file is located at ~/.rgwfuncsrc
+    config_path = os.path.expanduser('~/.rgwfuncsrc')
    config = get_config(config_path)
-    bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
 
+    bot_config = next(
+        (bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name),
+        None)
    if not bot_config:
        raise ValueError(f"No bot found with the name {bot_name}")
 
    if df is None:
        raise ValueError("No DataFrame to send. Please provide a DataFrame.")
 
+    response = None
    if as_file:
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        file_name = f"df_{timestamp}.csv"
        df.to_csv(file_name, index=False)
        try:
            with open(file_name, 'rb') as file:
-                payload = {'chat_id': bot_config['chat_id'], 'caption': message or ''}
+                payload = {
+                    'chat_id': bot_config['chat_id'],
+                    'caption': message or ''}
                files = {'document': file}
-                response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendDocument", data=payload, files=files)
+                response = requests.post(
+                    f"https://api.telegram.org/bot{
+                        bot_config['bot_token']}/sendDocument",
+                    data=payload,
+                    files=files)
            if remove_after_send and os.path.exists(file_name):
                os.remove(file_name)
        except Exception as e:
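A hedged usage sketch matching the parameters documented in this hunk; "my_bot" is a hypothetical preset that must exist under telegram_bot_presets in ~/.rgwfuncsrc:

    import pandas as pd

    from rgwfuncs import send_dataframe_via_telegram

    df = pd.DataFrame({"x": [1, 2, 3]})
    # as_file=True uploads a temporary CSV via sendDocument; with
    # remove_after_send=True the file is deleted after the upload.
    send_dataframe_via_telegram(df, bot_name="my_bot", message="daily export",
                                as_file=True, remove_after_send=True)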
@@ -862,40 +897,45 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
        df_str = df.to_string()
        payload = {
            'chat_id': bot_config['chat_id'],
-            'text': message + "\n\n" + df_str if message else df_str,
-            'parse_mode': 'HTML'}
-        response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
+            'text': (message + "\n\n" + df_str) if message else df_str,
+            'parse_mode': 'HTML'
+        }
+        response = requests.post(
+            f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
 
-    if not response.ok:
+    if response and not response.ok:
        raise Exception(f"Error sending message: {response.text}")
 
    print("Message sent successfully.")
 
 
-def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subject: Optional[str] = None, body: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
+def send_data_to_email(
+        df: pd.DataFrame,
+        preset_name: str,
+        to_email: str,
+        subject: Optional[str] = None,
+        body: Optional[str] = None,
+        as_file: bool = True,
+        remove_after_send: bool = True) -> None:
    """
-    Send an email with optional DataFrame attachment using Gmail API via a specified preset.
+    Send an email with an optional DataFrame attachment using the Gmail API via a specified preset.
 
    Parameters:
        df: The DataFrame to send.
        preset_name: The configuration preset name to use for sending the email.
        to_email: The recipient email address.
-        subject: Optional subject of the email.
-        body: Optional message body of the email.
-        as_file: Boolean flag to decide whether to send the DataFrame as a file.
-        remove_after_send: If True, removes the CSV file after sending.
-    """
+        subject: Optional subject of the email. Defaults to 'DataFrame CSV File' if not given.
+        body: Optional message body of the email. Defaults to 'Please find the CSV file attached.' if not given.
+        as_file: Boolean flag to decide whether to send the DataFrame as a file (True) or embed it in the email (False). Defaults to True.
+        remove_after_send: If True, removes the CSV file after sending. Defaults to True.
 
-    def locate_config_file(filename: str = "rgwml.config") -> str:
-        …
-        home_dir = os.path.expanduser("~")
-        search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+    Raises:
+        ValueError: If the preset is not found in the configuration.
+        Exception: If the email preparation or sending fails.
 
-        for path in search_paths:
-            for root, dirs, files in os.walk(path):
-                if filename in files:
-                    return os.path.join(root, filename)
-        raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
+    Notes:
+        The configuration file is assumed to be located at `~/.rgwfuncsrc`.
+    """
 
    def get_config(config_path: str) -> dict:
        with open(config_path, 'r') as file:
@@ -914,12 +954,14 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
        )
        return build('gmail', 'v1', credentials=credentials)
 
-    # Load configuration
-    config_path = locate_config_file()
+    # Load configuration from ~/.rgwfuncsrc
+    config_path = os.path.expanduser('~/.rgwfuncsrc')
    config = get_config(config_path)
 
    # Retrieve Gmail preset configuration
-    gmail_config = next((preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name), None)
+    gmail_config = next(
+        (preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name),
+        None)
 
    if not gmail_config:
        raise ValueError(f"No preset found with the name {preset_name}")
@@ -942,13 +984,18 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
        message['to'] = to_email
        message['from'] = sender_email
        message['subject'] = subject if subject else 'DataFrame CSV File'
-        message.attach(MIMEText(body if body else 'Please find the CSV file attached.'))
+        message.attach(
+            MIMEText(
+                body if body else 'Please find the CSV file attached.'))
 
        with open(tmp_file_name, 'rb') as file:
            part = MIMEBase('application', 'octet-stream')
            part.set_payload(file.read())
            encoders.encode_base64(part)
-            part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(tmp_file_name)}')
+            part.add_header(
+                'Content-Disposition',
+                f'attachment; filename={
+                    os.path.basename(tmp_file_name)}')
            message.attach(part)
 
        if remove_after_send and os.path.exists(tmp_file_name):
@@ -970,46 +1017,49 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
    try:
        raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
        email_body = {'raw': raw}
-        sent_message = service.users().messages().send(userId="me", body=email_body).execute()
+        sent_message = service.users().messages().send(
+            userId="me", body=email_body).execute()
        print(f"Email with Message Id {sent_message['id']} successfully sent.")
    except Exception as error:
        raise Exception(f"Error sending email: {error}")
 
 
-def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
+def send_data_to_slack(
+        df: pd.DataFrame,
+        bot_name: str,
+        message: Optional[str] = None,
+        as_file: bool = True,
+        remove_after_send: bool = True) -> None:
    """
    Send a DataFrame or message to Slack using a specified bot configuration.
 
    Parameters:
        df: The DataFrame to send.
        bot_name: The Slack bot configuration preset name.
-        message: Custom message to send along with the DataFrame or file.
-        as_file: Boolean flag to decide whether to send the DataFrame as a file.
-        remove_after_send: If True, removes the CSV file after sending.
-    """
+        message: Custom message to send along with the DataFrame or file. Defaults to None.
+        as_file: Boolean flag to decide whether to send the DataFrame as a file (True) or as text (False). Defaults to True.
+        remove_after_send: If True, removes the CSV file after sending. Defaults to True.
 
-    def locate_config_file(filename: str = "rgwml.config") -> str:
-        …
-        home_dir = os.path.expanduser("~")
-        search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+    Raises:
+        ValueError: If the specified bot is not found in the configuration.
+        Exception: If the message sending fails.
 
-        for path in search_paths:
-            for root, dirs, files in os.walk(path):
-                if filename in files:
-                    return os.path.join(root, filename)
-        raise FileNotFoundError(
-            f"{filename} not found in Desktop, Documents, or Downloads folders")
+    Notes:
+        The configuration file is assumed to be located at `~/.rgwfuncsrc`.
+    """
 
    def get_config(config_path: str) -> dict:
        """Load configuration from a JSON file."""
        with open(config_path, 'r') as file:
            return json.load(file)
 
-    # Load the Slack configuration
-    config_path = locate_config_file()
+    # Load the Slack configuration from ~/.rgwfuncsrc
+    config_path = os.path.expanduser('~/.rgwfuncsrc')
    config = get_config(config_path)
 
-    bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
+    bot_config = next(
+        (bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name),
+        None)
 
    if not bot_config:
        raise ValueError(f"No bot found with the name {bot_name}")
@@ -1024,13 +1074,22 @@ def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] =
 
        try:
            with open(file_name, 'rb') as file:
-                response = client.files_upload(channels=bot_config['channel_id'], file=file, filename=os.path.basename(file_name), title="DataFrame Upload", initial_comment=message or '')
+                response = client.files_upload(
+                    channels=bot_config['channel_id'],
+                    file=file,
+                    filename=os.path.basename(file_name),
+                    title="DataFrame Upload",
+                    initial_comment=message or ''
+                )
        finally:
            if remove_after_send and os.path.exists(file_name):
                os.remove(file_name)
    else:
        df_str = df.to_string()
-        response = client.chat_postMessage(channel=bot_config['channel_id'], text=message + "\n\n" + df_str if message else df_str)
+        response = client.chat_postMessage(
+            channel=bot_config['channel_id'],
+            text=(message + "\n\n" + df_str) if message else df_str
+        )
 
    # Check if the message was sent successfully
    if not response["ok"]:
@@ -1087,7 +1146,11 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
    return df[new_order]
 
 
-def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+def append_ranged_classification_column(
+        df: pd.DataFrame,
+        ranges: str,
+        target_col: str,
+        new_col_name: str) -> pd.DataFrame:
    """
    Append a ranged classification column to the DataFrame.
 
@@ -1155,16 +1218,27 @@ def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_co
            for r in range_list
        )
 
-        labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
+        labels = [f"{pad_number(range_list[i],
+                                max_integer_length)} to {pad_number(range_list[i + 1],
+                                                                    max_integer_length)}" for i in range(len(range_list) - 1)]
 
    # Ensure the target column is numeric
    df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
-    df[new_col_name] = pd.cut(df[target_col], bins=range_list, labels=labels, right=False, include_lowest=True)
+    df[new_col_name] = pd.cut(
+        df[target_col],
+        bins=range_list,
+        labels=labels,
+        right=False,
+        include_lowest=True)
 
    return df
 
 
-def append_percentile_classification_column(df: pd.DataFrame, percentiles: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+def append_percentile_classification_column(
+        df: pd.DataFrame,
+        percentiles: str,
+        target_col: str,
+        new_col_name: str) -> pd.DataFrame:
    """
    Append a percentile classification column to the DataFrame.
 
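The pd.cut call above is likewise only reflowed; the bucketing it performs is easier to see on data. A standalone pandas reproduction of left-closed ranged classification with padded labels (values illustrative):

    import pandas as pd

    range_list = [0, 10, 100]
    labels = [" 0 to  10", "10 to 100"]  # padded like pad_number() output
    s = pd.to_numeric(pd.Series([5, 10, 99]), errors="coerce")
    # right=False makes the bins left-closed: [0, 10) and [10, 100).
    print(pd.cut(s, bins=range_list, labels=labels, right=False, include_lowest=True))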
@@ -1192,14 +1266,21 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str,
 
    if has_decimals:
        percentiles_list = [float(p) for p in percentiles_list]
-        max_decimal_length = max(len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
-        max_integer_length = max(len(str(int(float(p)))) for p in percentiles_list)
+        max_decimal_length = max(
+            len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
+        max_integer_length = max(len(str(int(float(p))))
+                                 for p in percentiles_list)
 
        labels = []
 
        for i in range(len(percentiles_list) - 1):
-            start = pad_number(percentiles_list[i], max_integer_length, max_decimal_length, decimal=True)
-            end = pad_number(percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)
+            start = pad_number(
+                percentiles_list[i],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True)
+            end = pad_number(
+                percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)
 
            label = f"{start} to {end}"
            labels.append(label)
@@ -1222,12 +1303,20 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str,
    df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
    quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]
 
-    df[new_col_name] = pd.cut(df[target_col], bins=quantiles, labels=labels, include_lowest=True)
+    df[new_col_name] = pd.cut(
+        df[target_col],
+        bins=quantiles,
+        labels=labels,
+        include_lowest=True)
 
    return df
 
 
-def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+def append_ranged_date_classification_column(
+        df: pd.DataFrame,
+        date_ranges: str,
+        target_col: str,
+        new_col_name: str) -> pd.DataFrame:
    """
    Append a ranged date classification column to the DataFrame.
 
@@ -1260,7 +1349,9 @@ def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str,
    return df
 
 
-def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFrame:
+def rename_columns(df: pd.DataFrame,
+                   rename_pairs: Dict[str,
+                                      str]) -> pd.DataFrame:
    """
    Rename columns in the DataFrame.
 
@@ -1272,7 +1363,8 @@ def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFra
        A new DataFrame with columns renamed.
    """
    if df is None:
-        raise ValueError("No DataFrame to rename columns. Please provide a valid DataFrame.")
+        raise ValueError(
+            "No DataFrame to rename columns. Please provide a valid DataFrame.")
 
    return df.rename(columns=rename_pairs)
 
@@ -1290,7 +1382,8 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
        A new DataFrame sorted by specified columns.
    """
    if df is None:
-        raise ValueError("No DataFrame to sort. Please provide a valid DataFrame.")
+        raise ValueError(
+            "No DataFrame to sort. Please provide a valid DataFrame.")
 
    col_names = []
    asc_order = []
@@ -1325,7 +1418,8 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
        A new DataFrame with XGB_TYPE labels appended.
    """
    if df is None:
-        raise ValueError("No DataFrame to add labels. Please provide a valid DataFrame.")
+        raise ValueError(
+            "No DataFrame to add labels. Please provide a valid DataFrame.")
 
    ratios = list(map(int, ratio_str.split(':')))
    total_ratio = sum(ratios)
@@ -1342,7 +1436,8 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
        labels = ['TRAIN'] * train_rows + ['VALIDATE'] * \
            validate_rows + ['TEST'] * test_rows
    else:
-        raise ValueError("Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
+        raise ValueError(
+            "Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
 
    df_with_labels = df.copy()
    df_with_labels['XGB_TYPE'] = labels
@@ -1350,7 +1445,13 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
    return df_with_labels
 
 
-def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
+def append_xgb_regression_predictions(
+        df: pd.DataFrame,
+        target_col: str,
+        feature_cols: str,
+        pred_col: str,
+        boosting_rounds: int = 100,
+        model_path: Optional[str] = None) -> pd.DataFrame:
    """
    Append XGB regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
 
@@ -1366,7 +1467,8 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
        DataFrame with predictions appended.
    """
    if df is None or 'XGB_TYPE' not in df.columns:
-        raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
+        raise ValueError(
+            "DataFrame is not initialized or 'XGB_TYPE' column is missing.")
 
    features = feature_cols.replace(' ', '').split(',')
 
@@ -1382,16 +1484,27 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
    else:
        validate_data = None
 
-    dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+    dtrain = xgb.DMatrix(
+        train_data[features],
+        label=train_data[target_col],
+        enable_categorical=True)
    evals = [(dtrain, 'train')]
 
    if validate_data is not None:
-        dvalidate = xgb.DMatrix(
| 1494 | 
            +
                    dvalidate = xgb.DMatrix(
         | 
| 1495 | 
            +
                        validate_data[features],
         | 
| 1496 | 
            +
                        label=validate_data[target_col],
         | 
| 1497 | 
            +
                        enable_categorical=True)
         | 
| 1390 1498 | 
             
                    evals.append((dvalidate, 'validate'))
         | 
| 1391 1499 |  | 
| 1392 1500 | 
             
                params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse'}
         | 
| 1393 1501 |  | 
| 1394 | 
            -
                model = xgb.train( | 
| 1502 | 
            +
                model = xgb.train(
         | 
| 1503 | 
            +
                    params,
         | 
| 1504 | 
            +
                    dtrain,
         | 
| 1505 | 
            +
                    num_boost_round=boosting_rounds,
         | 
| 1506 | 
            +
                    evals=evals,
         | 
| 1507 | 
            +
                    early_stopping_rounds=10 if validate_data is not None else None)
         | 
| 1395 1508 |  | 
| 1396 1509 | 
             
                # Make predictions for all data
         | 
| 1397 1510 | 
             
                dall = xgb.DMatrix(df[features], enable_categorical=True)
         | 
| @@ -1400,13 +1513,20 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature | |
| 1400 1513 | 
             
                if model_path:
         | 
| 1401 1514 | 
             
                    model.save_model(model_path)
         | 
| 1402 1515 |  | 
| 1403 | 
            -
                columns_order = [col for col in df.columns if col not in [ | 
| 1516 | 
            +
                columns_order = [col for col in df.columns if col not in [
         | 
| 1517 | 
            +
                    'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
         | 
| 1404 1518 | 
             
                df = df[columns_order]
         | 
| 1405 1519 |  | 
| 1406 1520 | 
             
                return df
         | 
| 1407 1521 |  | 
| 1408 1522 |  | 
| 1409 | 
            -
            def append_xgb_logistic_regression_predictions( | 
| 1523 | 
            +
            def append_xgb_logistic_regression_predictions(
         | 
| 1524 | 
            +
                    df: pd.DataFrame,
         | 
| 1525 | 
            +
                    target_col: str,
         | 
| 1526 | 
            +
                    feature_cols: str,
         | 
| 1527 | 
            +
                    pred_col: str,
         | 
| 1528 | 
            +
                    boosting_rounds: int = 100,
         | 
| 1529 | 
            +
                    model_path: Optional[str] = None) -> pd.DataFrame:
         | 
| 1410 1530 | 
             
                """
         | 
| 1411 1531 | 
             
                Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
         | 
| 1412 1532 |  | 
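The reformatted signatures above are call-compatible with 0.0.16. A minimal sketch of how the labeling and prediction helpers compose; the DataFrame, column names, and split ratio here are hypothetical, not taken from the package:

    import pandas as pd
    from rgwfuncs import append_xgb_labels, append_xgb_regression_predictions

    # Hypothetical data; any numeric feature/target columns work
    df = pd.DataFrame({'sqft': [800.0, 950.0, 1200.0, 700.0],
                       'price': [100.0, 120.0, 150.0, 90.0]})

    # Adds an XGB_TYPE column, splitting rows TRAIN/VALIDATE/TEST in a 2:1:1 ratio
    df = append_xgb_labels(df, '2:1:1')

    # Trains on TRAIN rows (validating on VALIDATE) and appends predictions
    # for every row in a new 'price_pred' column
    df = append_xgb_regression_predictions(df, 'price', 'sqft', 'price_pred')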
@@ -1438,16 +1558,27 @@ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
     if 'VALIDATE' in df['XGB_TYPE'].values:
         validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
 
-    dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+    dtrain = xgb.DMatrix(
+        train_data[features],
+        label=train_data[target_col],
+        enable_categorical=True)
     evals = [(dtrain, 'train')]
 
     if validate_data is not None:
-        dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+        dvalidate = xgb.DMatrix(
+            validate_data[features],
+            label=validate_data[target_col],
+            enable_categorical=True)
         evals.append((dvalidate, 'validate'))
 
     params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}
 
-    model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=boosting_rounds,
+        evals=evals,
+        early_stopping_rounds=10 if validate_data is not None else None)
 
     # Make predictions for all data
     dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1456,13 +1587,18 @@ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
     if model_path:
         model.save_model(model_path)
 
-    columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+    columns_order = [col for col in df.columns if col not in [
+        'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
     df = df[columns_order]
 
     return df
 
 
-def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
+def print_n_frequency_cascading(
+        df: pd.DataFrame,
+        n: int,
+        columns: str,
+        order_by: str = "FREQ_DESC") -> None:
     """
     Print the cascading frequency of top n values for specified columns.
 
@@ -1485,7 +1621,12 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
         # Convert the column to string representation
         df[current_col] = df[current_col].astype(str)
         frequency = df[current_col].value_counts(dropna=False)
-        frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+        frequency = frequency.rename(
+            index={
+                'nan': 'NaN',
+                'NaT': 'NaT',
+                'None': 'None',
+                '': 'Empty'})
 
         if limit is not None:
             frequency = frequency.nlargest(limit)
@@ -1500,8 +1641,11 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
             filtered_df = df[df[current_col] == value]
 
         if len(columns) > 1:
-            sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
-            report[value] = {"count": str(count), f"sub_distribution({columns[1]})": sub_report if sub_report else {}}
+            sub_report = generate_cascade_report(
+                filtered_df, columns[1:], limit, order_by)
+            report[value] = {
+                "count": str(count), f"sub_distribution({
+                    columns[1]})": sub_report if sub_report else {}}
         else:
             report[value] = {"count": str(count)}
@@ -1511,17 +1655,29 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
         if order_by == "ASC":
             return dict(sorted(frequency.items(), key=lambda item: item[0]))
         elif order_by == "DESC":
-            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[0],
+                    reverse=True))
         elif order_by == "FREQ_ASC":
             return dict(sorted(frequency.items(), key=lambda item: item[1]))
         else:  # Default to "FREQ_DESC"
-            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[1],
+                    reverse=True))
 
     report = generate_cascade_report(df, columns, n, order_by)
     print(json.dumps(report, indent=2))
 
 
-def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
+def print_n_frequency_linear(
+        df: pd.DataFrame,
+        n: int,
+        columns: str,
+        order_by: str = "FREQ_DESC") -> None:
     """
     Print the linear frequency of top n values for specified columns.
 
@@ -1541,13 +1697,19 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
             continue
 
         frequency = df[current_col].astype(str).value_counts(dropna=False)
-        frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+        frequency = frequency.rename(
+            index={
+                'nan': 'NaN',
+                'NaT': 'NaT',
+                'None': 'None',
+                '': 'Empty'})
 
         if limit is not None:
             frequency = frequency.nlargest(limit)
 
         sorted_frequency = sort_frequency(frequency, order_by)
-        col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
+        col_report = {str(value): str(count)
+                      for value, count in sorted_frequency.items()}
         report[current_col] = col_report
 
     return report
@@ -1556,17 +1718,27 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
         if order_by == "ASC":
             return dict(sorted(frequency.items(), key=lambda item: item[0]))
         elif order_by == "DESC":
-            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[0],
+                    reverse=True))
         elif order_by == "FREQ_ASC":
             return dict(sorted(frequency.items(), key=lambda item: item[1]))
         else:  # Default to "FREQ_DESC"
-            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[1],
+                    reverse=True))
 
     report = generate_linear_report(df, columns, n, order_by)
     print(json.dumps(report, indent=2))
 
 
-def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
+def retain_columns(
+        df: pd.DataFrame,
+        columns_to_retain: List[str]) -> pd.DataFrame:
     """
     Retain specified columns in the DataFrame and drop the others.
 
@@ -1582,7 +1754,10 @@ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
     return df[columns_to_retain]
 
 
-def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+def mask_against_dataframe(
+        df: pd.DataFrame,
+        other_df: pd.DataFrame,
+        column_name: str) -> pd.DataFrame:
     """
     Retain only rows with common column values between two DataFrames.
 
@@ -1599,7 +1774,10 @@ def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
     return df[df[column_name].isin(other_df[column_name])]
 
 
-def mask_against_dataframe_converse(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+def mask_against_dataframe_converse(
+        df: pd.DataFrame,
+        other_df: pd.DataFrame,
+        column_name: str) -> pd.DataFrame:
     """
     Retain only rows with uncommon column values between two DataFrames.
 
@@ -1633,7 +1811,8 @@ def union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
         ValueError: If the DataFrames do not have the same columns.
     """
     if set(df1.columns) != set(df2.columns):
-        raise ValueError("Both DataFrames must have the same columns for a union join")
+        raise ValueError(
+            "Both DataFrames must have the same columns for a union join")
 
     result_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
     return result_df
@@ -1654,13 +1833,18 @@ def bag_union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
         ValueError: If the DataFrames do not have the same columns.
     """
     if set(df1.columns) != set(df2.columns):
-        raise ValueError("Both DataFrames must have the same columns for a bag union join")
+        raise ValueError(
+            "Both DataFrames must have the same columns for a bag union join")
 
     result_df = pd.concat([df1, df2], ignore_index=True)
     return result_df
 
 
-def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
+def left_join(
+        df1: pd.DataFrame,
+        df2: pd.DataFrame,
+        left_on: str,
+        right_on: str) -> pd.DataFrame:
     """
     Perform a left join on two DataFrames.
 
@@ -1676,7 +1860,11 @@ def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
     return df1.merge(df2, how='left', left_on=left_on, right_on=right_on)
 
 
-def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
+def right_join(
+        df1: pd.DataFrame,
+        df2: pd.DataFrame,
+        left_on: str,
+        right_on: str) -> pd.DataFrame:
     """
     Perform a right join on two DataFrames.
 
@@ -1692,7 +1880,72 @@ def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
     return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)
 
 
-def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
+def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
+    """
+    Inserts a Pandas DataFrame into a SQLite database table.
+
+    Parameters:
+        db_path: str
+            The file path to the SQLite database. If the database does not exist,
+            it will be created.
+
+        tablename: str
+            The name of the table where the data will be inserted. If the table does
+            not exist, it will be created based on the DataFrame's columns and types.
+
+        df: pd.DataFrame
+            The DataFrame containing the data to be inserted into the database.
+
+    Functionality:
+        - Checks if the specified table exists in the database.
+        - Creates the table with appropriate column types if it doesn't exist.
+        - Inserts the DataFrame's data into the table, appending to any existing data.
+
+    Data Type Mapping:
+        - Converts Pandas data types to SQLite types: 'int64' to 'INTEGER',
+          'float64' to 'REAL', 'object' to 'TEXT', 'datetime64[ns]' to 'TEXT',
+          and 'bool' to 'INTEGER'.
+
+    Returns:
+        None
+    """
+
+    def table_exists(cursor, table_name):
+        cursor.execute(
+            f"SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{table_name}'")
+        return cursor.fetchone()[0] == 1
+
+    dtype_mapping = {
+        'int64': 'INTEGER',
+        'float64': 'REAL',
+        'object': 'TEXT',
+        'datetime64[ns]': 'TEXT',
+        'bool': 'INTEGER',
+    }
+
+    def map_dtype(dtype):
+        return dtype_mapping.get(str(dtype), 'TEXT')
+
+    with sqlite3.connect(db_path) as conn:
+        cursor = conn.cursor()
+
+        if not table_exists(cursor, tablename):
+            columns_with_types = ', '.join(
+                f'"{col}" {
+                    map_dtype(dtype)}' for col,
+                dtype in zip(
+                    df.columns,
+                    df.dtypes))
+            create_table_query = f'CREATE TABLE "{tablename}" ({columns_with_types})'
+            conn.execute(create_table_query)
+
+        df.to_sql(tablename, conn, if_exists='append', index=False)
+
+
+def sync_dataframe_to_sqlite_database(
+        db_path: str,
+        tablename: str,
+        df: pd.DataFrame) -> None:
     """
     Processes and saves a DataFrame to an SQLite database, adding a timestamp column
     and replacing the existing table if needed. Creates the table if it does not exist.
@@ -1702,6 +1955,10 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
     - tablename (str): The name of the table in the database.
     - df (pd.DataFrame): The DataFrame to be processed and saved.
     """
+    # Helper function to map pandas dtype to SQLite type
+    def map_dtype(dtype):
+        return dtype_mapping.get(str(dtype), 'TEXT')
+
     # Step 1: Add a timestamp column to the dataframe
     df['rgwfuncs_sync_timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
@@ -1714,10 +1971,6 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
         'bool': 'INTEGER',  # SQLite does not have a separate Boolean storage class
     }
 
-    # Helper function to map pandas dtype to SQLite type
-    def map_dtype(dtype):
-        return dtype_mapping.get(str(dtype), 'TEXT')
-
     # Step 2: Save df in SQLite3 db as '{tablename}_new'
     with sqlite3.connect(db_path) as conn:
         new_table_name = f"{tablename}_new"
@@ -1728,8 +1981,11 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
         if cursor.fetchall() == []:  # Table does not exist
             # Create a table using the DataFrame's column names and types
             columns_with_types = ', '.join(
-                f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(
-                    df.columns, df.dtypes))
+                f'"{col}" {
+                    map_dtype(dtype)}' for col,
+                dtype in zip(
+                    df.columns,
+                    df.dtypes))
             create_table_query = f'CREATE TABLE "{new_table_name}" ({columns_with_types})'
             conn.execute(create_table_query)
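The two SQLite writers above differ in write semantics: insert_dataframe_in_sqlite_database appends rows via df.to_sql(..., if_exists='append'), creating the table on first use, while sync_dataframe_to_sqlite_database adds a rgwfuncs_sync_timestamp column and replaces the table. A minimal sketch; the database path and table names are hypothetical:

    import pandas as pd
    from rgwfuncs import (insert_dataframe_in_sqlite_database,
                          sync_dataframe_to_sqlite_database)

    df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})

    # Appends: repeated calls accumulate rows in 'events'
    insert_dataframe_in_sqlite_database('demo.db', 'events', df)

    # Replaces: 'snapshot' holds only the latest df, plus the timestamp column
    sync_dataframe_to_sqlite_database('demo.db', 'snapshot', df)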
    
rgwfuncs/str_lib.py ADDED
@@ -0,0 +1,62 @@
+import os
+import json
+import requests
+from typing import Tuple
+
+def send_telegram_message(preset_name: str, message: str) -> None:
+    """Send a Telegram message using the specified preset.
+
+    Args:
+        preset_name (str): The name of the preset to use for sending the message.
+        message (str): The message to send.
+
+    Raises:
+        RuntimeError: If the preset is not found or necessary details are missing.
+    """
+
+    # Set the config path to ~/.rgwfuncsrc
+    config_path = os.path.expanduser("~/.rgwfuncsrc")
+
+    def load_config() -> dict:
+        """Load the configuration from the .rgwfuncsrc file."""
+        with open(config_path, 'r') as file:
+            return json.load(file)
+
+    def get_telegram_preset(config: dict, preset_name: str) -> dict:
+        """Get the Telegram preset configuration."""
+        presets = config.get("telegram_bot_presets", [])
+        for preset in presets:
+            if preset.get("name") == preset_name:
+                return preset
+        return None
+
+    def get_telegram_bot_details(config: dict, preset_name: str) -> Tuple[str, str]:
+        """Retrieve the Telegram bot token and chat ID from the preset."""
+        preset = get_telegram_preset(config, preset_name)
+        if not preset:
+            raise RuntimeError(f"Telegram bot preset '{preset_name}' not found in the configuration file")
+
+        bot_token = preset.get("bot_token")
+        chat_id = preset.get("chat_id")
+
+        if not bot_token or not chat_id:
+            raise RuntimeError(
+                f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file"
+            )
+
+        return bot_token, chat_id
+
+    # Load the configuration
+    config = load_config()
+
+    # Get bot details from the configuration
+    bot_token, chat_id = get_telegram_bot_details(config, preset_name)
+
+    # Prepare the request
+    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
+    payload = {"chat_id": chat_id, "text": message}
+
+    # Send the message
+    response = requests.post(url, json=payload)
+    response.raise_for_status()
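A usage sketch for the new helper; the preset name is hypothetical and must match an entry under telegram_bot_presets in ~/.rgwfuncsrc (see the config sketch under the METADATA section below):

    from rgwfuncs import send_telegram_message

    send_telegram_message('my_bot', 'Pipeline finished')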
{rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rgwfuncs
-Version: 0.0.16
+Version: 0.0.18
 Summary: A functional programming paradigm for mathematical modelling and data science
 Home-page: https://github.com/ryangerardwilson/rgwfunc
 Author: Ryan Gerard Wilson
@@ -40,9 +40,9 @@ Install the package using:
 
 --------------------------------------------------------------------------------
 
-## Create a `rgwml.config` File
+## Create a `.rgwfuncsrc` File
 
-A `rgwml.config` file is required for MSSQL, CLICKHOUSE, MYSQL, GOOGLE BIG QUERY, SLACK, TELEGRAM, and GMAIL integrations.
+A `.rgwfuncsrc` file (located at `~/.rgwfuncsrc`) is required for MSSQL, CLICKHOUSE, MYSQL, GOOGLE BIG QUERY, SLACK, TELEGRAM, and GMAIL integrations.
 
     {
       "db_presets" : [
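For reference, a minimal ~/.rgwfuncsrc sketch: the telegram_bot_presets shape follows str_lib.py above, the values are placeholders, and the keys of individual db_presets entries are not shown in this excerpt:

    {
      "db_presets": [],
      "telegram_bot_presets": [
        {
          "name": "my_bot",
          "bot_token": "123456:ABC-PLACEHOLDER",
          "chat_id": "-1001234567890"
        }
      ]
    }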
@@ -381,28 +381,30 @@ Drop duplicate rows based on specified columns, retaining the last occurrence.
 --------------------------------------------------------------------------------
 
 ### 12. `load_data_from_query`
+
 Load data from a database query into a DataFrame based on a configuration preset.
 
-• Parameters:
-  - `db_preset_name` (str): Name of the database preset in the rgwml.config file.
-  - query (str): The SQL query to execute.
-  - `config_file_name` (str): Name of the configuration file (default: "rgwml.config").
+- **Parameters:**
+  - `db_preset_name` (str): Name of the database preset in the configuration file.
+  - `query` (str): The SQL query to execute.
 
-• Returns:
-  - pd.DataFrame
+- **Returns:**
+  - `pd.DataFrame`: A DataFrame containing the query result.
 
-• Example:
-
-    from rgwfuncs import load_data_from_query
+- **Notes:**
+  - The configuration file is assumed to be located at `~/.rgwfuncsrc`.
 
+- **Example:**
+
+  from rgwfuncs import load_data_from_query
+
+  df = load_data_from_query(
+      db_preset_name="MyDBPreset",
+      query="SELECT * FROM my_table"
+  )
+  print(df)
 
+
 --------------------------------------------------------------------------------
 
 ### 13. `load_data_from_path`
@@ -1148,10 +1150,47 @@ Perform a right join on two DataFrames.
     df_right_join = right_join(df1, df2, 'ID', 'ID')
     print(df_right_join)
 
+--------------------------------------------------------------------------------
+
+### 45. `insert_dataframe_in_sqlite_database`
+
+Inserts a Pandas DataFrame into a SQLite database table. If the specified table does not exist, it will be created with column types automatically inferred from the DataFrame's data types.
+
+- **Parameters:**
+  - `db_path` (str): The path to the SQLite database file. If the database does not exist, it will be created.
+  - `tablename` (str): The name of the table in the database. If the table does not exist, it is created with the DataFrame's columns and data types.
+  - `df` (pd.DataFrame): The DataFrame containing the data to be inserted into the database table.
+
+- **Returns:**
+  - `None`
+
+- **Notes:**
+  - Data types in the DataFrame are converted to SQLite-compatible types:
+    - `int64` is mapped to `INTEGER`
+    - `float64` is mapped to `REAL`
+    - `object` is mapped to `TEXT`
+    - `datetime64[ns]` is mapped to `TEXT` (dates are stored as text)
+    - `bool` is mapped to `INTEGER` (SQLite does not have a separate Boolean type)
+
+- **Example:**
+
+    from rgwfuncs import insert_dataframe_in_sqlite_database
+    import pandas as pd
+
+    df = pd.DataFrame({
+        'ID': [1, 2, 3],
+        'Name': ['Alice', 'Bob', 'Charlie'],
+        'Score': [88.5, 92.3, 85.0]
+    })
+
+    db_path = 'my_database.db'
+    tablename = 'students'
+
+    insert_dataframe_in_sqlite_database(db_path, tablename, df)
 
 --------------------------------------------------------------------------------
 
-### 45. `sync_dataframe_to_sqlite_database`
+### 46. `sync_dataframe_to_sqlite_database`
 Processes and saves a DataFrame to an SQLite database, adding a timestamp column and replacing the existing table if needed. Creates the table if it does not exist.
 
 • Parameters:
rgwfuncs-0.0.18.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+rgwfuncs/__init__.py,sha256=XqJ8TJuc4HkQq3T5Gzjf3KTBsdJtyi2NKXBgbPuDn0Y,1156
+rgwfuncs/df_lib.py,sha256=rY1yVvY04uqR174JwYBFiRnujekr9mbe258wmu9OeeY,67148
+rgwfuncs/str_lib.py,sha256=6v9AXZ5wWsWVEcvcIz0B1rTmsvYaD-v53r2sYPcV4pU,2109
+rgwfuncs-0.0.18.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
+rgwfuncs-0.0.18.dist-info/METADATA,sha256=GfMK-J1vH4CG_fQqQAWwAvDE6JcSqNrKuNKvfOUKV_E,33442
+rgwfuncs-0.0.18.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+rgwfuncs-0.0.18.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
+rgwfuncs-0.0.18.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
+rgwfuncs-0.0.18.dist-info/RECORD,,
rgwfuncs-0.0.16.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-rgwfuncs/__init__.py,sha256=BP8Nh8ivyCCz8Ga-21JW3NWInJFOElKoIfRuioJRWbA,1076
-rgwfuncs/df_lib.py,sha256=OZPI7M35mbue6YsieWmlzjM5RUkaow0v0d3P-V71L6o,63034
-rgwfuncs-0.0.16.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
-rgwfuncs-0.0.16.dist-info/METADATA,sha256=oKTScVPzrgTTWdCQ7vxEdKYRnc-S_90hKwefifayeDU,32059
-rgwfuncs-0.0.16.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-rgwfuncs-0.0.16.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
-rgwfuncs-0.0.16.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
-rgwfuncs-0.0.16.dist-info/RECORD,,
{rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/LICENSE: File without changes
{rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/WHEEL: File without changes
{rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/entry_points.txt: File without changes
{rgwfuncs-0.0.16.dist-info → rgwfuncs-0.0.18.dist-info}/top_level.txt: File without changes