rgwfuncs 0.0.4__py3-none-any.whl → 0.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rgwfuncs/df_lib.py CHANGED
@@ -27,14 +27,10 @@ from typing import Optional, Callable, Dict, List, Tuple, Any
27
27
 
28
28
  def docs(method_type_filter: Optional[str] = None) -> None:
29
29
  """
30
- Print a list of function names in alphabetical order. If
31
- method_type_filter is specified, print the docstrings of the functions
32
- that match the filter. Using '*' as a filter will print the docstrings for
33
- all functions.
30
+ Print a list of function names in alphabetical order. If method_type_filter is specified, print the docstrings of the functions that match the filter. Using '*' as a filter will print the docstrings for all functions.
34
31
 
35
32
  Parameters:
36
- method_type_filter: Optional filter string, comma-separated to select
37
- docstring types, or '*' for all.
33
+ method_type_filter: Optional filter string, comma-separated to select docstring types, or '*' for all.
38
34
  """
39
35
  # Get the current module's namespace
40
36
  current_module = __name__
@@ -75,23 +71,15 @@ def docs(method_type_filter: Optional[str] = None) -> None:
75
71
  print(f"\n{name}:\n{docstring}")
76
72
 
77
73
 
78
- def numeric_clean(
79
- df: pd.DataFrame,
80
- column_names: str,
81
- column_type: str,
82
- irregular_value_treatment: str
83
- ) -> pd.DataFrame:
74
+ def numeric_clean(df: pd.DataFrame, column_names: str, column_type: str, irregular_value_treatment: str) -> pd.DataFrame:
84
75
  """
85
76
  Cleans the numeric columns based on specified treatments.
86
77
 
87
78
  Parameters:
88
79
  df: The DataFrame to clean.
89
- column_names: A comma-separated string containing the names of the
90
- columns to clean.
91
- column_type: The type to convert the column to ('INTEGER' or
92
- 'FLOAT').
93
- irregular_value_treatment: How to treat irregular values ('NAN',
94
- 'TO_ZERO', 'MEAN').
80
+ column_names: A comma-separated string containing the names of the columns to clean.
81
+ column_type: The type to convert the column to ('INTEGER' or 'FLOAT').
82
+ irregular_value_treatment: How to treat irregular values ('NAN', 'TO_ZERO', 'MEAN').
95
83
 
96
84
  Returns:
97
85
  A new DataFrame with cleaned numeric columns.
@@ -183,8 +171,7 @@ def append_rows(df: pd.DataFrame, rows: List[List]) -> pd.DataFrame:
183
171
 
184
172
  Parameters:
185
173
  df: The original DataFrame.
186
- rows: A list of lists, where each inner list represents a row to be
187
- appended.
174
+ rows: A list of lists, where each inner list represents a row to be appended.
188
175
 
189
176
  Returns:
190
177
  A new DataFrame with the appended rows.
@@ -244,8 +231,7 @@ def update_rows(
244
231
  Parameters:
245
232
  df: The original DataFrame.
246
233
  condition: A query condition to identify rows for updating.
247
- updates: A dictionary with column names as keys and new values as
248
- values.
234
+ updates: A dictionary with column names as keys and new values as values.
249
235
 
250
236
  Returns:
251
237
  A new DataFrame with the updated rows.
@@ -316,17 +302,13 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
316
302
  return df.drop_duplicates(keep='first')
317
303
 
318
304
 
319
- def drop_duplicates_retain_first(
320
- df: pd.DataFrame,
321
- columns: Optional[str] = None) -> pd.DataFrame:
305
+ def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
322
306
  """
323
- Drop duplicate rows in the DataFrame based on specified columns, retaining
324
- the first occurrence.
307
+ Drop duplicate rows in the DataFrame based on specified columns, retaining the first occurrence.
325
308
 
326
309
  Parameters:
327
310
  df: The DataFrame from which duplicates will be dropped.
328
- columns: A comma-separated string with the column names used to
329
- identify duplicates.
311
+ columns: A comma-separated string with the column names used to identify duplicates.
330
312
 
331
313
  Returns:
332
314
  A new DataFrame with duplicates removed.
@@ -342,17 +324,13 @@ def drop_duplicates_retain_first(
342
324
  return df.drop_duplicates(subset=columns_list, keep='first')
343
325
 
344
326
 
345
- def drop_duplicates_retain_last(
346
- df: pd.DataFrame,
347
- columns: Optional[str] = None) -> pd.DataFrame:
327
+ def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
348
328
  """
349
- Drop duplicate rows in the DataFrame based on specified columns, retaining
350
- the last occurrence.
329
+ Drop duplicate rows in the DataFrame based on specified columns, retaining the last occurrence.
351
330
 
352
331
  Parameters:
353
332
  df: The DataFrame from which duplicates will be dropped.
354
- columns: A comma-separated string with the column names used to
355
- identify duplicates.
333
+ columns: A comma-separated string with the column names used to identify duplicates.
356
334
 
357
335
  Returns:
358
336
  A new DataFrame with duplicates removed.
@@ -363,22 +341,17 @@ def drop_duplicates_retain_last(
363
341
  if df is None:
364
342
  raise ValueError("DataFrame is not initialized.")
365
343
 
366
- columns_list = [col.strip()
367
- for col in columns.split(',')] if columns else None
344
+ columns_list = [col.strip() for col in columns.split(',')] if columns else None
368
345
  return df.drop_duplicates(subset=columns_list, keep='last')
369
346
 
370
347
 
371
- def load_data_from_query(
372
- db_preset_name: str,
373
- query: str,
374
- config_file_name: str = "rgwml.config") -> pd.DataFrame:
348
+ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str = "rgwml.config") -> pd.DataFrame:
375
349
  """
376
350
  Load data from a database query into a DataFrame based on a configuration
377
351
  preset.
378
352
 
379
353
  Parameters:
380
- db_preset_name: The name of the database preset in the configuration
381
- file.
354
+ db_preset_name: The name of the database preset in the configuration file.
382
355
  query: The SQL query to execute.
383
356
  config_file_name: Name of the configuration file
384
357
  (default: 'rgwml.config').
@@ -393,19 +366,14 @@ def load_data_from_query(
393
366
 
394
367
  def locate_config_file(filename: str = config_file_name) -> str:
395
368
  home_dir = os.path.expanduser("~")
396
- search_paths = [
397
- os.path.join(home_dir, "Desktop"),
398
- os.path.join(home_dir, "Documents"),
399
- os.path.join(home_dir, "Downloads"),
400
- ]
369
+ search_paths = [os.path.join(home_dir, "Desktop"), os.path.join(home_dir, "Documents"), os.path.join(home_dir, "Downloads"),]
401
370
 
402
371
  for path in search_paths:
403
372
  for root, dirs, files in os.walk(path):
404
373
  if filename in files:
405
374
  return os.path.join(root, filename)
406
375
  raise FileNotFoundError(
407
- f"{filename} not found in Desktop, Documents, or Downloads"
408
- + "folders")
376
+ f"{filename} not found in Desktop, Documents, or Downloads folders")
409
377
 
410
378
  def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
411
379
  server = db_preset['host']
@@ -413,8 +381,7 @@ def load_data_from_query(
413
381
  password = db_preset['password']
414
382
  database = db_preset.get('database', '')
415
383
 
416
- with pymssql.connect(server=server, user=user, password=password,
417
- database=database) as conn:
384
+ with pymssql.connect(server=server, user=user, password=password, database=database) as conn:
418
385
  with conn.cursor() as cursor:
419
386
  cursor.execute(query)
420
387
  rows = cursor.fetchall()
@@ -428,25 +395,15 @@ def load_data_from_query(
428
395
  password = db_preset['password']
429
396
  database = db_preset.get('database', '')
430
397
 
431
- with mysql.connector.connect(
432
- host=host,
433
- user=user,
434
- password=password,
435
- database=database
436
- ) as conn:
398
+ with mysql.connector.connect(host=host, user=user, password=password, database=database) as conn:
437
399
  with conn.cursor() as cursor:
438
400
  cursor.execute(query)
439
401
  rows = cursor.fetchall()
440
- columns = (
441
- [desc[0] for desc in cursor.description]
442
- if cursor.description
443
- else []
444
- )
402
+ columns = ([desc[0] for desc in cursor.description] if cursor.description else [])
445
403
 
446
404
  return pd.DataFrame(rows, columns=columns)
447
405
 
448
- def query_clickhouse(
449
- db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
406
+ def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
450
407
 
451
408
  host = db_preset['host']
452
409
  user = db_preset['username']
@@ -458,13 +415,7 @@ def load_data_from_query(
458
415
 
459
416
  for attempt in range(max_retries):
460
417
  try:
461
- client = clickhouse_connect.get_client(
462
- host=host,
463
- port='8123',
464
- username=user,
465
- password=password,
466
- database=database
467
- )
418
+ client = clickhouse_connect.get_client(host=host, port='8123', username=user, password=password, database=database)
468
419
  data = client.query(query)
469
420
  rows = data.result_rows
470
421
  columns = data.column_names
@@ -478,13 +429,11 @@ def load_data_from_query(
478
429
  raise ConnectionError(
479
430
  "All attempts to connect to ClickHouse failed.")
480
431
 
481
- def query_google_big_query(
482
- db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
432
+ def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
483
433
  json_file_path = db_preset['json_file_path']
484
434
  project_id = db_preset['project_id']
485
435
 
486
- credentials = service_account.Credentials.from_service_account_file(
487
- json_file_path)
436
+ credentials = service_account.Credentials.from_service_account_file(json_file_path)
488
437
  client = bigquery.Client(credentials=credentials, project=project_id)
489
438
 
490
439
  query_job = client.query(query)
@@ -500,9 +449,7 @@ def load_data_from_query(
500
449
  config = json.load(f)
501
450
 
502
451
  db_presets = config.get('db_presets', [])
503
- db_preset = next(
504
- (preset for preset in db_presets if preset['name'] == db_preset_name),
505
- None)
452
+ db_preset = next((preset for preset in db_presets if preset['name'] == db_preset_name), None)
506
453
  if not db_preset:
507
454
  raise ValueError(f"No matching db_preset found for {db_preset_name}")
508
455
 
@@ -584,8 +531,7 @@ def load_data_from_path(file_path: str) -> pd.DataFrame:
584
531
 
585
532
  def load_data_from_sqlite_path(sqlite_path: str, query: str) -> pd.DataFrame:
586
533
  """
587
- Execute a query on a SQLite database specified by its path and return the
588
- results as a DataFrame.
534
+ Execute a query on a SQLite database specified by its path and return the results as a DataFrame.
589
535
 
590
536
  Parameters:
591
537
  sqlite_path: The absolute path to the SQLite database file.
@@ -615,10 +561,7 @@ def first_n_rows(df: pd.DataFrame, n: int) -> None:
615
561
  """
616
562
  Display the first n rows of the DataFrame.
617
563
 
618
- This function prints out the first `n` rows of a given DataFrame. Each row
619
- is formatted for clarity and
620
- printed as a dictionary. If the DataFrame is empty or `None`, it raises a
621
- ValueError.
564
+ This function prints out the first `n` rows of a given DataFrame. Each row is formatted for clarity and printed as a dictionary. If the DataFrame is empty or `None`, it raises a ValueError.
622
565
 
623
566
  Parameters:
624
567
  - df (pd.DataFrame): The DataFrame to display rows from.
@@ -643,8 +586,7 @@ def last_n_rows(df: pd.DataFrame, n: int) -> None:
643
586
  """
644
587
  Display the last n rows of the DataFrame.
645
588
 
646
- Prints the last `n` rows of a given DataFrame, formatted as dictionaries.
647
- Useful for end-segment analysis and verifying data continuity.
589
+ Prints the last `n` rows of a given DataFrame, formatted as dictionaries. Useful for end-segment analysis and verifying data continuity.
648
590
 
649
591
  Parameters:
650
592
  - df (pd.DataFrame): The DataFrame from which to display rows.
@@ -669,15 +611,13 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
669
611
  """
670
612
  Print the top `n` unique values for specified columns in the DataFrame.
671
613
 
672
- This method calculates and prints the top `n` unique frequency values for
673
- specified columns in a DataFrame.
614
+ This method calculates and prints the top `n` unique frequency values for specified columns in a DataFrame.
674
615
 
675
616
  Parameters:
676
617
  - df (pd.DataFrame): The DataFrame from which to calculate top unique
677
618
  values.
678
619
  - n (int): Number of top values to display.
679
- - columns (List[str]): List of column names for which to display top
680
- unique values.
620
+ - columns (List[str]): List of column names for which to display top unique values.
681
621
 
682
622
  Raises:
683
623
  - ValueError: If the DataFrame is `None`.
@@ -687,20 +627,10 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
687
627
  for column in columns:
688
628
  if column in df.columns:
689
629
  frequency = df[column].astype(str).value_counts(dropna=False)
690
- frequency = frequency.rename(
691
- index={
692
- 'nan': 'NaN',
693
- 'NaT': 'NaT',
694
- 'None': 'None',
695
- '': 'Empty'})
630
+ frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
696
631
  top_n_values = frequency.nlargest(n)
697
- report[column] = {str(value): str(count)
698
- for value, count in top_n_values.items()}
699
- print(
700
- f"Top {n} unique values for column '{column}':\n{
701
- json.dumps(
702
- report[column],
703
- indent=2)}\n")
632
+ report[column] = {str(value): str(count) for value, count in top_n_values.items()}
633
+ print(f"Top {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
704
634
  else:
705
635
  print(f"Column '{column}' does not exist in the DataFrame.")
706
636
  else:
@@ -710,22 +640,17 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
710
640
  gc.collect()
711
641
 
712
642
 
713
- def bottom_n_unique_values(
714
- df: pd.DataFrame,
715
- n: int,
716
- columns: List[str]) -> None:
643
+ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
717
644
  """
718
645
  Print the bottom `n` unique values for specified columns in the DataFrame.
719
646
 
720
- This method calculates and prints the bottom `n` unique frequency values
721
- for specified columns in a DataFrame.
647
+ This method calculates and prints the bottom `n` unique frequency values for specified columns in a DataFrame.
722
648
 
723
649
  Parameters:
724
650
  - df (pd.DataFrame): The DataFrame from which to calculate bottom unique
725
651
  values.
726
652
  - n (int): Number of bottom unique frequency values to display.
727
- - columns (List[str]): List of column names for which to display bottom
728
- unique values.
653
+ - columns (List[str]): List of column names for which to display bottom unique values.
729
654
 
730
655
  Raises:
731
656
  - ValueError: If the DataFrame is `None`.
@@ -735,21 +660,12 @@ def bottom_n_unique_values(
735
660
  for column in columns:
736
661
  if column in df.columns:
737
662
  frequency = df[column].astype(str).value_counts(dropna=False)
738
- frequency = frequency.rename(
739
- index={
740
- 'nan': 'NaN',
741
- 'NaT': 'NaT',
742
- 'None': 'None',
743
- '': 'Empty'})
663
+ frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
744
664
  bottom_n_values = frequency.nsmallest(n)
745
665
  report[column] = {
746
666
  str(value): str(count) for value,
747
667
  count in bottom_n_values.items()}
748
- print(
749
- f"Bottom {n} unique values for column '{column}':\n{
750
- json.dumps(
751
- report[column],
752
- indent=2)}\n")
668
+ print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
753
669
  else:
754
670
  print(f"Column '{column}' does not exist in the DataFrame.")
755
671
  else:
@@ -759,18 +675,15 @@ def bottom_n_unique_values(
759
675
  gc.collect()
760
676
 
761
677
 
762
- def print_correlation(
763
- df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
678
+ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
764
679
  """
765
680
  Print correlation for multiple pairs of columns in the DataFrame.
766
681
 
767
- This function computes and displays the correlation coefficients for
768
- specified pairs of columns.
682
+ This function computes and displays the correlation coefficients for specified pairs of columns.
769
683
 
770
684
  Parameters:
771
685
  - df (pd.DataFrame): The DataFrame containing the columns to analyze.
772
- - column_pairs (List[Tuple[str, str]]): List of column pairs for which to
773
- compute correlations.
686
+ - column_pairs (List[Tuple[str, str]]): List of column pairs for which to compute correlations.
774
687
  """
775
688
  if df is not None:
776
689
  for col1, col2 in column_pairs:
@@ -781,21 +694,13 @@ def print_correlation(
781
694
 
782
695
  correlation = numeric_col1.corr(numeric_col2)
783
696
  if pd.notnull(correlation):
784
- print(
785
- f"The correlation between '{col1}' and '{col2}'"
786
- + f" is {correlation}.")
697
+ print(f"The correlation between '{col1}' and '{col2}' is {correlation}.")
787
698
  else:
788
- print(
789
- f"Cannot calculate correlation between '{col1}'"
790
- + f" and '{col2}' due to insufficient numeric"
791
- + " data.")
699
+ print(f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
792
700
  except Exception as e:
793
- print(
794
- f"Error processing cols '{col1}' and '{col2}': {e}")
701
+ print(f"Error processing cols '{col1}' and '{col2}': {e}")
795
702
  else:
796
- print(
797
- f"One or both of the specified cols ('{col1}', '{col2}')"
798
- + " do not exist in the DataFrame.")
703
+ print(f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
799
704
  else:
800
705
  print("The DataFrame is empty.")
801
706
 
@@ -806,8 +711,7 @@ def print_memory_usage(df: pd.DataFrame) -> None:
806
711
  """
807
712
  Prints the memory usage of the DataFrame.
808
713
 
809
- This function computes the memory footprint of a DataFrame in megabytes
810
- and displays it, rounding to two decimal places for clarity.
714
+ This function computes the memory footprint of a DataFrame in megabytes and displays it, rounding to two decimal places for clarity.
811
715
 
812
716
  Parameters:
813
717
  - df (pd.DataFrame): The DataFrame for which the memory usage is computed.
@@ -816,8 +720,7 @@ def print_memory_usage(df: pd.DataFrame) -> None:
816
720
  - ValueError: If the DataFrame is `None`.
817
721
  """
818
722
  if df is not None:
819
- memory_usage = df.memory_usage(deep=True).sum(
820
- ) / (1024 * 1024) # Convert bytes to MB
723
+ memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024) # Convert bytes to MB
821
724
  print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
822
725
  else:
823
726
  raise ValueError("No DataFrame to print. Please provide a DataFrame.")
@@ -829,9 +732,7 @@ def filter_dataframe(df: pd.DataFrame, filter_expr: str) -> pd.DataFrame:
829
732
  """
830
733
  Return a filtered DataFrame according to the given expression.
831
734
 
832
- This function filters rows of a DataFrame using a specified query
833
- expression, returning a new DataFrame containing only the rows that
834
- match the criteria.
735
+ This function filters rows of a DataFrame using a specified query expression, returning a new DataFrame containing only the rows that match the criteria.
835
736
 
836
737
  Parameters:
837
738
  - df (pd.DataFrame): The original DataFrame to be filtered.
@@ -860,19 +761,14 @@ def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
860
761
  """
861
762
  Filter and return DataFrame rows containing valid Indian mobile numbers.
862
763
 
863
- This function processes a DataFrame to extract and retain rows where the
864
- specified column matches the typical format for Indian mobile numbers.
865
- An Indian mobile number is expected to be a digit-only string starting
866
- with 6, 7, 8, or 9, and should have at least 4 distinct digits.
764
+ This function processes a DataFrame to extract and retain rows where the specified column matches the typical format for Indian mobile numbers. An Indian mobile number is expected to be a digit-only string starting with 6, 7, 8, or 9, and should have at least 4 distinct digits.
867
765
 
868
766
  Parameters:
869
767
  - df (pd.DataFrame): The DataFrame to filter.
870
- - mobile_col (str): The name of the column in the DataFrame that contains
871
- mobile number data.
768
+ - mobile_col (str): The name of the column in the DataFrame that contains mobile number data.
872
769
 
873
770
  Returns:
874
- - pd.DataFrame: A new DataFrame containing only rows with valid Indian
875
- mobile numbers.
771
+ - pd.DataFrame: A new DataFrame containing only rows with valid Indian mobile numbers.
876
772
 
877
773
  Raises:
878
774
  - ValueError: If the DataFrame is `None`.
@@ -897,18 +793,15 @@ def filter_indian_mobiles(df: pd.DataFrame, mobile_col: str) -> pd.DataFrame:
897
793
 
898
794
  def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
899
795
  """
900
- Print the DataFrame and its column types. If a source path is provided,
901
- print it as well.
796
+ Print the DataFrame and its column types. If a source path is provided, print it as well.
902
797
 
903
798
  Parameters:
904
799
  df: The DataFrame to print.
905
- source: Optional; The source path of the DataFrame for logging
906
- purposes.
800
+ source: Optional; The source path of the DataFrame for logging purposes.
907
801
  """
908
802
  if df is not None:
909
803
  print(df)
910
- columns_with_types = [
911
- f"{col} ({df[col].dtypes})" for col in df.columns]
804
+ columns_with_types = [f"{col} ({df[col].dtypes})" for col in df.columns]
912
805
  print("Columns:", columns_with_types)
913
806
  if source:
914
807
  print(f"Source: {source}")
@@ -918,35 +811,22 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
918
811
  gc.collect()
919
812
 
920
813
 
921
- def send_dataframe_via_telegram(
922
- df: pd.DataFrame,
923
- bot_name: str,
924
- message: Optional[str] = None,
925
- as_file: bool = True,
926
- remove_after_send: bool = True) -> None:
814
+ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
927
815
  """
928
816
  Send a DataFrame via Telegram using a specified bot configuration.
929
817
 
930
818
  Parameters:
931
819
  df: The DataFrame to send.
932
- bot_name: The name of the Telegram bot as specified in the
933
- configuration.
820
+ bot_name: The name of the Telegram bot as specified in the configuration.
934
821
  message: Custom message to send along with the DataFrame or file.
935
- as_file: Boolean flag to decide whether to send the DataFrame as a
936
- file or as text.
822
+ as_file: Boolean flag to decide whether to send the DataFrame as a file or as text.
937
823
  remove_after_send: If True, removes the file after sending.
938
824
  """
939
825
 
940
826
  def locate_config_file(filename: str = "rgwml.config") -> str:
941
827
  """Retrieve the configuration file path."""
942
828
  home_dir = os.path.expanduser("~")
943
- search_paths = [
944
- os.path.join(
945
- home_dir,
946
- folder) for folder in [
947
- "Desktop",
948
- "Documents",
949
- "Downloads"]]
829
+ search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
950
830
 
951
831
  for path in search_paths:
952
832
  for root, _, files in os.walk(path):
@@ -962,13 +842,7 @@ def send_dataframe_via_telegram(
962
842
 
963
843
  config_path = locate_config_file()
964
844
  config = get_config(config_path)
965
- bot_config = next(
966
- (
967
- bot for bot in config['telegram_bot_presets']
968
- if bot['name'] == bot_name
969
- ),
970
- None
971
- )
845
+ bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
972
846
 
973
847
  if not bot_config:
974
848
  raise ValueError(f"No bot found with the name {bot_name}")
@@ -982,15 +856,9 @@ def send_dataframe_via_telegram(
982
856
  df.to_csv(file_name, index=False)
983
857
  try:
984
858
  with open(file_name, 'rb') as file:
985
- payload = {
986
- 'chat_id': bot_config['chat_id'],
987
- 'caption': message or ''}
859
+ payload = {'chat_id': bot_config['chat_id'], 'caption': message or ''}
988
860
  files = {'document': file}
989
- response = requests.post(
990
- f"https://api.telegram.org/bot{
991
- bot_config['bot_token']}/sendDocument",
992
- data=payload,
993
- files=files)
861
+ response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendDocument", data=payload, files=files)
994
862
  if remove_after_send and os.path.exists(file_name):
995
863
  os.remove(file_name)
996
864
  except Exception as e:
@@ -1002,9 +870,7 @@ def send_dataframe_via_telegram(
1002
870
  'chat_id': bot_config['chat_id'],
1003
871
  'text': message + "\n\n" + df_str if message else df_str,
1004
872
  'parse_mode': 'HTML'}
1005
- response = requests.post(
1006
- f"https://api.telegram.org/bot{bot_config['bot_token']}"
1007
- + "/sendMessage", data=payload)
873
+ response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
1008
874
 
1009
875
  if not response.ok:
1010
876
  raise Exception(f"Error sending message: {response.text}")
@@ -1012,49 +878,30 @@ def send_dataframe_via_telegram(
1012
878
  print("Message sent successfully.")
1013
879
 
1014
880
 
1015
- def send_data_to_email(
1016
- df: pd.DataFrame,
1017
- preset_name: str,
1018
- to_email: str,
1019
- subject: Optional[str] = None,
1020
- body: Optional[str] = None,
1021
- as_file: bool = True,
1022
- remove_after_send: bool = True
1023
- ) -> None:
881
+ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subject: Optional[str] = None, body: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
1024
882
  """
1025
- Send an email with optional DataFrame attachment using Gmail API via a
1026
- specified preset.
883
+ Send an email with optional DataFrame attachment using Gmail API via a specified preset.
1027
884
 
1028
885
  Parameters:
1029
886
  df: The DataFrame to send.
1030
- preset_name: The configuration preset name to use for sending the
1031
- email.
887
+ preset_name: The configuration preset name to use for sending the email.
1032
888
  to_email: The recipient email address.
1033
889
  subject: Optional subject of the email.
1034
890
  body: Optional message body of the email.
1035
- as_file: Boolean flag to decide whether to send the DataFrame as a
1036
- file.
891
+ as_file: Boolean flag to decide whether to send the DataFrame as a file.
1037
892
  remove_after_send: If True, removes the CSV file after sending.
1038
893
  """
1039
894
 
1040
895
  def locate_config_file(filename: str = "rgwml.config") -> str:
1041
896
  """Locate config file in common user directories."""
1042
897
  home_dir = os.path.expanduser("~")
1043
- search_paths = [
1044
- os.path.join(
1045
- home_dir,
1046
- folder) for folder in [
1047
- "Desktop",
1048
- "Documents",
1049
- "Downloads"]]
898
+ search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
1050
899
 
1051
900
  for path in search_paths:
1052
901
  for root, _, files in os.walk(path):
1053
902
  if filename in files:
1054
903
  return os.path.join(root, filename)
1055
- raise FileNotFoundError(
1056
- f"{filename} not found in Desktop, Documents, or Downloads"
1057
- + " folders")
904
+ raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
1058
905
 
1059
906
  def get_config(config_path: str) -> dict:
1060
907
  with open(config_path, 'r') as file:
@@ -1078,13 +925,7 @@ def send_data_to_email(
1078
925
  config = get_config(config_path)
1079
926
 
1080
927
  # Retrieve Gmail preset configuration
1081
- gmail_config = next(
1082
- (
1083
- preset for preset in config['gmail_bot_presets']
1084
- if preset['name'] == preset_name
1085
- ),
1086
- None
1087
- )
928
+ gmail_config = next((preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name), None)
1088
929
 
1089
930
  if not gmail_config:
1090
931
  raise ValueError(f"No preset found with the name {preset_name}")
@@ -1097,9 +938,7 @@ def send_data_to_email(
1097
938
 
1098
939
  if as_file:
1099
940
  # Create a temporary file for the DataFrame as CSV
1100
- with tempfile.NamedTemporaryFile(
1101
- delete=False, suffix=".csv"
1102
- ) as tmp_file:
941
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
1103
942
  tmp_file_name = tmp_file.name
1104
943
  df.to_csv(tmp_file_name, index=False)
1105
944
 
@@ -1109,18 +948,13 @@ def send_data_to_email(
1109
948
  message['to'] = to_email
1110
949
  message['from'] = sender_email
1111
950
  message['subject'] = subject if subject else 'DataFrame CSV File'
1112
- message.attach(
1113
- MIMEText(
1114
- body if body else 'Please find the CSV file attached.'))
951
+ message.attach(MIMEText(body if body else 'Please find the CSV file attached.'))
1115
952
 
1116
953
  with open(tmp_file_name, 'rb') as file:
1117
954
  part = MIMEBase('application', 'octet-stream')
1118
955
  part.set_payload(file.read())
1119
956
  encoders.encode_base64(part)
1120
- part.add_header(
1121
- 'Content-Disposition',
1122
- f'attachment; filename={
1123
- os.path.basename(tmp_file_name)}')
957
+ part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(tmp_file_name)}')
1124
958
  message.attach(part)
1125
959
 
1126
960
  if remove_after_send and os.path.exists(tmp_file_name):
@@ -1142,20 +976,13 @@ def send_data_to_email(
1142
976
  try:
1143
977
  raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
1144
978
  email_body = {'raw': raw}
1145
- sent_message = service.users().messages().send(
1146
- userId="me", body=email_body).execute()
979
+ sent_message = service.users().messages().send(userId="me", body=email_body).execute()
1147
980
  print(f"Email with Message Id {sent_message['id']} successfully sent.")
1148
981
  except Exception as error:
1149
982
  raise Exception(f"Error sending email: {error}")
1150
983
 
1151
984
 
1152
- def send_data_to_slack(
1153
- df: pd.DataFrame,
1154
- bot_name: str,
1155
- message: Optional[str] = None,
1156
- as_file: bool = True,
1157
- remove_after_send: bool = True
1158
- ) -> None:
985
+ def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
1159
986
  """
1160
987
  Send a DataFrame or message to Slack using a specified bot configuration.
1161
988
 
@@ -1163,29 +990,21 @@ def send_data_to_slack(
1163
990
  df: The DataFrame to send.
1164
991
  bot_name: The Slack bot configuration preset name.
1165
992
  message: Custom message to send along with the DataFrame or file.
1166
- as_file: Boolean flag to decide whether to send the DataFrame as a
1167
- file.
993
+ as_file: Boolean flag to decide whether to send the DataFrame as a file.
1168
994
  remove_after_send: If True, removes the CSV file after sending.
1169
995
  """
1170
996
 
1171
997
  def locate_config_file(filename: str = "rgwml.config") -> str:
1172
998
  """Locate config file in common user directories."""
1173
999
  home_dir = os.path.expanduser("~")
1174
- search_paths = [
1175
- os.path.join(
1176
- home_dir,
1177
- folder) for folder in [
1178
- "Desktop",
1179
- "Documents",
1180
- "Downloads"]]
1000
+ search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
1181
1001
 
1182
1002
  for path in search_paths:
1183
1003
  for root, _, files in os.walk(path):
1184
1004
  if filename in files:
1185
1005
  return os.path.join(root, filename)
1186
1006
  raise FileNotFoundError(
1187
- f"{filename} not found in Desktop, Documents, or Downloads"
1188
- + " folders")
1007
+ f"{filename} not found in Desktop, Documents, or Downloads folders")
1189
1008
 
1190
1009
  def get_config(config_path: str) -> dict:
1191
1010
  """Load configuration from a JSON file."""
@@ -1196,13 +1015,7 @@ def send_data_to_slack(
1196
1015
  config_path = locate_config_file()
1197
1016
  config = get_config(config_path)
1198
1017
 
1199
- bot_config = next(
1200
- (
1201
- bot for bot in config['slack_bot_presets']
1202
- if bot['name'] == bot_name
1203
- ),
1204
- None
1205
- )
1018
+ bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
1206
1019
 
1207
1020
  if not bot_config:
1208
1021
  raise ValueError(f"No bot found with the name {bot_name}")
@@ -1211,30 +1024,19 @@ def send_data_to_slack(
1211
1024
 
1212
1025
  if as_file:
1213
1026
  # Create a temporary file for the DataFrame as CSV
1214
- with tempfile.NamedTemporaryFile(
1215
- delete=False, suffix=".csv"
1216
- ) as tmp_file:
1027
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp_file:
1217
1028
  file_name = tmp_file.name
1218
1029
  df.to_csv(file_name, index=False)
1219
1030
 
1220
1031
  try:
1221
1032
  with open(file_name, 'rb') as file:
1222
- response = client.files_upload(
1223
- channels=bot_config['channel_id'],
1224
- file=file,
1225
- filename=os.path.basename(file_name),
1226
- title="DataFrame Upload",
1227
- initial_comment=message or ''
1228
- )
1033
+ response = client.files_upload(channels=bot_config['channel_id'], file=file, filename=os.path.basename(file_name), title="DataFrame Upload", initial_comment=message or '')
1229
1034
  finally:
1230
1035
  if remove_after_send and os.path.exists(file_name):
1231
1036
  os.remove(file_name)
1232
1037
  else:
1233
1038
  df_str = df.to_string()
1234
- response = client.chat_postMessage(
1235
- channel=bot_config['channel_id'],
1236
- text=(message + "\n\n" + df_str) if message else df_str
1237
- )
1039
+ response = client.chat_postMessage(channel=bot_config['channel_id'], text=(message + "\n\n" + df_str) if message else df_str)
1238
1040
 
1239
1041
  # Check if the message was sent successfully
1240
1042
  if not response["ok"]:
@@ -1291,12 +1093,7 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
1291
1093
  return df[new_order]
1292
1094
 
1293
1095
 
1294
- def append_ranged_classification_column(
1295
- df: pd.DataFrame,
1296
- ranges: str,
1297
- target_col: str,
1298
- new_col_name: str
1299
- ) -> pd.DataFrame:
1096
+ def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
1300
1097
  """
1301
1098
  Append a ranged classification column to the DataFrame.
1302
1099
 
@@ -1364,39 +1161,22 @@ def append_ranged_classification_column(
1364
1161
  for r in range_list
1365
1162
  )
1366
1163
 
1367
- labels = [
1368
- f"{pad_number(range_list[i], max_integer_length)}"
1369
- f" to "
1370
- f"{pad_number(range_list[i + 1], max_integer_length)}"
1371
- for i in range(len(range_list) - 1)
1372
- ]
1164
+ labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
1373
1165
 
1374
1166
  # Ensure the target column is numeric
1375
1167
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
1376
-
1377
- df[new_col_name] = pd.cut(
1378
- df[target_col],
1379
- bins=range_list,
1380
- labels=labels,
1381
- right=False,
1382
- include_lowest=True)
1168
+ df[new_col_name] = pd.cut(df[target_col], bins=range_list, labels=labels, right=False, include_lowest=True)
1383
1169
 
1384
1170
  return df
1385
1171
 
1386
1172
 
1387
- def append_percentile_classification_column(
1388
- df: pd.DataFrame,
1389
- percentiles: str,
1390
- target_col: str,
1391
- new_col_name: str
1392
- ) -> pd.DataFrame:
1173
+ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str, target_col: str, new_col_name: str) -> pd.DataFrame:
1393
1174
  """
1394
1175
  Append a percentile classification column to the DataFrame.
1395
1176
 
1396
1177
  Parameters:
1397
1178
  df: The DataFrame to modify.
1398
- percentiles: A string representation of percentile values separated
1399
- by commas.
1179
+ percentiles: A string representation of percentile values separated by commas.
1400
1180
  target_col: The column to analyze.
1401
1181
  new_col_name: The name of the new classification column.
1402
1182
 
@@ -1418,57 +1198,28 @@ def append_percentile_classification_column(
1418
1198
 
1419
1199
  if has_decimals:
1420
1200
  percentiles_list = [float(p) for p in percentiles_list]
1421
-
1422
- max_decimal_length = max(
1423
- len(str(p).split('.')[1])
1424
- for p in percentiles_list
1425
- if '.' in str(p)
1426
- )
1427
-
1428
- max_integer_length = max(
1429
- len(str(int(float(p))))
1430
- for p in percentiles_list
1431
- )
1201
+ max_decimal_length = max(len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
1202
+ max_integer_length = max(len(str(int(float(p)))) for p in percentiles_list)
1432
1203
 
1433
1204
  labels = []
1434
1205
 
1435
1206
  for i in range(len(percentiles_list) - 1):
1436
- start = pad_number(
1437
- percentiles_list[i],
1438
- max_integer_length,
1439
- max_decimal_length,
1440
- decimal=True
1441
- )
1442
-
1443
- end = pad_number(
1444
- percentiles_list[i + 1],
1445
- max_integer_length,
1446
- max_decimal_length,
1447
- decimal=True
1448
- )
1207
+ start = pad_number(percentiles_list[i], max_integer_length, max_decimal_length, decimal=True)
1208
+ end = pad_number(percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)
1449
1209
 
1450
1210
  label = f"{start} to {end}"
1451
1211
  labels.append(label)
1452
1212
  else:
1453
1213
  percentiles_list = [int(p) for p in percentiles_list]
1454
1214
 
1455
- max_integer_length = max(
1456
- len(str(p))
1457
- for p in percentiles_list
1458
- )
1215
+ max_integer_length = max(len(str(p)) for p in percentiles_list)
1459
1216
 
1460
1217
  labels = []
1461
1218
 
1462
1219
  for i in range(len(percentiles_list) - 1):
1463
- start = pad_number(
1464
- percentiles_list[i],
1465
- max_integer_length
1466
- )
1220
+ start = pad_number(percentiles_list[i], max_integer_length)
1467
1221
 
1468
- end = pad_number(
1469
- percentiles_list[i + 1],
1470
- max_integer_length
1471
- )
1222
+ end = pad_number(percentiles_list[i + 1], max_integer_length)
1472
1223
 
1473
1224
  label = f"{start} to {end}"
1474
1225
  labels.append(label)
@@ -1477,28 +1228,18 @@ def append_percentile_classification_column(
1477
1228
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
1478
1229
  quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]
1479
1230
 
1480
- df[new_col_name] = pd.cut(
1481
- df[target_col],
1482
- bins=quantiles,
1483
- labels=labels,
1484
- include_lowest=True)
1231
+ df[new_col_name] = pd.cut(df[target_col], bins=quantiles, labels=labels, include_lowest=True)
1485
1232
 
1486
1233
  return df
1487
1234
 
1488
1235
 
1489
- def append_ranged_date_classification_column(
1490
- df: pd.DataFrame,
1491
- date_ranges: str,
1492
- target_col: str,
1493
- new_col_name: str
1494
- ) -> pd.DataFrame:
1236
+ def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
1495
1237
  """
1496
1238
  Append a ranged date classification column to the DataFrame.
1497
1239
 
1498
1240
  Parameters:
1499
1241
  df: The DataFrame to modify.
1500
- date_ranges: A string representation of date ranges separated by
1501
- commas.
1242
+ date_ranges: A string representation of date ranges separated by commas.
1502
1243
  target_col: The date column to analyze.
1503
1244
  new_col_name: The name of the new date classification column.
1504
1245
 
@@ -1525,24 +1266,19 @@ def append_ranged_date_classification_column(
1525
1266
  return df
1526
1267
 
1527
1268
 
1528
- def rename_columns(df: pd.DataFrame,
1529
- rename_pairs: Dict[str,
1530
- str]) -> pd.DataFrame:
1269
+ def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFrame:
1531
1270
  """
1532
1271
  Rename columns in the DataFrame.
1533
1272
 
1534
1273
  Parameters:
1535
1274
  df: The DataFrame to modify.
1536
- rename_pairs: A dictionary mapping old column names to new column
1537
- names.
1275
+ rename_pairs: A dictionary mapping old column names to new column names.
1538
1276
 
1539
1277
  Returns:
1540
1278
  A new DataFrame with columns renamed.
1541
1279
  """
1542
1280
  if df is None:
1543
- raise ValueError(
1544
- "No DataFrame to rename columns. Please provide a valid"
1545
- + " DataFrame.")
1281
+ raise ValueError("No DataFrame to rename columns. Please provide a valid DataFrame.")
1546
1282
 
1547
1283
  return df.rename(columns=rename_pairs)
1548
1284
 
@@ -1560,8 +1296,7 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
1560
1296
  A new DataFrame sorted by specified columns.
1561
1297
  """
1562
1298
  if df is None:
1563
- raise ValueError(
1564
- "No DataFrame to sort. Please provide a valid DataFrame.")
1299
+ raise ValueError("No DataFrame to sort. Please provide a valid DataFrame.")
1565
1300
 
1566
1301
  col_names = []
1567
1302
  asc_order = []
@@ -1590,15 +1325,13 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
1590
1325
 
1591
1326
  Parameters:
1592
1327
  df: The DataFrame to modify.
1593
- ratio_str: A string specifying the ratio of TRAIN:TEST or
1594
- TRAIN:VALIDATE:TEST.
1328
+ ratio_str: A string specifying the ratio of TRAIN:TEST or TRAIN:VALIDATE:TEST.
1595
1329
 
1596
1330
  Returns:
1597
1331
  A new DataFrame with XGB_TYPE labels appended.
1598
1332
  """
1599
1333
  if df is None:
1600
- raise ValueError(
1601
- "No DataFrame to add labels. Please provide a valid DataFrame.")
1334
+ raise ValueError("No DataFrame to add labels. Please provide a valid DataFrame.")
1602
1335
 
1603
1336
  ratios = list(map(int, ratio_str.split(':')))
1604
1337
  total_ratio = sum(ratios)
@@ -1615,9 +1348,7 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
1615
1348
  labels = ['TRAIN'] * train_rows + ['VALIDATE'] * \
1616
1349
  validate_rows + ['TEST'] * test_rows
1617
1350
  else:
1618
- raise ValueError(
1619
- "Invalid ratio string format. Use 'TRAIN:TEST' or"
1620
- + "'TRAIN:VALIDATE:TEST'.")
1351
+ raise ValueError("Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
1621
1352
 
1622
1353
  df_with_labels = df.copy()
1623
1354
  df_with_labels['XGB_TYPE'] = labels
@@ -1625,17 +1356,9 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
1625
1356
  return df_with_labels
1626
1357
 
1627
1358
 
1628
- def append_xgb_regression_predictions(
1629
- df: pd.DataFrame,
1630
- target_col: str,
1631
- feature_cols: str,
1632
- pred_col: str,
1633
- boosting_rounds: int = 100,
1634
- model_path: Optional[str] = None
1635
- ) -> pd.DataFrame:
1359
+ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
1636
1360
  """
1637
- Append XGB regression predictions to DataFrame. Assumes data is labeled
1638
- by an 'XGB_TYPE' column.
1361
+ Append XGB regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
1639
1362
 
1640
1363
  Parameters:
1641
1364
  df: DataFrame to modify.
@@ -1649,8 +1372,7 @@ def append_xgb_regression_predictions(
1649
1372
  DataFrame with predictions appended.
1650
1373
  """
1651
1374
  if df is None or 'XGB_TYPE' not in df.columns:
1652
- raise ValueError(
1653
- "DataFrame is not initialized or 'XGB_TYPE' column is missing.")
1375
+ raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
1654
1376
 
1655
1377
  features = feature_cols.replace(' ', '').split(',')
1656
1378
 
@@ -1666,30 +1388,16 @@ def append_xgb_regression_predictions(
1666
1388
  else:
1667
1389
  validate_data = None
1668
1390
 
1669
- dtrain = xgb.DMatrix(
1670
- train_data[features],
1671
- label=train_data[target_col],
1672
- enable_categorical=True)
1391
+ dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
1673
1392
  evals = [(dtrain, 'train')]
1674
1393
 
1675
1394
  if validate_data is not None:
1676
- dvalidate = xgb.DMatrix(
1677
- validate_data[features],
1678
- label=validate_data[target_col],
1679
- enable_categorical=True)
1395
+ dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
1680
1396
  evals.append((dvalidate, 'validate'))
1681
1397
 
1682
- params = {
1683
- 'objective': 'reg:squarederror',
1684
- 'eval_metric': 'rmse'
1685
- }
1398
+ params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse'}
1686
1399
 
1687
- model = xgb.train(
1688
- params,
1689
- dtrain,
1690
- num_boost_round=boosting_rounds,
1691
- evals=evals,
1692
- early_stopping_rounds=10 if validate_data is not None else None)
1400
+ model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
1693
1401
 
1694
1402
  # Make predictions for all data
1695
1403
  dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1698,24 +1406,15 @@ def append_xgb_regression_predictions(
1698
1406
  if model_path:
1699
1407
  model.save_model(model_path)
1700
1408
 
1701
- columns_order = [col for col in df.columns if col not in [
1702
- 'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1409
+ columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1703
1410
  df = df[columns_order]
1704
1411
 
1705
1412
  return df
1706
1413
 
1707
1414
 
1708
- def append_xgb_logistic_regression_predictions(
1709
- df: pd.DataFrame,
1710
- target_col: str,
1711
- feature_cols: str,
1712
- pred_col: str,
1713
- boosting_rounds: int = 100,
1714
- model_path: Optional[str] = None
1715
- ) -> pd.DataFrame:
1415
+ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
1716
1416
  """
1717
- Append XGB logistic regression predictions to DataFrame. Assumes data is
1718
- labeled by an 'XGB_TYPE' column.
1417
+ Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
1719
1418
 
1720
1419
  Parameters:
1721
1420
  df: DataFrame to modify.
@@ -1745,30 +1444,16 @@ def append_xgb_logistic_regression_predictions(
1745
1444
  if 'VALIDATE' in df['XGB_TYPE'].values:
1746
1445
  validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
1747
1446
 
1748
- dtrain = xgb.DMatrix(
1749
- train_data[features],
1750
- label=train_data[target_col],
1751
- enable_categorical=True)
1447
+ dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
1752
1448
  evals = [(dtrain, 'train')]
1753
1449
 
1754
1450
  if validate_data is not None:
1755
- dvalidate = xgb.DMatrix(
1756
- validate_data[features],
1757
- label=validate_data[target_col],
1758
- enable_categorical=True)
1451
+ dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
1759
1452
  evals.append((dvalidate, 'validate'))
1760
1453
 
1761
- params = {
1762
- 'objective': 'binary:logistic',
1763
- 'eval_metric': 'auc'
1764
- }
1454
+ params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}
1765
1455
 
1766
- model = xgb.train(
1767
- params,
1768
- dtrain,
1769
- num_boost_round=boosting_rounds,
1770
- evals=evals,
1771
- early_stopping_rounds=10 if validate_data is not None else None)
1456
+ model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
1772
1457
 
1773
1458
  # Make predictions for all data
1774
1459
  dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1777,19 +1462,13 @@ def append_xgb_logistic_regression_predictions(
1777
1462
  if model_path:
1778
1463
  model.save_model(model_path)
1779
1464
 
1780
- columns_order = [col for col in df.columns if col not in [
1781
- 'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1465
+ columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1782
1466
  df = df[columns_order]
1783
1467
 
1784
1468
  return df
1785
1469
 
1786
1470
 
1787
- def print_n_frequency_cascading(
1788
- df: pd.DataFrame,
1789
- n: int,
1790
- columns: str,
1791
- order_by: str = "FREQ_DESC"
1792
- ) -> None:
1471
+ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
1793
1472
  """
1794
1473
  Print the cascading frequency of top n values for specified columns.
1795
1474
 
@@ -1812,12 +1491,7 @@ def print_n_frequency_cascading(
1812
1491
  # Convert the column to string representation
1813
1492
  df[current_col] = df[current_col].astype(str)
1814
1493
  frequency = df[current_col].value_counts(dropna=False)
1815
- frequency = frequency.rename(
1816
- index={
1817
- 'nan': 'NaN',
1818
- 'NaT': 'NaT',
1819
- 'None': 'None',
1820
- '': 'Empty'})
1494
+ frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
1821
1495
 
1822
1496
  if limit is not None:
1823
1497
  frequency = frequency.nlargest(limit)
@@ -1832,15 +1506,10 @@ def print_n_frequency_cascading(
1832
1506
  filtered_df = df[df[current_col] == value]
1833
1507
 
1834
1508
  if len(columns) > 1:
1835
- sub_report = generate_cascade_report(
1836
- filtered_df, columns[1:], limit, order_by)
1837
- report[value] = {
1838
- "count": str(count), f"sub_distribution({
1839
- columns[1]})": sub_report if sub_report else {}}
1509
+ sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
1510
+ report[value] = {"count": str(count), f"sub_distribution({columns[1]})": sub_report if sub_report else {}}
1840
1511
  else:
1841
- report[value] = {
1842
- "count": str(count)
1843
- }
1512
+ report[value] = {"count": str(count)}
1844
1513
 
1845
1514
  return report
1846
1515
 
@@ -1848,30 +1517,17 @@ def print_n_frequency_cascading(
1848
1517
  if order_by == "ASC":
1849
1518
  return dict(sorted(frequency.items(), key=lambda item: item[0]))
1850
1519
  elif order_by == "DESC":
1851
- return dict(
1852
- sorted(
1853
- frequency.items(),
1854
- key=lambda item: item[0],
1855
- reverse=True))
1520
+ return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
1856
1521
  elif order_by == "FREQ_ASC":
1857
1522
  return dict(sorted(frequency.items(), key=lambda item: item[1]))
1858
1523
  else: # Default to "FREQ_DESC"
1859
- return dict(
1860
- sorted(
1861
- frequency.items(),
1862
- key=lambda item: item[1],
1863
- reverse=True))
1524
+ return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
1864
1525
 
1865
1526
  report = generate_cascade_report(df, columns, n, order_by)
1866
1527
  print(json.dumps(report, indent=2))
1867
1528
 
1868
1529
 
1869
- def print_n_frequency_linear(
1870
- df: pd.DataFrame,
1871
- n: int,
1872
- columns: str,
1873
- order_by: str = "FREQ_DESC"
1874
- ) -> None:
1530
+ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
1875
1531
  """
1876
1532
  Print the linear frequency of top n values for specified columns.
1877
1533
 
@@ -1891,19 +1547,13 @@ def print_n_frequency_linear(
1891
1547
  continue
1892
1548
 
1893
1549
  frequency = df[current_col].astype(str).value_counts(dropna=False)
1894
- frequency = frequency.rename(
1895
- index={
1896
- 'nan': 'NaN',
1897
- 'NaT': 'NaT',
1898
- 'None': 'None',
1899
- '': 'Empty'})
1550
+ frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
1900
1551
 
1901
1552
  if limit is not None:
1902
1553
  frequency = frequency.nlargest(limit)
1903
1554
 
1904
1555
  sorted_frequency = sort_frequency(frequency, order_by)
1905
- col_report = {str(value): str(count)
1906
- for value, count in sorted_frequency.items()}
1556
+ col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
1907
1557
  report[current_col] = col_report
1908
1558
 
1909
1559
  return report
@@ -1912,27 +1562,17 @@ def print_n_frequency_linear(
1912
1562
  if order_by == "ASC":
1913
1563
  return dict(sorted(frequency.items(), key=lambda item: item[0]))
1914
1564
  elif order_by == "DESC":
1915
- return dict(
1916
- sorted(
1917
- frequency.items(),
1918
- key=lambda item: item[0],
1919
- reverse=True))
1565
+ return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
1920
1566
  elif order_by == "FREQ_ASC":
1921
1567
  return dict(sorted(frequency.items(), key=lambda item: item[1]))
1922
1568
  else: # Default to "FREQ_DESC"
1923
- return dict(
1924
- sorted(
1925
- frequency.items(),
1926
- key=lambda item: item[1],
1927
- reverse=True))
1569
+ return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
1928
1570
 
1929
1571
  report = generate_linear_report(df, columns, n, order_by)
1930
1572
  print(json.dumps(report, indent=2))
1931
1573
 
1932
1574
 
1933
- def retain_columns(
1934
- df: pd.DataFrame,
1935
- columns_to_retain: List[str]) -> pd.DataFrame:
1575
+ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
1936
1576
  """
1937
1577
  Retain specified columns in the DataFrame and drop the others.
1938
1578
 
@@ -1948,11 +1588,7 @@ def retain_columns(
1948
1588
  return df[columns_to_retain]
1949
1589
 
1950
1590
 
1951
- def mask_against_dataframe(
1952
- df: pd.DataFrame,
1953
- other_df: pd.DataFrame,
1954
- column_name: str
1955
- ) -> pd.DataFrame:
1591
+ def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
1956
1592
  """
1957
1593
  Retain only rows with common column values between two DataFrames.
1958
1594
 
@@ -1969,11 +1605,7 @@ def mask_against_dataframe(
1969
1605
  return df[df[column_name].isin(other_df[column_name])]
1970
1606
 
1971
1607
 
1972
- def mask_against_dataframe_converse(
1973
- df: pd.DataFrame,
1974
- other_df: pd.DataFrame,
1975
- column_name: str
1976
- ) -> pd.DataFrame:
1608
+ def mask_against_dataframe_converse(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
1977
1609
  """
1978
1610
  Retain only rows with uncommon column values between two DataFrames.
1979
1611
 
@@ -1990,3 +1622,77 @@ def mask_against_dataframe_converse(
1990
1622
  raise ValueError("The specified column must exist in both DataFrames.")
1991
1623
 
1992
1624
  return df[~df[column_name].isin(other_df[column_name])]
1625
+
1626
+
1627
+ def union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
1628
+ """
1629
+ Perform a union join, concatenating the two DataFrames and dropping duplicates.
1630
+
1631
+ Parameters:
1632
+ df1: First DataFrame.
1633
+ df2: Second DataFrame.
1634
+
1635
+ Returns:
1636
+ A new DataFrame with the union of df1 and df2, without duplicates.
1637
+
1638
+ Raises:
1639
+ ValueError: If the DataFrames do not have the same columns.
1640
+ """
1641
+ if set(df1.columns) != set(df2.columns):
1642
+ raise ValueError("Both DataFrames must have the same columns for a union join")
1643
+
1644
+ result_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
1645
+ return result_df
1646
+
1647
+
1648
+ def bag_union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
1649
+ """
1650
+ Perform a bag union join, concatenating the two DataFrames without dropping duplicates.
1651
+
1652
+ Parameters:
1653
+ df1: First DataFrame.
1654
+ df2: Second DataFrame.
1655
+
1656
+ Returns:
1657
+ A new DataFrame with the concatenated data of df1 and df2.
1658
+
1659
+ Raises:
1660
+ ValueError: If the DataFrames do not have the same columns.
1661
+ """
1662
+ if set(df1.columns) != set(df2.columns):
1663
+ raise ValueError("Both DataFrames must have the same columns for a bag union join")
1664
+
1665
+ result_df = pd.concat([df1, df2], ignore_index=True)
1666
+ return result_df
1667
+
1668
+
1669
+ def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
1670
+ """
1671
+ Perform a left join on two DataFrames.
1672
+
1673
+ Parameters:
1674
+ df1: The left DataFrame.
1675
+ df2: The right DataFrame.
1676
+ left_on: Column name in df1 to join on.
1677
+ right_on: Column name in df2 to join on.
1678
+
1679
+ Returns:
1680
+ A new DataFrame as the result of a left join.
1681
+ """
1682
+ return df1.merge(df2, how='left', left_on=left_on, right_on=right_on)
1683
+
1684
+
1685
+ def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
1686
+ """
1687
+ Perform a right join on two DataFrames.
1688
+
1689
+ Parameters:
1690
+ df1: The left DataFrame.
1691
+ df2: The right DataFrame.
1692
+ left_on: Column name in df1 to join on.
1693
+ right_on: Column name in df2 to join on.
1694
+
1695
+ Returns:
1696
+ A new DataFrame as the result of a right join.
1697
+ """
1698
+ return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)