rgwfuncs 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rgwfuncs/__init__.py CHANGED
@@ -1,4 +1,5 @@
  # This file is automatically generated
  # Dynamically importing functions from modules

- from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
+ from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
+ from .str_lib import send_telegram_message
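The net change to the public API: `insert_dataframe_in_sqlite_database` is now exported from `df_lib`, and a new `str_lib` module contributes `send_telegram_message`. A minimal import check (the DataFrame contents and preset name here are hypothetical, not part of the diff):

    import pandas as pd
    from rgwfuncs import insert_dataframe_in_sqlite_database, send_telegram_message

    df = pd.DataFrame({'ID': [1, 2], 'Name': ['Alice', 'Bob']})
    insert_dataframe_in_sqlite_database('demo.db', 'people', df)  # appends, creating the table if absent
    send_telegram_message('my_tg_bot', 'demo.db updated')  # needs a preset in ~/.rgwfuncsrc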
rgwfuncs/df_lib.py CHANGED
@@ -28,6 +28,7 @@ import warnings
  # Suppress all FutureWarnings
  warnings.filterwarnings("ignore", category=FutureWarning)

+
  def docs(method_type_filter: Optional[str] = None) -> None:
  """
  Print a list of function names in alphabetical order. If method_type_filter
@@ -65,7 +66,11 @@ def docs(method_type_filter: Optional[str] = None) -> None:
  print(f"\n{name}:\n{docstring}")


- def numeric_clean(df: pd.DataFrame, column_names: str, column_type: str, irregular_value_treatment: str) -> pd.DataFrame:
+ def numeric_clean(
+ df: pd.DataFrame,
+ column_names: str,
+ column_type: str,
+ irregular_value_treatment: str) -> pd.DataFrame:
  """
  Cleans the numeric columns based on specified treatments.

@@ -296,7 +301,9 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
  return df.drop_duplicates(keep='first')


- def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
+ def drop_duplicates_retain_first(
+ df: pd.DataFrame,
+ columns: Optional[str] = None) -> pd.DataFrame:
  """
  Drop duplicate rows in the DataFrame based on specified columns, retaining the first occurrence.

@@ -318,7 +325,9 @@ def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None
  return df.drop_duplicates(subset=columns_list, keep='first')


- def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
+ def drop_duplicates_retain_last(
+ df: pd.DataFrame,
+ columns: Optional[str] = None) -> pd.DataFrame:
  """
  Drop duplicate rows in the DataFrame based on specified columns, retaining the last occurrence.

@@ -335,20 +344,18 @@ def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None)
  if df is None:
  raise ValueError("DataFrame is not initialized.")

- columns_list = [col.strip() for col in columns.split(',')] if columns else None
+ columns_list = [col.strip()
+ for col in columns.split(',')] if columns else None
  return df.drop_duplicates(subset=columns_list, keep='last')


- def load_data_from_query(db_preset_name: str, query: str, config_file_name: str = "rgwml.config") -> pd.DataFrame:
+ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
  """
- Load data from a database query into a DataFrame based on a configuration
- preset.
+ Load data from a database query into a DataFrame based on a configuration preset.

  Parameters:
  db_preset_name: The name of the database preset in the configuration file.
  query: The SQL query to execute.
- config_file_name: Name of the configuration file
- (default: 'rgwml.config').

  Returns:
  A DataFrame containing the query result.
@@ -358,17 +365,6 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
  ValueError: If the database preset or db_type is invalid.
  """

- def locate_config_file(filename: str = config_file_name) -> str:
- home_dir = os.path.expanduser("~")
- search_paths = [os.path.join(home_dir, "Desktop"), os.path.join(home_dir, "Documents"), os.path.join(home_dir, "Downloads"),]
-
- for path in search_paths:
- for root, dirs, files in os.walk(path):
- if filename in files:
- return os.path.join(root, filename)
- raise FileNotFoundError(
- f"{filename} not found in Desktop, Documents, or Downloads folders")
-
  def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
  server = db_preset['host']
  user = db_preset['username']
@@ -393,12 +389,13 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
  with conn.cursor() as cursor:
  cursor.execute(query)
  rows = cursor.fetchall()
- columns = ([desc[0] for desc in cursor.description] if cursor.description else [])
+ columns = ([desc[0] for desc in cursor.description]
+ if cursor.description else [])

  return pd.DataFrame(rows, columns=columns)

- def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
-
+ def query_clickhouse(
+ db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
  host = db_preset['host']
  user = db_preset['username']
  password = db_preset['password']
@@ -409,7 +406,8 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str

  for attempt in range(max_retries):
  try:
- client = clickhouse_connect.get_client(host=host, port='8123', username=user, password=password, database=database)
+ client = clickhouse_connect.get_client(
+ host=host, port='8123', username=user, password=password, database=database)
  data = client.query(query)
  rows = data.result_rows
  columns = data.column_names
@@ -423,11 +421,13 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
  raise ConnectionError(
  "All attempts to connect to ClickHouse failed.")

- def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
+ def query_google_big_query(
+ db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
  json_file_path = db_preset['json_file_path']
  project_id = db_preset['project_id']

- credentials = service_account.Credentials.from_service_account_file(json_file_path)
+ credentials = service_account.Credentials.from_service_account_file(
+ json_file_path)
  client = bigquery.Client(credentials=credentials, project=project_id)

  query_job = client.query(query)
@@ -437,13 +437,15 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str

  return pd.DataFrame(rows, columns=columns)

- # Read the configuration file to get the database preset
- config_path = locate_config_file()
+ # Assume the configuration file is located at ~/.rgwfuncsrc
+ config_path = os.path.expanduser('~/.rgwfuncsrc')
  with open(config_path, 'r') as f:
  config = json.load(f)

  db_presets = config.get('db_presets', [])
- db_preset = next((preset for preset in db_presets if preset['name'] == db_preset_name), None)
+ db_preset = next(
+ (preset for preset in db_presets if preset['name'] == db_preset_name),
+ None)
  if not db_preset:
  raise ValueError(f"No matching db_preset found for {db_preset_name}")

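Because the config lookup no longer walks Desktop, Documents, and Downloads for `rgwml.config`, callers must have `~/.rgwfuncsrc` in place before any query runs. A minimal sketch of setting one up and using it; the preset name, the `db_type` value, and the credentials are placeholder assumptions, only the key names mirror what the code above reads:

    import json
    import os

    # Hypothetical single-preset config; keys mirror those read above
    # (db_presets, name, host, username, password, database).
    config = {"db_presets": [{"name": "MyDBPreset", "db_type": "mssql",
                              "host": "db.example.com", "username": "user",
                              "password": "secret", "database": "mydb"}]}
    with open(os.path.expanduser("~/.rgwfuncsrc"), "w") as f:
        json.dump(config, f)

    from rgwfuncs import load_data_from_query
    df = load_data_from_query(db_preset_name="MyDBPreset",
                              query="SELECT * FROM my_table")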
@@ -621,10 +623,20 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
  for column in columns:
  if column in df.columns:
  frequency = df[column].astype(str).value_counts(dropna=False)
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+ frequency = frequency.rename(
+ index={
+ 'nan': 'NaN',
+ 'NaT': 'NaT',
+ 'None': 'None',
+ '': 'Empty'})
  top_n_values = frequency.nlargest(n)
- report[column] = {str(value): str(count) for value, count in top_n_values.items()}
- print(f"Top {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
+ report[column] = {str(value): str(count)
+ for value, count in top_n_values.items()}
+ print(
+ f"Top {n} unique values for column '{column}':\n{
+ json.dumps(
+ report[column],
+ indent=2)}\n")
  else:
  print(f"Column '{column}' does not exist in the DataFrame.")
  else:
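Note that the reflowed print call above splits a single-quoted f-string across physical lines, which appears to parse only on Python 3.12+ (PEP 701); on older interpreters importing this module would raise a SyntaxError. A minimal usage sketch for the function itself (the DataFrame is hypothetical):

    import pandas as pd
    from rgwfuncs import top_n_unique_values

    df = pd.DataFrame({'city': ['Pune', 'Pune', 'Agra', None]})
    top_n_unique_values(df, 2, ['city'])  # prints a JSON frequency report; missing values appear as 'NaN'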
@@ -634,7 +646,10 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
  gc.collect()


- def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
+ def bottom_n_unique_values(
+ df: pd.DataFrame,
+ n: int,
+ columns: List[str]) -> None:
  """
  Print the bottom `n` unique values for specified columns in the DataFrame.

@@ -654,12 +669,21 @@ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
  for column in columns:
  if column in df.columns:
  frequency = df[column].astype(str).value_counts(dropna=False)
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+ frequency = frequency.rename(
+ index={
+ 'nan': 'NaN',
+ 'NaT': 'NaT',
+ 'None': 'None',
+ '': 'Empty'})
  bottom_n_values = frequency.nsmallest(n)
  report[column] = {
  str(value): str(count) for value,
  count in bottom_n_values.items()}
- print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
+ print(
+ f"Bottom {n} unique values for column '{column}':\n{
+ json.dumps(
+ report[column],
+ indent=2)}\n")
  else:
  print(f"Column '{column}' does not exist in the DataFrame.")
  else:
@@ -669,7 +693,8 @@ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
  gc.collect()


- def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
+ def print_correlation(
+ df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
  """
  Print correlation for multiple pairs of columns in the DataFrame.

@@ -688,13 +713,16 @@ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) ->

  correlation = numeric_col1.corr(numeric_col2)
  if pd.notnull(correlation):
- print(f"The correlation between '{col1}' and '{col2}' is {correlation}.")
+ print(
+ f"The correlation between '{col1}' and '{col2}' is {correlation}.")
  else:
- print(f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
+ print(
+ f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
  except Exception as e:
  print(f"Error processing cols '{col1}' and '{col2}': {e}")
  else:
- print(f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
+ print(
+ f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
  else:
  print("The DataFrame is empty.")

@@ -714,7 +742,8 @@ def print_memory_usage(df: pd.DataFrame) -> None:
  - ValueError: If the DataFrame is `None`.
  """
  if df is not None:
- memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)  # Convert bytes to MB
+ memory_usage = df.memory_usage(deep=True).sum(
+ ) / (1024 * 1024)  # Convert bytes to MB
  print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
  else:
  raise ValueError("No DataFrame to print. Please provide a DataFrame.")
@@ -795,7 +824,8 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
  """
  if df is not None:
  print(df)
- columns_with_types = [f"{col} ({df[col].dtypes})" for col in df.columns]
+ columns_with_types = [
+ f"{col} ({df[col].dtypes})" for col in df.columns]
  print("Columns:", columns_with_types)
  if source:
  print(f"Source: {source}")
@@ -811,48 +841,53 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option

  Parameters:
  df: The DataFrame to send.
- bot_name: The name of the Telegram bot as specified in the configuration.
- message: Custom message to send along with the DataFrame or file.
- as_file: Boolean flag to decide whether to send the DataFrame as a file or as text.
- remove_after_send: If True, removes the file after sending.
- """
+ bot_name: The name of the Telegram bot as specified in the configuration file.
+ message: Custom message to send along with the DataFrame or file. Defaults to None.
+ as_file: Boolean flag to indicate whether the DataFrame should be sent as a file (True) or as text (False). Defaults to True.
+ remove_after_send: If True, removes the CSV file after sending. Defaults to True.

- def locate_config_file(filename: str = "rgwml.config") -> str:
- """Retrieve the configuration file path."""
- home_dir = os.path.expanduser("~")
- search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+ Raises:
+ ValueError: If the specified bot is not found or if no DataFrame is provided.
+ Exception: If the message sending fails.

- for path in search_paths:
- for root, _, files in os.walk(path):
- if filename in files:
- return os.path.join(root, filename)
- raise FileNotFoundError(
- f"{filename} not found in Desktop, Documents, or Downloads")
+ Notes:
+ The configuration file is assumed to be located at `~/.rgwfuncsrc`.
+ """

  def get_config(config_path: str) -> dict:
- """Load configuration from a json file."""
+ """Load configuration from a JSON file."""
  with open(config_path, 'r') as file:
  return json.load(file)

- config_path = locate_config_file()
+ # Assume the configuration file is located at ~/.rgwfuncsrc
+ config_path = os.path.expanduser('~/.rgwfuncsrc')
  config = get_config(config_path)
- bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)

+ bot_config = next(
+ (bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name),
+ None)
  if not bot_config:
  raise ValueError(f"No bot found with the name {bot_name}")

  if df is None:
  raise ValueError("No DataFrame to send. Please provide a DataFrame.")

+ response = None
  if as_file:
  timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
  file_name = f"df_{timestamp}.csv"
  df.to_csv(file_name, index=False)
  try:
  with open(file_name, 'rb') as file:
- payload = {'chat_id': bot_config['chat_id'], 'caption': message or ''}
+ payload = {
+ 'chat_id': bot_config['chat_id'],
+ 'caption': message or ''}
  files = {'document': file}
- response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendDocument", data=payload, files=files)
+ response = requests.post(
+ f"https://api.telegram.org/bot{
+ bot_config['bot_token']}/sendDocument",
+ data=payload,
+ files=files)
  if remove_after_send and os.path.exists(file_name):
  os.remove(file_name)
  except Exception as e:
@@ -862,40 +897,45 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
  df_str = df.to_string()
  payload = {
  'chat_id': bot_config['chat_id'],
- 'text': message + "\n\n" + df_str if message else df_str,
- 'parse_mode': 'HTML'}
- response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
+ 'text': (message + "\n\n" + df_str) if message else df_str,
+ 'parse_mode': 'HTML'
+ }
+ response = requests.post(
+ f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)

- if not response.ok:
+ if response and not response.ok:
  raise Exception(f"Error sending message: {response.text}")

  print("Message sent successfully.")


- def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subject: Optional[str] = None, body: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
+ def send_data_to_email(
+ df: pd.DataFrame,
+ preset_name: str,
+ to_email: str,
+ subject: Optional[str] = None,
+ body: Optional[str] = None,
+ as_file: bool = True,
+ remove_after_send: bool = True) -> None:
  """
- Send an email with optional DataFrame attachment using Gmail API via a specified preset.
+ Send an email with an optional DataFrame attachment using the Gmail API via a specified preset.

  Parameters:
  df: The DataFrame to send.
  preset_name: The configuration preset name to use for sending the email.
  to_email: The recipient email address.
- subject: Optional subject of the email.
- body: Optional message body of the email.
- as_file: Boolean flag to decide whether to send the DataFrame as a file.
- remove_after_send: If True, removes the CSV file after sending.
- """
+ subject: Optional subject of the email. Defaults to 'DataFrame CSV File' if not given.
+ body: Optional message body of the email. Defaults to 'Please find the CSV file attached.' if not given.
+ as_file: Boolean flag to decide whether to send the DataFrame as a file (True) or embed it in the email (False). Defaults to True.
+ remove_after_send: If True, removes the CSV file after sending. Defaults to True.

- def locate_config_file(filename: str = "rgwml.config") -> str:
- """Locate config file in common user directories."""
- home_dir = os.path.expanduser("~")
- search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+ Raises:
+ ValueError: If the preset is not found in the configuration.
+ Exception: If the email preparation or sending fails.

- for path in search_paths:
- for root, _, files in os.walk(path):
- if filename in files:
- return os.path.join(root, filename)
- raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
+ Notes:
+ The configuration file is assumed to be located at `~/.rgwfuncsrc`.
+ """

  def get_config(config_path: str) -> dict:
  with open(config_path, 'r') as file:
@@ -914,12 +954,14 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
  )
  return build('gmail', 'v1', credentials=credentials)

- # Load configuration
- config_path = locate_config_file()
+ # Load configuration from ~/.rgwfuncsrc
+ config_path = os.path.expanduser('~/.rgwfuncsrc')
  config = get_config(config_path)

  # Retrieve Gmail preset configuration
- gmail_config = next((preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name), None)
+ gmail_config = next(
+ (preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name),
+ None)

  if not gmail_config:
  raise ValueError(f"No preset found with the name {preset_name}")
@@ -942,13 +984,18 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
  message['to'] = to_email
  message['from'] = sender_email
  message['subject'] = subject if subject else 'DataFrame CSV File'
- message.attach(MIMEText(body if body else 'Please find the CSV file attached.'))
+ message.attach(
+ MIMEText(
+ body if body else 'Please find the CSV file attached.'))

  with open(tmp_file_name, 'rb') as file:
  part = MIMEBase('application', 'octet-stream')
  part.set_payload(file.read())
  encoders.encode_base64(part)
- part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(tmp_file_name)}')
+ part.add_header(
+ 'Content-Disposition',
+ f'attachment; filename={
+ os.path.basename(tmp_file_name)}')
  message.attach(part)

  if remove_after_send and os.path.exists(tmp_file_name):
@@ -970,46 +1017,49 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
  try:
  raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
  email_body = {'raw': raw}
- sent_message = service.users().messages().send(userId="me", body=email_body).execute()
+ sent_message = service.users().messages().send(
+ userId="me", body=email_body).execute()
  print(f"Email with Message Id {sent_message['id']} successfully sent.")
  except Exception as error:
  raise Exception(f"Error sending email: {error}")


- def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
+ def send_data_to_slack(
+ df: pd.DataFrame,
+ bot_name: str,
+ message: Optional[str] = None,
+ as_file: bool = True,
+ remove_after_send: bool = True) -> None:
  """
  Send a DataFrame or message to Slack using a specified bot configuration.

  Parameters:
  df: The DataFrame to send.
  bot_name: The Slack bot configuration preset name.
- message: Custom message to send along with the DataFrame or file.
- as_file: Boolean flag to decide whether to send the DataFrame as a file.
- remove_after_send: If True, removes the CSV file after sending.
- """
+ message: Custom message to send along with the DataFrame or file. Defaults to None.
+ as_file: Boolean flag to decide whether to send the DataFrame as a file (True) or as text (False). Defaults to True.
+ remove_after_send: If True, removes the CSV file after sending. Defaults to True.

- def locate_config_file(filename: str = "rgwml.config") -> str:
- """Locate config file in common user directories."""
- home_dir = os.path.expanduser("~")
- search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
+ Raises:
+ ValueError: If the specified bot is not found in the configuration.
+ Exception: If the message sending fails.

- for path in search_paths:
- for root, _, files in os.walk(path):
- if filename in files:
- return os.path.join(root, filename)
- raise FileNotFoundError(
- f"{filename} not found in Desktop, Documents, or Downloads folders")
+ Notes:
+ The configuration file is assumed to be located at `~/.rgwfuncsrc`.
+ """

  def get_config(config_path: str) -> dict:
  """Load configuration from a JSON file."""
  with open(config_path, 'r') as file:
  return json.load(file)

- # Load the Slack configuration
- config_path = locate_config_file()
+ # Load the Slack configuration from ~/.rgwfuncsrc
+ config_path = os.path.expanduser('~/.rgwfuncsrc')
  config = get_config(config_path)

- bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
+ bot_config = next(
+ (bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name),
+ None)

  if not bot_config:
  raise ValueError(f"No bot found with the name {bot_name}")
@@ -1024,13 +1074,22 @@ def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] =

  try:
  with open(file_name, 'rb') as file:
- response = client.files_upload(channels=bot_config['channel_id'], file=file, filename=os.path.basename(file_name), title="DataFrame Upload", initial_comment=message or '')
+ response = client.files_upload(
+ channels=bot_config['channel_id'],
+ file=file,
+ filename=os.path.basename(file_name),
+ title="DataFrame Upload",
+ initial_comment=message or ''
+ )
  finally:
  if remove_after_send and os.path.exists(file_name):
  os.remove(file_name)
  else:
  df_str = df.to_string()
- response = client.chat_postMessage(channel=bot_config['channel_id'], text=(message + "\n\n" + df_str) if message else df_str)
+ response = client.chat_postMessage(
+ channel=bot_config['channel_id'],
+ text=(message + "\n\n" + df_str) if message else df_str
+ )

  # Check if the message was sent successfully
  if not response["ok"]:
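A minimal usage sketch for the messaging helpers above, assuming `~/.rgwfuncsrc` contains `telegram_bot_presets` and `slack_bot_presets` entries with the hypothetical names used here:

    import pandas as pd
    from rgwfuncs import send_dataframe_via_telegram, send_data_to_slack

    df = pd.DataFrame({'metric': ['rows'], 'value': [42]})
    send_dataframe_via_telegram(df, 'my_tg_bot', message='Daily report', as_file=True)
    send_data_to_slack(df, 'my_slack_bot', message='Daily report', as_file=False)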
@@ -1087,7 +1146,11 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
  return df[new_order]


- def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+ def append_ranged_classification_column(
+ df: pd.DataFrame,
+ ranges: str,
+ target_col: str,
+ new_col_name: str) -> pd.DataFrame:
  """
  Append a ranged classification column to the DataFrame.

@@ -1155,16 +1218,27 @@ def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_co
  for r in range_list
  )

- labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
+ labels = [f"{pad_number(range_list[i],
+ max_integer_length)} to {pad_number(range_list[i + 1],
+ max_integer_length)}" for i in range(len(range_list) - 1)]

  # Ensure the target column is numeric
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
- df[new_col_name] = pd.cut(df[target_col], bins=range_list, labels=labels, right=False, include_lowest=True)
+ df[new_col_name] = pd.cut(
+ df[target_col],
+ bins=range_list,
+ labels=labels,
+ right=False,
+ include_lowest=True)

  return df


- def append_percentile_classification_column(df: pd.DataFrame, percentiles: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+ def append_percentile_classification_column(
+ df: pd.DataFrame,
+ percentiles: str,
+ target_col: str,
+ new_col_name: str) -> pd.DataFrame:
  """
  Append a percentile classification column to the DataFrame.

@@ -1192,14 +1266,21 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str,

  if has_decimals:
  percentiles_list = [float(p) for p in percentiles_list]
- max_decimal_length = max(len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
- max_integer_length = max(len(str(int(float(p)))) for p in percentiles_list)
+ max_decimal_length = max(
+ len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
+ max_integer_length = max(len(str(int(float(p))))
+ for p in percentiles_list)

  labels = []

  for i in range(len(percentiles_list) - 1):
- start = pad_number(percentiles_list[i], max_integer_length, max_decimal_length, decimal=True)
- end = pad_number(percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)
+ start = pad_number(
+ percentiles_list[i],
+ max_integer_length,
+ max_decimal_length,
+ decimal=True)
+ end = pad_number(
+ percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)

  label = f"{start} to {end}"
  labels.append(label)
@@ -1222,12 +1303,20 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str,
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
  quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]

- df[new_col_name] = pd.cut(df[target_col], bins=quantiles, labels=labels, include_lowest=True)
+ df[new_col_name] = pd.cut(
+ df[target_col],
+ bins=quantiles,
+ labels=labels,
+ include_lowest=True)

  return df


- def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+ def append_ranged_date_classification_column(
+ df: pd.DataFrame,
+ date_ranges: str,
+ target_col: str,
+ new_col_name: str) -> pd.DataFrame:
  """
  Append a ranged date classification column to the DataFrame.

@@ -1260,7 +1349,9 @@ def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str,
  return df


- def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFrame:
+ def rename_columns(df: pd.DataFrame,
+ rename_pairs: Dict[str,
+ str]) -> pd.DataFrame:
  """
  Rename columns in the DataFrame.

@@ -1272,7 +1363,8 @@ def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFra
  A new DataFrame with columns renamed.
  """
  if df is None:
- raise ValueError("No DataFrame to rename columns. Please provide a valid DataFrame.")
+ raise ValueError(
+ "No DataFrame to rename columns. Please provide a valid DataFrame.")

  return df.rename(columns=rename_pairs)

@@ -1290,7 +1382,8 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
  A new DataFrame sorted by specified columns.
  """
  if df is None:
- raise ValueError("No DataFrame to sort. Please provide a valid DataFrame.")
+ raise ValueError(
+ "No DataFrame to sort. Please provide a valid DataFrame.")

  col_names = []
  asc_order = []
@@ -1325,7 +1418,8 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
  A new DataFrame with XGB_TYPE labels appended.
  """
  if df is None:
- raise ValueError("No DataFrame to add labels. Please provide a valid DataFrame.")
+ raise ValueError(
+ "No DataFrame to add labels. Please provide a valid DataFrame.")

  ratios = list(map(int, ratio_str.split(':')))
  total_ratio = sum(ratios)
@@ -1342,7 +1436,8 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
  labels = ['TRAIN'] * train_rows + ['VALIDATE'] * \
  validate_rows + ['TEST'] * test_rows
  else:
- raise ValueError("Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
+ raise ValueError(
+ "Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")

  df_with_labels = df.copy()
  df_with_labels['XGB_TYPE'] = labels
@@ -1350,7 +1445,13 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
  return df_with_labels


- def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
+ def append_xgb_regression_predictions(
+ df: pd.DataFrame,
+ target_col: str,
+ feature_cols: str,
+ pred_col: str,
+ boosting_rounds: int = 100,
+ model_path: Optional[str] = None) -> pd.DataFrame:
  """
  Append XGB regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.

@@ -1366,7 +1467,8 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
  DataFrame with predictions appended.
  """
  if df is None or 'XGB_TYPE' not in df.columns:
- raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
+ raise ValueError(
+ "DataFrame is not initialized or 'XGB_TYPE' column is missing.")

  features = feature_cols.replace(' ', '').split(',')

@@ -1382,16 +1484,27 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
  else:
  validate_data = None

- dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+ dtrain = xgb.DMatrix(
+ train_data[features],
+ label=train_data[target_col],
+ enable_categorical=True)
  evals = [(dtrain, 'train')]

  if validate_data is not None:
- dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+ dvalidate = xgb.DMatrix(
+ validate_data[features],
+ label=validate_data[target_col],
+ enable_categorical=True)
  evals.append((dvalidate, 'validate'))

  params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse'}

- model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+ model = xgb.train(
+ params,
+ dtrain,
+ num_boost_round=boosting_rounds,
+ evals=evals,
+ early_stopping_rounds=10 if validate_data is not None else None)

  # Make predictions for all data
  dall = xgb.DMatrix(df[features], enable_categorical=True)
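The XGBoost helpers compose: `append_xgb_labels` assigns TRAIN/VALIDATE/TEST labels by ratio, and the prediction helpers train on those subsets and append a prediction column. A minimal sketch on synthetic data (column names are hypothetical):

    import pandas as pd
    from rgwfuncs import append_xgb_labels, append_xgb_regression_predictions

    df = pd.DataFrame({'f1': range(100), 'f2': range(100, 200),
                       'target': [2 * i + 1 for i in range(100)]})
    df = append_xgb_labels(df, '7:2:1')  # adds the XGB_TYPE column
    df = append_xgb_regression_predictions(df, 'target', 'f1,f2', 'pred',
                                           boosting_rounds=50)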
@@ -1400,13 +1513,20 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
  if model_path:
  model.save_model(model_path)

- columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+ columns_order = [col for col in df.columns if col not in [
+ 'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
  df = df[columns_order]

  return df


- def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
+ def append_xgb_logistic_regression_predictions(
+ df: pd.DataFrame,
+ target_col: str,
+ feature_cols: str,
+ pred_col: str,
+ boosting_rounds: int = 100,
+ model_path: Optional[str] = None) -> pd.DataFrame:
  """
  Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.

@@ -1438,16 +1558,27 @@ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str
  if 'VALIDATE' in df['XGB_TYPE'].values:
  validate_data = df[df['XGB_TYPE'] == 'VALIDATE']

- dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+ dtrain = xgb.DMatrix(
+ train_data[features],
+ label=train_data[target_col],
+ enable_categorical=True)
  evals = [(dtrain, 'train')]

  if validate_data is not None:
- dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+ dvalidate = xgb.DMatrix(
+ validate_data[features],
+ label=validate_data[target_col],
+ enable_categorical=True)
  evals.append((dvalidate, 'validate'))

  params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}

- model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+ model = xgb.train(
+ params,
+ dtrain,
+ num_boost_round=boosting_rounds,
+ evals=evals,
+ early_stopping_rounds=10 if validate_data is not None else None)

  # Make predictions for all data
  dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1456,13 +1587,18 @@ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str
  if model_path:
  model.save_model(model_path)

- columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+ columns_order = [col for col in df.columns if col not in [
+ 'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
  df = df[columns_order]

  return df


- def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
+ def print_n_frequency_cascading(
+ df: pd.DataFrame,
+ n: int,
+ columns: str,
+ order_by: str = "FREQ_DESC") -> None:
  """
  Print the cascading frequency of top n values for specified columns.

@@ -1485,7 +1621,12 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
  # Convert the column to string representation
  df[current_col] = df[current_col].astype(str)
  frequency = df[current_col].value_counts(dropna=False)
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+ frequency = frequency.rename(
+ index={
+ 'nan': 'NaN',
+ 'NaT': 'NaT',
+ 'None': 'None',
+ '': 'Empty'})

  if limit is not None:
  frequency = frequency.nlargest(limit)
@@ -1500,8 +1641,11 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
  filtered_df = df[df[current_col] == value]

  if len(columns) > 1:
- sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
- report[value] = {"count": str(count), f"sub_distribution({columns[1]})": sub_report if sub_report else {}}
+ sub_report = generate_cascade_report(
+ filtered_df, columns[1:], limit, order_by)
+ report[value] = {
+ "count": str(count), f"sub_distribution({
+ columns[1]})": sub_report if sub_report else {}}
  else:
  report[value] = {"count": str(count)}

@@ -1511,17 +1655,29 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
  if order_by == "ASC":
  return dict(sorted(frequency.items(), key=lambda item: item[0]))
  elif order_by == "DESC":
- return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+ return dict(
+ sorted(
+ frequency.items(),
+ key=lambda item: item[0],
+ reverse=True))
  elif order_by == "FREQ_ASC":
  return dict(sorted(frequency.items(), key=lambda item: item[1]))
  else:  # Default to "FREQ_DESC"
- return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+ return dict(
+ sorted(
+ frequency.items(),
+ key=lambda item: item[1],
+ reverse=True))

  report = generate_cascade_report(df, columns, n, order_by)
  print(json.dumps(report, indent=2))


- def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
+ def print_n_frequency_linear(
+ df: pd.DataFrame,
+ n: int,
+ columns: str,
+ order_by: str = "FREQ_DESC") -> None:
  """
  Print the linear frequency of top n values for specified columns.

@@ -1541,13 +1697,19 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: s
  continue

  frequency = df[current_col].astype(str).value_counts(dropna=False)
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+ frequency = frequency.rename(
+ index={
+ 'nan': 'NaN',
+ 'NaT': 'NaT',
+ 'None': 'None',
+ '': 'Empty'})

  if limit is not None:
  frequency = frequency.nlargest(limit)

  sorted_frequency = sort_frequency(frequency, order_by)
- col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
+ col_report = {str(value): str(count)
+ for value, count in sorted_frequency.items()}
  report[current_col] = col_report

  return report
@@ -1556,17 +1718,27 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: s
  if order_by == "ASC":
  return dict(sorted(frequency.items(), key=lambda item: item[0]))
  elif order_by == "DESC":
- return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+ return dict(
+ sorted(
+ frequency.items(),
+ key=lambda item: item[0],
+ reverse=True))
  elif order_by == "FREQ_ASC":
  return dict(sorted(frequency.items(), key=lambda item: item[1]))
  else:  # Default to "FREQ_DESC"
- return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+ return dict(
+ sorted(
+ frequency.items(),
+ key=lambda item: item[1],
+ reverse=True))

  report = generate_linear_report(df, columns, n, order_by)
  print(json.dumps(report, indent=2))


- def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
+ def retain_columns(
+ df: pd.DataFrame,
+ columns_to_retain: List[str]) -> pd.DataFrame:
  """
  Retain specified columns in the DataFrame and drop the others.

@@ -1582,7 +1754,10 @@ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFra
  return df[columns_to_retain]


- def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+ def mask_against_dataframe(
+ df: pd.DataFrame,
+ other_df: pd.DataFrame,
+ column_name: str) -> pd.DataFrame:
  """
  Retain only rows with common column values between two DataFrames.

@@ -1599,7 +1774,10 @@ def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name
  return df[df[column_name].isin(other_df[column_name])]


- def mask_against_dataframe_converse(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+ def mask_against_dataframe_converse(
+ df: pd.DataFrame,
+ other_df: pd.DataFrame,
+ column_name: str) -> pd.DataFrame:
  """
  Retain only rows with uncommon column values between two DataFrames.

@@ -1633,7 +1811,8 @@ def union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
  ValueError: If the DataFrames do not have the same columns.
  """
  if set(df1.columns) != set(df2.columns):
- raise ValueError("Both DataFrames must have the same columns for a union join")
+ raise ValueError(
+ "Both DataFrames must have the same columns for a union join")

  result_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
  return result_df
@@ -1654,13 +1833,18 @@ def bag_union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
  ValueError: If the DataFrames do not have the same columns.
  """
  if set(df1.columns) != set(df2.columns):
- raise ValueError("Both DataFrames must have the same columns for a bag union join")
+ raise ValueError(
+ "Both DataFrames must have the same columns for a bag union join")

  result_df = pd.concat([df1, df2], ignore_index=True)
  return result_df


- def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
+ def left_join(
+ df1: pd.DataFrame,
+ df2: pd.DataFrame,
+ left_on: str,
+ right_on: str) -> pd.DataFrame:
  """
  Perform a left join on two DataFrames.

@@ -1676,7 +1860,11 @@ def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str)
  return df1.merge(df2, how='left', left_on=left_on, right_on=right_on)


- def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
+ def right_join(
+ df1: pd.DataFrame,
+ df2: pd.DataFrame,
+ left_on: str,
+ right_on: str) -> pd.DataFrame:
  """
  Perform a right join on two DataFrames.

@@ -1692,7 +1880,72 @@ def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str
  return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)


- def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
+ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
+ """
+ Inserts a Pandas DataFrame into a SQLite database table.
+
+ Parameters:
+ db_path: str
+ The file path to the SQLite database. If the database does not exist,
+ it will be created.
+
+ tablename: str
+ The name of the table where the data will be inserted. If the table does
+ not exist, it will be created based on the DataFrame's columns and types.
+
+ df: pd.DataFrame
+ The DataFrame containing the data to be inserted into the database.
+
+ Functionality:
+ - Checks if the specified table exists in the database.
+ - Creates the table with appropriate column types if it doesn't exist.
+ - Inserts the DataFrame's data into the table, appending to any existing data.
+
+ Data Type Mapping:
+ - Converts Pandas data types to SQLite types: 'int64' to 'INTEGER',
+ 'float64' to 'REAL', 'object' to 'TEXT', 'datetime64[ns]' to 'TEXT',
+ and 'bool' to 'INTEGER'.
+
+ Returns:
+ None
+ """
+
+ def table_exists(cursor, table_name):
+ cursor.execute(
+ f"SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{table_name}'")
+ return cursor.fetchone()[0] == 1
+
+ dtype_mapping = {
+ 'int64': 'INTEGER',
+ 'float64': 'REAL',
+ 'object': 'TEXT',
+ 'datetime64[ns]': 'TEXT',
+ 'bool': 'INTEGER',
+ }
+
+ def map_dtype(dtype):
+ return dtype_mapping.get(str(dtype), 'TEXT')
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+
+ if not table_exists(cursor, tablename):
+ columns_with_types = ', '.join(
+ f'"{col}" {
+ map_dtype(dtype)}' for col,
+ dtype in zip(
+ df.columns,
+ df.dtypes))
+ create_table_query = f'CREATE TABLE "{tablename}" ({columns_with_types})'
+ conn.execute(create_table_query)
+
+ df.to_sql(tablename, conn, if_exists='append', index=False)
+
+
+ def sync_dataframe_to_sqlite_database(
+ db_path: str,
+ tablename: str,
+ df: pd.DataFrame) -> None:
  """
  Processes and saves a DataFrame to an SQLite database, adding a timestamp column
  and replacing the existing table if needed. Creates the table if it does not exist.
@@ -1702,6 +1955,10 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataF
  - tablename (str): The name of the table in the database.
  - df (pd.DataFrame): The DataFrame to be processed and saved.
  """
+ # Helper function to map pandas dtype to SQLite type
+ def map_dtype(dtype):
+ return dtype_mapping.get(str(dtype), 'TEXT')
+
  # Step 1: Add a timestamp column to the dataframe
  df['rgwfuncs_sync_timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

@@ -1714,10 +1971,6 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataF
  'bool': 'INTEGER',  # SQLite does not have a separate Boolean storage class
  }

- # Helper function to map pandas dtype to SQLite type
- def map_dtype(dtype):
- return dtype_mapping.get(str(dtype), 'TEXT')
-
  # Step 2: Save df in SQLite3 db as '{tablename}_new'
  with sqlite3.connect(db_path) as conn:
  new_table_name = f"{tablename}_new"
@@ -1728,8 +1981,11 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataF
  if cursor.fetchall() == []:  # Table does not exist
  # Create a table using the DataFrame's column names and types
  columns_with_types = ', '.join(
- f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes)
- )
+ f'"{col}" {
+ map_dtype(dtype)}' for col,
+ dtype in zip(
+ df.columns,
+ df.dtypes))
  create_table_query = f'CREATE TABLE "{new_table_name}" ({columns_with_types})'
  conn.execute(create_table_query)

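The two SQLite helpers now cover complementary cases: `insert_dataframe_in_sqlite_database` appends to (or creates) a table, while `sync_dataframe_to_sqlite_database` adds an `rgwfuncs_sync_timestamp` column and replaces the table. A minimal sketch (file and table names are placeholders):

    import pandas as pd
    from rgwfuncs import (insert_dataframe_in_sqlite_database,
                          sync_dataframe_to_sqlite_database)

    df = pd.DataFrame({'ID': [1, 2], 'Score': [88.5, 92.3]})
    insert_dataframe_in_sqlite_database('my.db', 'scores', df)       # append rows
    sync_dataframe_to_sqlite_database('my.db', 'scores_synced', df)  # timestamp + replace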
rgwfuncs/str_lib.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ import json
+ import requests
+ from typing import Tuple
+
+ def send_telegram_message(preset_name: str, message: str) -> None:
+ """Send a Telegram message using the specified preset.
+
+ Args:
+ preset_name (str): The name of the preset to use for sending the message.
+ message (str): The message to send.
+
+ Raises:
+ RuntimeError: If the preset is not found or necessary details are missing.
+ """
+
+ # Set the config path to ~/.rgwfuncsrc
+ config_path = os.path.expanduser("~/.rgwfuncsrc")
+
+ def load_config() -> dict:
+ """Load the configuration from the .rgwfuncsrc file."""
+ with open(config_path, 'r') as file:
+ return json.load(file)
+
+ def get_telegram_preset(config: dict, preset_name: str) -> dict:
+ """Get the Telegram preset configuration."""
+ presets = config.get("telegram_bot_presets", [])
+ for preset in presets:
+ if preset.get("name") == preset_name:
+ return preset
+ return None
+
+ def get_telegram_bot_details(config: dict, preset_name: str) -> Tuple[str, str]:
+ """Retrieve the Telegram bot token and chat ID from the preset."""
+ preset = get_telegram_preset(config, preset_name)
+ if not preset:
+ raise RuntimeError(f"Telegram bot preset '{preset_name}' not found in the configuration file")
+
+ bot_token = preset.get("bot_token")
+ chat_id = preset.get("chat_id")
+
+ if not bot_token or not chat_id:
+ raise RuntimeError(
+ f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file"
+ )
+
+ return bot_token, chat_id
+
+ # Load the configuration
+ config = load_config()
+
+ # Get bot details from the configuration
+ bot_token, chat_id = get_telegram_bot_details(config, preset_name)
+
+ # Prepare the request
+ url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
+ payload = {"chat_id": chat_id, "text": message}
+
+ # Send the message
+ response = requests.post(url, json=payload)
+ response.raise_for_status()
+
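A minimal usage sketch for the new module, assuming a `telegram_bot_presets` entry named `my_tg_bot` (hypothetical) exists in `~/.rgwfuncsrc`:

    from rgwfuncs import send_telegram_message

    # Raises RuntimeError if the preset or its bot_token/chat_id is missing,
    # and an HTTPError via raise_for_status() if Telegram rejects the call.
    send_telegram_message('my_tg_bot', 'Build finished')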
rgwfuncs-0.0.18.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: rgwfuncs
- Version: 0.0.16
+ Version: 0.0.18
  Summary: A functional programming paradigm for mathematical modelling and data science
  Home-page: https://github.com/ryangerardwilson/rgwfunc
  Author: Ryan Gerard Wilson
@@ -40,9 +40,9 @@ Install the package using:

  --------------------------------------------------------------------------------

- ## Create a `rgwml.config` File
+ ## Create a `.rgwfuncsrc` File

- A `rgwml.config` file (located at `vi ~/Documents/rgwml.config) is required for MSSQL, CLICKHOUSE, MYSQL, GOOGLE BIG QUERY, SLACK, TELEGRAM, and GMAIL integrations.
+ A `.rgwfuncsrc` file (located at `~/.rgwfuncsrc`) is required for MSSQL, CLICKHOUSE, MYSQL, GOOGLE BIG QUERY, SLACK, TELEGRAM, and GMAIL integrations.

  {
  "db_presets" : [
@@ -381,28 +381,30 @@ Drop duplicate rows based on specified columns, retaining the last occurrence.
  --------------------------------------------------------------------------------

  ### 12. `load_data_from_query`
+
  Load data from a database query into a DataFrame based on a configuration preset.

- Parameters:
- - `db_preset_name` (str): Name of the database preset in the config file.
- - query (str): The SQL query to execute.
- - `config_file_name` (str): Name of the configuration file (default: "rgwml.config").
+ - **Parameters:**
+ - `db_preset_name` (str): Name of the database preset in the configuration file.
+ - `query` (str): The SQL query to execute.

- Returns:
- - pd.DataFrame: A DataFrame containing the query result.
+ - **Returns:**
+ - `pd.DataFrame`: A DataFrame containing the query result.

- Example:
-
- from rgwfuncs import load_data_from_query
+ - **Notes:**
+ - The configuration file is assumed to be located at `~/.rgwfuncsrc`.

- df = load_data_from_query(
- db_preset_name="MyDBPreset",
- query="SELECT * FROM my_table",
- config_file_name="rgwml.config"
- )
- print(df)
-
+ - **Example:**
+
+ from rgwfuncs import load_data_from_query
+
+ df = load_data_from_query(
+ db_preset_name="MyDBPreset",
+ query="SELECT * FROM my_table"
+ )
+ print(df)

+
  --------------------------------------------------------------------------------

  ### 13. `load_data_from_path`
@@ -1148,10 +1150,47 @@ Perform a right join on two DataFrames.
  df_right_join = right_join(df1, df2, 'ID', 'ID')
  print(df_right_join)

+ --------------------------------------------------------------------------------
+
+ ### 45. `insert_dataframe_in_sqlite_database`
+
+ Inserts a Pandas DataFrame into a SQLite database table. If the specified table does not exist, it will be created with column types automatically inferred from the DataFrame's data types.
+
+ - **Parameters:**
+ - `db_path` (str): The path to the SQLite database file. If the database does not exist, it will be created.
+ - `tablename` (str): The name of the table in the database. If the table does not exist, it is created with the DataFrame's columns and data types.
+ - `df` (pd.DataFrame): The DataFrame containing the data to be inserted into the database table.
+
+ - **Returns:**
+ - `None`
+
+ - **Notes:**
+ - Data types in the DataFrame are converted to SQLite-compatible types:
+ - `int64` is mapped to `INTEGER`
+ - `float64` is mapped to `REAL`
+ - `object` is mapped to `TEXT`
+ - `datetime64[ns]` is mapped to `TEXT` (dates are stored as text)
+ - `bool` is mapped to `INTEGER` (SQLite does not have a separate Boolean type)
+
+ - **Example:**
+
+ from rgwfuncs import insert_dataframe_in_sqlite_database
+ import pandas as pd
+
+ df = pd.DataFrame({
+ 'ID': [1, 2, 3],
+ 'Name': ['Alice', 'Bob', 'Charlie'],
+ 'Score': [88.5, 92.3, 85.0]
+ })
+
+ db_path = 'my_database.db'
+ tablename = 'students'
+
+ insert_dataframe_in_sqlite_database(db_path, tablename, df)

  --------------------------------------------------------------------------------

- ### 45. `sync_dataframe_to_sqlite_database`
+ ### 46. `sync_dataframe_to_sqlite_database`
  Processes and saves a DataFrame to an SQLite database, adding a timestamp column and replacing the existing table if needed. Creates the table if it does not exist.

  • Parameters:
rgwfuncs-0.0.18.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ rgwfuncs/__init__.py,sha256=XqJ8TJuc4HkQq3T5Gzjf3KTBsdJtyi2NKXBgbPuDn0Y,1156
+ rgwfuncs/df_lib.py,sha256=rY1yVvY04uqR174JwYBFiRnujekr9mbe258wmu9OeeY,67148
+ rgwfuncs/str_lib.py,sha256=6v9AXZ5wWsWVEcvcIz0B1rTmsvYaD-v53r2sYPcV4pU,2109
+ rgwfuncs-0.0.18.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
+ rgwfuncs-0.0.18.dist-info/METADATA,sha256=GfMK-J1vH4CG_fQqQAWwAvDE6JcSqNrKuNKvfOUKV_E,33442
+ rgwfuncs-0.0.18.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ rgwfuncs-0.0.18.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
+ rgwfuncs-0.0.18.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
+ rgwfuncs-0.0.18.dist-info/RECORD,,
rgwfuncs-0.0.16.dist-info/RECORD REMOVED
@@ -1,8 +0,0 @@
- rgwfuncs/__init__.py,sha256=BP8Nh8ivyCCz8Ga-21JW3NWInJFOElKoIfRuioJRWbA,1076
- rgwfuncs/df_lib.py,sha256=OZPI7M35mbue6YsieWmlzjM5RUkaow0v0d3P-V71L6o,63034
- rgwfuncs-0.0.16.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
- rgwfuncs-0.0.16.dist-info/METADATA,sha256=oKTScVPzrgTTWdCQ7vxEdKYRnc-S_90hKwefifayeDU,32059
- rgwfuncs-0.0.16.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- rgwfuncs-0.0.16.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
- rgwfuncs-0.0.16.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
- rgwfuncs-0.0.16.dist-info/RECORD,,