rgwfuncs 0.0.17__tar.gz → 0.0.19__tar.gz

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rgwfuncs
3
- Version: 0.0.17
3
+ Version: 0.0.19
4
4
  Summary: A functional programming paradigm for mathematical modelling and data science
5
5
  Home-page: https://github.com/ryangerardwilson/rgwfunc
6
6
  Author: Ryan Gerard Wilson
@@ -135,11 +135,48 @@ To display all docstrings, use:
135
135
 
136
136
  --------------------------------------------------------------------------------
137
137
 
138
- ## Function References and Syntax Examples
138
+ ## String Based Functions
139
+
140
+ ### 1. str_docs
141
+ Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
142
+
143
+ • Parameters:
144
+ - `method_type_filter` (str): Optional; a comma-separated list of function names whose docstrings to print, or '*' for all.
145
+
146
+ • Example:
147
+
148
+ import rgwfuncs
149
+ rgwfuncs.str_docs(method_type_filter='numeric_clean,limit_dataframe')
150
+
151
+ --------------------------------------------------------------------------------
152
+
153
+ ### 2. send_telegram_message
154
+
155
+ Send a message to a Telegram chat using a specified preset from your configuration file.
156
+
157
+ • Parameters:
158
+ - `preset_name` (str): The name of the preset to use for sending the message. This should match a preset in the configuration file.
159
+ - `message` (str): The message text that you want to send to the Telegram chat.
160
+
161
+ • Raises:
162
+ - `RuntimeError`: If the preset is not found in the configuration file or if necessary details (bot token or chat ID) are missing.
163
+
164
+ • Example:
165
+
166
+ from rgwfuncs import send_telegram_message
167
+
168
+ preset_name = "daily_updates"
169
+ message = "Here is your daily update!"
170
+
171
+ send_telegram_message(preset_name, message)
172
+
173
+ --------------------------------------------------------------------------------
174
+
175
+ ## Dataframe Based Functions
139
176
 
140
177
  Below is a quick reference of available functions, their purpose, and basic usage examples.
141
178
 
142
- ### 1. docs
179
+ ### 1. df_docs
143
180
  Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
144
181
 
145
182
  • Parameters:
@@ -148,7 +185,7 @@ Print a list of available function names in alphabetical order. If a filter is p
148
185
  • Example:
149
186
 
150
187
  import rgwfuncs
151
- rgwfuncs.docs(method_type_filter='numeric_clean,limit_dataframe')
188
+ rgwfuncs.df_docs(method_type_filter='numeric_clean,limit_dataframe')
152
189
 
153
190
  --------------------------------------------------------------------------------
154
191
 
@@ -109,11 +109,48 @@ To display all docstrings, use:
109
109
 
110
110
  --------------------------------------------------------------------------------
111
111
 
112
- ## Function References and Syntax Examples
112
+ ## String Based Functions
113
+
114
+ ### 1. str_docs
115
+ Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
116
+
117
+ • Parameters:
118
+ - `method_type_filter` (str): Optional; a comma-separated list of function names whose docstrings to print, or '*' for all.
119
+
120
+ • Example:
121
+
122
+ import rgwfuncs
123
+ rgwfuncs.str_docs(method_type_filter='numeric_clean,limit_dataframe')
124
+
125
+ --------------------------------------------------------------------------------
126
+
127
+ ### 2. send_telegram_message
128
+
129
+ Send a message to a Telegram chat using a specified preset from your configuration file.
130
+
131
+ • Parameters:
132
+ - `preset_name` (str): The name of the preset to use for sending the message. This should match a preset in the configuration file.
133
+ - `message` (str): The message text that you want to send to the Telegram chat.
134
+
135
+ • Raises:
136
+ - `RuntimeError`: If the preset is not found in the configuration file or if necessary details (bot token or chat ID) are missing.
137
+
138
+ • Example:
139
+
140
+ from rgwfuncs import send_telegram_message
141
+
142
+ preset_name = "daily_updates"
143
+ message = "Here is your daily update!"
144
+
145
+ send_telegram_message(preset_name, message)
146
+
147
+ --------------------------------------------------------------------------------
148
+
149
+ ## Dataframe Based Functions
113
150
 
114
151
  Below is a quick reference of available functions, their purpose, and basic usage examples.
115
152
 
116
- ### 1. docs
153
+ ### 1. df_docs
117
154
  Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
118
155
 
119
156
  • Parameters:
@@ -122,7 +159,7 @@ Print a list of available function names in alphabetical order. If a filter is p
122
159
  • Example:
123
160
 
124
161
  import rgwfuncs
125
- rgwfuncs.docs(method_type_filter='numeric_clean,limit_dataframe')
162
+ rgwfuncs.df_docs(method_type_filter='numeric_clean,limit_dataframe')
126
163
 
127
164
  --------------------------------------------------------------------------------
128
165
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "rgwfuncs"
7
- version = "0.0.17"
7
+ version = "0.0.19"
8
8
  authors = [
9
9
  { name = "Ryan Gerard Wilson", email = "ryangerardwilson@gmail.com" },
10
10
  ]
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = rgwfuncs
3
- version = 0.0.17
3
+ version = 0.0.19
4
4
  author = Ryan Gerard Wilson
5
5
  author_email = ryangerardwilson@gmail.com
6
6
  description = A functional programming paradigm for mathematical modelling and data science
@@ -0,0 +1,5 @@
1
+ # This file is automatically generated
2
+ # Dynamically importing functions from modules
3
+
4
+ from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, df_docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
5
+ from .str_lib import send_telegram_message, str_docs
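
The new `__init__.py` above flattens both modules into the top-level `rgwfuncs` namespace. A minimal sketch of what that enables for callers, using only names taken from the import list above:

    import rgwfuncs

    # Both docs helpers are now exposed at package level.
    rgwfuncs.df_docs()   # lists the DataFrame functions from df_lib
    rgwfuncs.str_docs()  # lists the string/Telegram functions from str_lib

    # Individual functions can also be imported directly.
    from rgwfuncs import filter_dataframe, send_telegram_message
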
@@ -29,7 +29,7 @@ import warnings
29
29
  warnings.filterwarnings("ignore", category=FutureWarning)
30
30
 
31
31
 
32
- def docs(method_type_filter: Optional[str] = None) -> None:
32
+ def df_docs(method_type_filter: Optional[str] = None) -> None:
33
33
  """
34
34
  Print a list of function names in alphabetical order. If method_type_filter
35
35
  is specified, print the docstrings of the functions that match the filter.
@@ -66,7 +66,11 @@ def docs(method_type_filter: Optional[str] = None) -> None:
66
66
  print(f"\n{name}:\n{docstring}")
67
67
 
68
68
 
69
- def numeric_clean(df: pd.DataFrame, column_names: str, column_type: str, irregular_value_treatment: str) -> pd.DataFrame:
69
+ def numeric_clean(
70
+ df: pd.DataFrame,
71
+ column_names: str,
72
+ column_type: str,
73
+ irregular_value_treatment: str) -> pd.DataFrame:
70
74
  """
71
75
  Cleans the numeric columns based on specified treatments.
72
76
 
@@ -297,7 +301,9 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
297
301
  return df.drop_duplicates(keep='first')
298
302
 
299
303
 
300
- def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
304
+ def drop_duplicates_retain_first(
305
+ df: pd.DataFrame,
306
+ columns: Optional[str] = None) -> pd.DataFrame:
301
307
  """
302
308
  Drop duplicate rows in the DataFrame based on specified columns, retaining the first occurrence.
303
309
 
@@ -319,7 +325,9 @@ def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None
319
325
  return df.drop_duplicates(subset=columns_list, keep='first')
320
326
 
321
327
 
322
- def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
328
+ def drop_duplicates_retain_last(
329
+ df: pd.DataFrame,
330
+ columns: Optional[str] = None) -> pd.DataFrame:
323
331
  """
324
332
  Drop duplicate rows in the DataFrame based on specified columns, retaining the last occurrence.
325
333
 
@@ -336,7 +344,8 @@ def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None)
336
344
  if df is None:
337
345
  raise ValueError("DataFrame is not initialized.")
338
346
 
339
- columns_list = [col.strip() for col in columns.split(',')] if columns else None
347
+ columns_list = [col.strip()
348
+ for col in columns.split(',')] if columns else None
340
349
  return df.drop_duplicates(subset=columns_list, keep='last')
341
350
 
342
351
 
@@ -380,11 +389,13 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
380
389
  with conn.cursor() as cursor:
381
390
  cursor.execute(query)
382
391
  rows = cursor.fetchall()
383
- columns = ([desc[0] for desc in cursor.description] if cursor.description else [])
392
+ columns = ([desc[0] for desc in cursor.description]
393
+ if cursor.description else [])
384
394
 
385
395
  return pd.DataFrame(rows, columns=columns)
386
396
 
387
- def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
397
+ def query_clickhouse(
398
+ db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
388
399
  host = db_preset['host']
389
400
  user = db_preset['username']
390
401
  password = db_preset['password']
@@ -395,7 +406,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
395
406
 
396
407
  for attempt in range(max_retries):
397
408
  try:
398
- client = clickhouse_connect.get_client(host=host, port='8123', username=user, password=password, database=database)
409
+ client = clickhouse_connect.get_client(
410
+ host=host, port='8123', username=user, password=password, database=database)
399
411
  data = client.query(query)
400
412
  rows = data.result_rows
401
413
  columns = data.column_names
@@ -409,11 +421,13 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
409
421
  raise ConnectionError(
410
422
  "All attempts to connect to ClickHouse failed.")
411
423
 
412
- def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
424
+ def query_google_big_query(
425
+ db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
413
426
  json_file_path = db_preset['json_file_path']
414
427
  project_id = db_preset['project_id']
415
428
 
416
- credentials = service_account.Credentials.from_service_account_file(json_file_path)
429
+ credentials = service_account.Credentials.from_service_account_file(
430
+ json_file_path)
417
431
  client = bigquery.Client(credentials=credentials, project=project_id)
418
432
 
419
433
  query_job = client.query(query)
@@ -429,7 +443,9 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
429
443
  config = json.load(f)
430
444
 
431
445
  db_presets = config.get('db_presets', [])
432
- db_preset = next((preset for preset in db_presets if preset['name'] == db_preset_name), None)
446
+ db_preset = next(
447
+ (preset for preset in db_presets if preset['name'] == db_preset_name),
448
+ None)
433
449
  if not db_preset:
434
450
  raise ValueError(f"No matching db_preset found for {db_preset_name}")
435
451
 
@@ -447,7 +463,6 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
447
463
  raise ValueError(f"Unsupported db_type: {db_type}")
448
464
 
449
465
 
450
-
451
466
  def load_data_from_path(file_path: str) -> pd.DataFrame:
452
467
  """
453
468
  Load data from a file into a DataFrame based on the file extension.
@@ -608,10 +623,20 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
608
623
  for column in columns:
609
624
  if column in df.columns:
610
625
  frequency = df[column].astype(str).value_counts(dropna=False)
611
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
626
+ frequency = frequency.rename(
627
+ index={
628
+ 'nan': 'NaN',
629
+ 'NaT': 'NaT',
630
+ 'None': 'None',
631
+ '': 'Empty'})
612
632
  top_n_values = frequency.nlargest(n)
613
- report[column] = {str(value): str(count) for value, count in top_n_values.items()}
614
- print(f"Top {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
633
+ report[column] = {str(value): str(count)
634
+ for value, count in top_n_values.items()}
635
+ print(
636
+ f"Top {n} unique values for column '{column}':\n{
637
+ json.dumps(
638
+ report[column],
639
+ indent=2)}\n")
615
640
  else:
616
641
  print(f"Column '{column}' does not exist in the DataFrame.")
617
642
  else:
@@ -621,7 +646,10 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
621
646
  gc.collect()
622
647
 
623
648
 
624
- def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
649
+ def bottom_n_unique_values(
650
+ df: pd.DataFrame,
651
+ n: int,
652
+ columns: List[str]) -> None:
625
653
  """
626
654
  Print the bottom `n` unique values for specified columns in the DataFrame.
627
655
 
@@ -641,12 +669,21 @@ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
641
669
  for column in columns:
642
670
  if column in df.columns:
643
671
  frequency = df[column].astype(str).value_counts(dropna=False)
644
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
672
+ frequency = frequency.rename(
673
+ index={
674
+ 'nan': 'NaN',
675
+ 'NaT': 'NaT',
676
+ 'None': 'None',
677
+ '': 'Empty'})
645
678
  bottom_n_values = frequency.nsmallest(n)
646
679
  report[column] = {
647
680
  str(value): str(count) for value,
648
681
  count in bottom_n_values.items()}
649
- print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
682
+ print(
683
+ f"Bottom {n} unique values for column '{column}':\n{
684
+ json.dumps(
685
+ report[column],
686
+ indent=2)}\n")
650
687
  else:
651
688
  print(f"Column '{column}' does not exist in the DataFrame.")
652
689
  else:
@@ -656,7 +693,8 @@ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
656
693
  gc.collect()
657
694
 
658
695
 
659
- def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
696
+ def print_correlation(
697
+ df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
660
698
  """
661
699
  Print correlation for multiple pairs of columns in the DataFrame.
662
700
 
@@ -675,13 +713,16 @@ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) ->
675
713
 
676
714
  correlation = numeric_col1.corr(numeric_col2)
677
715
  if pd.notnull(correlation):
678
- print(f"The correlation between '{col1}' and '{col2}' is {correlation}.")
716
+ print(
717
+ f"The correlation between '{col1}' and '{col2}' is {correlation}.")
679
718
  else:
680
- print(f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
719
+ print(
720
+ f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
681
721
  except Exception as e:
682
722
  print(f"Error processing cols '{col1}' and '{col2}': {e}")
683
723
  else:
684
- print(f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
724
+ print(
725
+ f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
685
726
  else:
686
727
  print("The DataFrame is empty.")
687
728
 
@@ -701,7 +742,8 @@ def print_memory_usage(df: pd.DataFrame) -> None:
701
742
  - ValueError: If the DataFrame is `None`.
702
743
  """
703
744
  if df is not None:
704
- memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024) # Convert bytes to MB
745
+ memory_usage = df.memory_usage(deep=True).sum(
746
+ ) / (1024 * 1024) # Convert bytes to MB
705
747
  print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
706
748
  else:
707
749
  raise ValueError("No DataFrame to print. Please provide a DataFrame.")
@@ -782,7 +824,8 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
782
824
  """
783
825
  if df is not None:
784
826
  print(df)
785
- columns_with_types = [f"{col} ({df[col].dtypes})" for col in df.columns]
827
+ columns_with_types = [
828
+ f"{col} ({df[col].dtypes})" for col in df.columns]
786
829
  print("Columns:", columns_with_types)
787
830
  if source:
788
831
  print(f"Source: {source}")
@@ -820,7 +863,9 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
820
863
  config_path = os.path.expanduser('~/.rgwfuncsrc')
821
864
  config = get_config(config_path)
822
865
 
823
- bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
866
+ bot_config = next(
867
+ (bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name),
868
+ None)
824
869
  if not bot_config:
825
870
  raise ValueError(f"No bot found with the name {bot_name}")
826
871
 
@@ -834,9 +879,15 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
834
879
  df.to_csv(file_name, index=False)
835
880
  try:
836
881
  with open(file_name, 'rb') as file:
837
- payload = {'chat_id': bot_config['chat_id'], 'caption': message or ''}
882
+ payload = {
883
+ 'chat_id': bot_config['chat_id'],
884
+ 'caption': message or ''}
838
885
  files = {'document': file}
839
- response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendDocument", data=payload, files=files)
886
+ response = requests.post(
887
+ f"https://api.telegram.org/bot{
888
+ bot_config['bot_token']}/sendDocument",
889
+ data=payload,
890
+ files=files)
840
891
  if remove_after_send and os.path.exists(file_name):
841
892
  os.remove(file_name)
842
893
  except Exception as e:
@@ -849,7 +900,8 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
849
900
  'text': (message + "\n\n" + df_str) if message else df_str,
850
901
  'parse_mode': 'HTML'
851
902
  }
852
- response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
903
+ response = requests.post(
904
+ f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
853
905
 
854
906
  if response and not response.ok:
855
907
  raise Exception(f"Error sending message: {response.text}")
@@ -857,7 +909,14 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
857
909
  print("Message sent successfully.")
858
910
 
859
911
 
860
- def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subject: Optional[str] = None, body: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
912
+ def send_data_to_email(
913
+ df: pd.DataFrame,
914
+ preset_name: str,
915
+ to_email: str,
916
+ subject: Optional[str] = None,
917
+ body: Optional[str] = None,
918
+ as_file: bool = True,
919
+ remove_after_send: bool = True) -> None:
861
920
  """
862
921
  Send an email with an optional DataFrame attachment using the Gmail API via a specified preset.
863
922
 
@@ -885,7 +944,9 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
885
944
  except json.JSONDecodeError as e:
886
945
  raise ValueError(f"Invalid JSON format in config file: {e}")
887
946
 
888
- def authenticate_service_account(service_account_credentials_path: str, sender_email_id: str) -> Any:
947
+ def authenticate_service_account(
948
+ service_account_credentials_path: str,
949
+ sender_email_id: str) -> Any:
889
950
  credentials = service_account.Credentials.from_service_account_file(
890
951
  service_account_credentials_path,
891
952
  scopes=['https://mail.google.com/'],
@@ -898,7 +959,9 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
898
959
  config = get_config(config_path)
899
960
 
900
961
  # Retrieve Gmail preset configuration
901
- gmail_config = next((preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name), None)
962
+ gmail_config = next(
963
+ (preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name),
964
+ None)
902
965
 
903
966
  if not gmail_config:
904
967
  raise ValueError(f"No preset found with the name {preset_name}")
@@ -921,13 +984,18 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
921
984
  message['to'] = to_email
922
985
  message['from'] = sender_email
923
986
  message['subject'] = subject if subject else 'DataFrame CSV File'
924
- message.attach(MIMEText(body if body else 'Please find the CSV file attached.'))
987
+ message.attach(
988
+ MIMEText(
989
+ body if body else 'Please find the CSV file attached.'))
925
990
 
926
991
  with open(tmp_file_name, 'rb') as file:
927
992
  part = MIMEBase('application', 'octet-stream')
928
993
  part.set_payload(file.read())
929
994
  encoders.encode_base64(part)
930
- part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(tmp_file_name)}')
995
+ part.add_header(
996
+ 'Content-Disposition',
997
+ f'attachment; filename={
998
+ os.path.basename(tmp_file_name)}')
931
999
  message.attach(part)
932
1000
 
933
1001
  if remove_after_send and os.path.exists(tmp_file_name):
@@ -949,13 +1017,19 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
949
1017
  try:
950
1018
  raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
951
1019
  email_body = {'raw': raw}
952
- sent_message = service.users().messages().send(userId="me", body=email_body).execute()
1020
+ sent_message = service.users().messages().send(
1021
+ userId="me", body=email_body).execute()
953
1022
  print(f"Email with Message Id {sent_message['id']} successfully sent.")
954
1023
  except Exception as error:
955
1024
  raise Exception(f"Error sending email: {error}")
956
1025
 
957
1026
 
958
- def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
1027
+ def send_data_to_slack(
1028
+ df: pd.DataFrame,
1029
+ bot_name: str,
1030
+ message: Optional[str] = None,
1031
+ as_file: bool = True,
1032
+ remove_after_send: bool = True) -> None:
959
1033
  """
960
1034
  Send a DataFrame or message to Slack using a specified bot configuration.
961
1035
 
@@ -983,7 +1057,9 @@ def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] =
983
1057
  config_path = os.path.expanduser('~/.rgwfuncsrc')
984
1058
  config = get_config(config_path)
985
1059
 
986
- bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
1060
+ bot_config = next(
1061
+ (bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name),
1062
+ None)
987
1063
 
988
1064
  if not bot_config:
989
1065
  raise ValueError(f"No bot found with the name {bot_name}")
@@ -1070,7 +1146,11 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
1070
1146
  return df[new_order]
1071
1147
 
1072
1148
 
1073
- def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
1149
+ def append_ranged_classification_column(
1150
+ df: pd.DataFrame,
1151
+ ranges: str,
1152
+ target_col: str,
1153
+ new_col_name: str) -> pd.DataFrame:
1074
1154
  """
1075
1155
  Append a ranged classification column to the DataFrame.
1076
1156
 
@@ -1138,16 +1218,27 @@ def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_co
1138
1218
  for r in range_list
1139
1219
  )
1140
1220
 
1141
- labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
1221
+ labels = [f"{pad_number(range_list[i],
1222
+ max_integer_length)} to {pad_number(range_list[i + 1],
1223
+ max_integer_length)}" for i in range(len(range_list) - 1)]
1142
1224
 
1143
1225
  # Ensure the target column is numeric
1144
1226
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
1145
- df[new_col_name] = pd.cut(df[target_col], bins=range_list, labels=labels, right=False, include_lowest=True)
1227
+ df[new_col_name] = pd.cut(
1228
+ df[target_col],
1229
+ bins=range_list,
1230
+ labels=labels,
1231
+ right=False,
1232
+ include_lowest=True)
1146
1233
 
1147
1234
  return df
1148
1235
 
1149
1236
 
1150
- def append_percentile_classification_column(df: pd.DataFrame, percentiles: str, target_col: str, new_col_name: str) -> pd.DataFrame:
1237
+ def append_percentile_classification_column(
1238
+ df: pd.DataFrame,
1239
+ percentiles: str,
1240
+ target_col: str,
1241
+ new_col_name: str) -> pd.DataFrame:
1151
1242
  """
1152
1243
  Append a percentile classification column to the DataFrame.
1153
1244
 
@@ -1175,14 +1266,21 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str,
1175
1266
 
1176
1267
  if has_decimals:
1177
1268
  percentiles_list = [float(p) for p in percentiles_list]
1178
- max_decimal_length = max(len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
1179
- max_integer_length = max(len(str(int(float(p)))) for p in percentiles_list)
1269
+ max_decimal_length = max(
1270
+ len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
1271
+ max_integer_length = max(len(str(int(float(p))))
1272
+ for p in percentiles_list)
1180
1273
 
1181
1274
  labels = []
1182
1275
 
1183
1276
  for i in range(len(percentiles_list) - 1):
1184
- start = pad_number(percentiles_list[i], max_integer_length, max_decimal_length, decimal=True)
1185
- end = pad_number(percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)
1277
+ start = pad_number(
1278
+ percentiles_list[i],
1279
+ max_integer_length,
1280
+ max_decimal_length,
1281
+ decimal=True)
1282
+ end = pad_number(
1283
+ percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)
1186
1284
 
1187
1285
  label = f"{start} to {end}"
1188
1286
  labels.append(label)
@@ -1205,12 +1303,20 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str,
1205
1303
  df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
1206
1304
  quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]
1207
1305
 
1208
- df[new_col_name] = pd.cut(df[target_col], bins=quantiles, labels=labels, include_lowest=True)
1306
+ df[new_col_name] = pd.cut(
1307
+ df[target_col],
1308
+ bins=quantiles,
1309
+ labels=labels,
1310
+ include_lowest=True)
1209
1311
 
1210
1312
  return df
1211
1313
 
1212
1314
 
1213
- def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
1315
+ def append_ranged_date_classification_column(
1316
+ df: pd.DataFrame,
1317
+ date_ranges: str,
1318
+ target_col: str,
1319
+ new_col_name: str) -> pd.DataFrame:
1214
1320
  """
1215
1321
  Append a ranged date classification column to the DataFrame.
1216
1322
 
@@ -1243,7 +1349,9 @@ def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str,
1243
1349
  return df
1244
1350
 
1245
1351
 
1246
- def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFrame:
1352
+ def rename_columns(df: pd.DataFrame,
1353
+ rename_pairs: Dict[str,
1354
+ str]) -> pd.DataFrame:
1247
1355
  """
1248
1356
  Rename columns in the DataFrame.
1249
1357
 
@@ -1255,7 +1363,8 @@ def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFra
1255
1363
  A new DataFrame with columns renamed.
1256
1364
  """
1257
1365
  if df is None:
1258
- raise ValueError("No DataFrame to rename columns. Please provide a valid DataFrame.")
1366
+ raise ValueError(
1367
+ "No DataFrame to rename columns. Please provide a valid DataFrame.")
1259
1368
 
1260
1369
  return df.rename(columns=rename_pairs)
1261
1370
 
@@ -1273,7 +1382,8 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
1273
1382
  A new DataFrame sorted by specified columns.
1274
1383
  """
1275
1384
  if df is None:
1276
- raise ValueError("No DataFrame to sort. Please provide a valid DataFrame.")
1385
+ raise ValueError(
1386
+ "No DataFrame to sort. Please provide a valid DataFrame.")
1277
1387
 
1278
1388
  col_names = []
1279
1389
  asc_order = []
@@ -1308,7 +1418,8 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
1308
1418
  A new DataFrame with XGB_TYPE labels appended.
1309
1419
  """
1310
1420
  if df is None:
1311
- raise ValueError("No DataFrame to add labels. Please provide a valid DataFrame.")
1421
+ raise ValueError(
1422
+ "No DataFrame to add labels. Please provide a valid DataFrame.")
1312
1423
 
1313
1424
  ratios = list(map(int, ratio_str.split(':')))
1314
1425
  total_ratio = sum(ratios)
@@ -1325,7 +1436,8 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
1325
1436
  labels = ['TRAIN'] * train_rows + ['VALIDATE'] * \
1326
1437
  validate_rows + ['TEST'] * test_rows
1327
1438
  else:
1328
- raise ValueError("Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
1439
+ raise ValueError(
1440
+ "Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
1329
1441
 
1330
1442
  df_with_labels = df.copy()
1331
1443
  df_with_labels['XGB_TYPE'] = labels
@@ -1333,7 +1445,13 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
1333
1445
  return df_with_labels
1334
1446
 
1335
1447
 
1336
- def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
1448
+ def append_xgb_regression_predictions(
1449
+ df: pd.DataFrame,
1450
+ target_col: str,
1451
+ feature_cols: str,
1452
+ pred_col: str,
1453
+ boosting_rounds: int = 100,
1454
+ model_path: Optional[str] = None) -> pd.DataFrame:
1337
1455
  """
1338
1456
  Append XGB regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
1339
1457
 
@@ -1349,7 +1467,8 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
1349
1467
  DataFrame with predictions appended.
1350
1468
  """
1351
1469
  if df is None or 'XGB_TYPE' not in df.columns:
1352
- raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
1470
+ raise ValueError(
1471
+ "DataFrame is not initialized or 'XGB_TYPE' column is missing.")
1353
1472
 
1354
1473
  features = feature_cols.replace(' ', '').split(',')
1355
1474
 
@@ -1365,16 +1484,27 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
1365
1484
  else:
1366
1485
  validate_data = None
1367
1486
 
1368
- dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
1487
+ dtrain = xgb.DMatrix(
1488
+ train_data[features],
1489
+ label=train_data[target_col],
1490
+ enable_categorical=True)
1369
1491
  evals = [(dtrain, 'train')]
1370
1492
 
1371
1493
  if validate_data is not None:
1372
- dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
1494
+ dvalidate = xgb.DMatrix(
1495
+ validate_data[features],
1496
+ label=validate_data[target_col],
1497
+ enable_categorical=True)
1373
1498
  evals.append((dvalidate, 'validate'))
1374
1499
 
1375
1500
  params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse'}
1376
1501
 
1377
- model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
1502
+ model = xgb.train(
1503
+ params,
1504
+ dtrain,
1505
+ num_boost_round=boosting_rounds,
1506
+ evals=evals,
1507
+ early_stopping_rounds=10 if validate_data is not None else None)
1378
1508
 
1379
1509
  # Make predictions for all data
1380
1510
  dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1383,13 +1513,20 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
1383
1513
  if model_path:
1384
1514
  model.save_model(model_path)
1385
1515
 
1386
- columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1516
+ columns_order = [col for col in df.columns if col not in [
1517
+ 'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1387
1518
  df = df[columns_order]
1388
1519
 
1389
1520
  return df
1390
1521
 
1391
1522
 
1392
- def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
1523
+ def append_xgb_logistic_regression_predictions(
1524
+ df: pd.DataFrame,
1525
+ target_col: str,
1526
+ feature_cols: str,
1527
+ pred_col: str,
1528
+ boosting_rounds: int = 100,
1529
+ model_path: Optional[str] = None) -> pd.DataFrame:
1393
1530
  """
1394
1531
  Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
1395
1532
 
@@ -1421,16 +1558,27 @@ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str
1421
1558
  if 'VALIDATE' in df['XGB_TYPE'].values:
1422
1559
  validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
1423
1560
 
1424
- dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
1561
+ dtrain = xgb.DMatrix(
1562
+ train_data[features],
1563
+ label=train_data[target_col],
1564
+ enable_categorical=True)
1425
1565
  evals = [(dtrain, 'train')]
1426
1566
 
1427
1567
  if validate_data is not None:
1428
- dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
1568
+ dvalidate = xgb.DMatrix(
1569
+ validate_data[features],
1570
+ label=validate_data[target_col],
1571
+ enable_categorical=True)
1429
1572
  evals.append((dvalidate, 'validate'))
1430
1573
 
1431
1574
  params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}
1432
1575
 
1433
- model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
1576
+ model = xgb.train(
1577
+ params,
1578
+ dtrain,
1579
+ num_boost_round=boosting_rounds,
1580
+ evals=evals,
1581
+ early_stopping_rounds=10 if validate_data is not None else None)
1434
1582
 
1435
1583
  # Make predictions for all data
1436
1584
  dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1439,13 +1587,18 @@ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str
1439
1587
  if model_path:
1440
1588
  model.save_model(model_path)
1441
1589
 
1442
- columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1590
+ columns_order = [col for col in df.columns if col not in [
1591
+ 'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
1443
1592
  df = df[columns_order]
1444
1593
 
1445
1594
  return df
1446
1595
 
1447
1596
 
1448
- def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
1597
+ def print_n_frequency_cascading(
1598
+ df: pd.DataFrame,
1599
+ n: int,
1600
+ columns: str,
1601
+ order_by: str = "FREQ_DESC") -> None:
1449
1602
  """
1450
1603
  Print the cascading frequency of top n values for specified columns.
1451
1604
 
@@ -1468,7 +1621,12 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
1468
1621
  # Convert the column to string representation
1469
1622
  df[current_col] = df[current_col].astype(str)
1470
1623
  frequency = df[current_col].value_counts(dropna=False)
1471
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
1624
+ frequency = frequency.rename(
1625
+ index={
1626
+ 'nan': 'NaN',
1627
+ 'NaT': 'NaT',
1628
+ 'None': 'None',
1629
+ '': 'Empty'})
1472
1630
 
1473
1631
  if limit is not None:
1474
1632
  frequency = frequency.nlargest(limit)
@@ -1483,8 +1641,11 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
1483
1641
  filtered_df = df[df[current_col] == value]
1484
1642
 
1485
1643
  if len(columns) > 1:
1486
- sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
1487
- report[value] = {"count": str(count), f"sub_distribution({columns[1]})": sub_report if sub_report else {}}
1644
+ sub_report = generate_cascade_report(
1645
+ filtered_df, columns[1:], limit, order_by)
1646
+ report[value] = {
1647
+ "count": str(count), f"sub_distribution({
1648
+ columns[1]})": sub_report if sub_report else {}}
1488
1649
  else:
1489
1650
  report[value] = {"count": str(count)}
1490
1651
 
@@ -1494,17 +1655,29 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
1494
1655
  if order_by == "ASC":
1495
1656
  return dict(sorted(frequency.items(), key=lambda item: item[0]))
1496
1657
  elif order_by == "DESC":
1497
- return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
1658
+ return dict(
1659
+ sorted(
1660
+ frequency.items(),
1661
+ key=lambda item: item[0],
1662
+ reverse=True))
1498
1663
  elif order_by == "FREQ_ASC":
1499
1664
  return dict(sorted(frequency.items(), key=lambda item: item[1]))
1500
1665
  else: # Default to "FREQ_DESC"
1501
- return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
1666
+ return dict(
1667
+ sorted(
1668
+ frequency.items(),
1669
+ key=lambda item: item[1],
1670
+ reverse=True))
1502
1671
 
1503
1672
  report = generate_cascade_report(df, columns, n, order_by)
1504
1673
  print(json.dumps(report, indent=2))
1505
1674
 
1506
1675
 
1507
- def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
1676
+ def print_n_frequency_linear(
1677
+ df: pd.DataFrame,
1678
+ n: int,
1679
+ columns: str,
1680
+ order_by: str = "FREQ_DESC") -> None:
1508
1681
  """
1509
1682
  Print the linear frequency of top n values for specified columns.
1510
1683
 
@@ -1524,13 +1697,19 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: s
1524
1697
  continue
1525
1698
 
1526
1699
  frequency = df[current_col].astype(str).value_counts(dropna=False)
1527
- frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
1700
+ frequency = frequency.rename(
1701
+ index={
1702
+ 'nan': 'NaN',
1703
+ 'NaT': 'NaT',
1704
+ 'None': 'None',
1705
+ '': 'Empty'})
1528
1706
 
1529
1707
  if limit is not None:
1530
1708
  frequency = frequency.nlargest(limit)
1531
1709
 
1532
1710
  sorted_frequency = sort_frequency(frequency, order_by)
1533
- col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
1711
+ col_report = {str(value): str(count)
1712
+ for value, count in sorted_frequency.items()}
1534
1713
  report[current_col] = col_report
1535
1714
 
1536
1715
  return report
@@ -1539,17 +1718,27 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: s
1539
1718
  if order_by == "ASC":
1540
1719
  return dict(sorted(frequency.items(), key=lambda item: item[0]))
1541
1720
  elif order_by == "DESC":
1542
- return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
1721
+ return dict(
1722
+ sorted(
1723
+ frequency.items(),
1724
+ key=lambda item: item[0],
1725
+ reverse=True))
1543
1726
  elif order_by == "FREQ_ASC":
1544
1727
  return dict(sorted(frequency.items(), key=lambda item: item[1]))
1545
1728
  else: # Default to "FREQ_DESC"
1546
- return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
1729
+ return dict(
1730
+ sorted(
1731
+ frequency.items(),
1732
+ key=lambda item: item[1],
1733
+ reverse=True))
1547
1734
 
1548
1735
  report = generate_linear_report(df, columns, n, order_by)
1549
1736
  print(json.dumps(report, indent=2))
1550
1737
 
1551
1738
 
1552
- def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
1739
+ def retain_columns(
1740
+ df: pd.DataFrame,
1741
+ columns_to_retain: List[str]) -> pd.DataFrame:
1553
1742
  """
1554
1743
  Retain specified columns in the DataFrame and drop the others.
1555
1744
 
@@ -1565,7 +1754,10 @@ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFra
1565
1754
  return df[columns_to_retain]
1566
1755
 
1567
1756
 
1568
- def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
1757
+ def mask_against_dataframe(
1758
+ df: pd.DataFrame,
1759
+ other_df: pd.DataFrame,
1760
+ column_name: str) -> pd.DataFrame:
1569
1761
  """
1570
1762
  Retain only rows with common column values between two DataFrames.
1571
1763
 
@@ -1582,7 +1774,10 @@ def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name
1582
1774
  return df[df[column_name].isin(other_df[column_name])]
1583
1775
 
1584
1776
 
1585
- def mask_against_dataframe_converse(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
1777
+ def mask_against_dataframe_converse(
1778
+ df: pd.DataFrame,
1779
+ other_df: pd.DataFrame,
1780
+ column_name: str) -> pd.DataFrame:
1586
1781
  """
1587
1782
  Retain only rows with uncommon column values between two DataFrames.
1588
1783
 
@@ -1616,7 +1811,8 @@ def union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
1616
1811
  ValueError: If the DataFrames do not have the same columns.
1617
1812
  """
1618
1813
  if set(df1.columns) != set(df2.columns):
1619
- raise ValueError("Both DataFrames must have the same columns for a union join")
1814
+ raise ValueError(
1815
+ "Both DataFrames must have the same columns for a union join")
1620
1816
 
1621
1817
  result_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
1622
1818
  return result_df
@@ -1637,13 +1833,18 @@ def bag_union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
1637
1833
  ValueError: If the DataFrames do not have the same columns.
1638
1834
  """
1639
1835
  if set(df1.columns) != set(df2.columns):
1640
- raise ValueError("Both DataFrames must have the same columns for a bag union join")
1836
+ raise ValueError(
1837
+ "Both DataFrames must have the same columns for a bag union join")
1641
1838
 
1642
1839
  result_df = pd.concat([df1, df2], ignore_index=True)
1643
1840
  return result_df
1644
1841
 
1645
1842
 
1646
- def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
1843
+ def left_join(
1844
+ df1: pd.DataFrame,
1845
+ df2: pd.DataFrame,
1846
+ left_on: str,
1847
+ right_on: str) -> pd.DataFrame:
1647
1848
  """
1648
1849
  Perform a left join on two DataFrames.
1649
1850
 
@@ -1659,7 +1860,11 @@ def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str)
1659
1860
  return df1.merge(df2, how='left', left_on=left_on, right_on=right_on)
1660
1861
 
1661
1862
 
1662
- def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
1863
+ def right_join(
1864
+ df1: pd.DataFrame,
1865
+ df2: pd.DataFrame,
1866
+ left_on: str,
1867
+ right_on: str) -> pd.DataFrame:
1663
1868
  """
1664
1869
  Perform a right join on two DataFrames.
1665
1870
 
@@ -1683,7 +1888,7 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
1683
1888
  db_path: str
1684
1889
  The file path to the SQLite database. If the database does not exist,
1685
1890
  it will be created.
1686
-
1891
+
1687
1892
  tablename: str
1688
1893
  The name of the table where the data will be inserted. If the table does
1689
1894
  not exist, it will be created based on the DataFrame's columns and types.
@@ -1697,8 +1902,8 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
1697
1902
  - Inserts the DataFrame's data into the table, appending to any existing data.
1698
1903
 
1699
1904
  Data Type Mapping:
1700
- - Converts Pandas data types to SQLite types: 'int64' to 'INTEGER',
1701
- 'float64' to 'REAL', 'object' to 'TEXT', 'datetime64[ns]' to 'TEXT',
1905
+ - Converts Pandas data types to SQLite types: 'int64' to 'INTEGER',
1906
+ 'float64' to 'REAL', 'object' to 'TEXT', 'datetime64[ns]' to 'TEXT',
1702
1907
  and 'bool' to 'INTEGER'.
1703
1908
 
1704
1909
  Returns:
@@ -1706,10 +1911,10 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
1706
1911
  """
1707
1912
 
1708
1913
  def table_exists(cursor, table_name):
1709
- cursor.execute(f"SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{table_name}'")
1914
+ cursor.execute(
1915
+ f"SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{table_name}'")
1710
1916
  return cursor.fetchone()[0] == 1
1711
1917
 
1712
-
1713
1918
  dtype_mapping = {
1714
1919
  'int64': 'INTEGER',
1715
1920
  'float64': 'REAL',
@@ -1726,15 +1931,21 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
1726
1931
 
1727
1932
  if not table_exists(cursor, tablename):
1728
1933
  columns_with_types = ', '.join(
1729
- f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes)
1730
- )
1934
+ f'"{col}" {
1935
+ map_dtype(dtype)}' for col,
1936
+ dtype in zip(
1937
+ df.columns,
1938
+ df.dtypes))
1731
1939
  create_table_query = f'CREATE TABLE "{tablename}" ({columns_with_types})'
1732
1940
  conn.execute(create_table_query)
1733
1941
 
1734
1942
  df.to_sql(tablename, conn, if_exists='append', index=False)
1735
1943
 
1736
1944
 
1737
- def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
1945
+ def sync_dataframe_to_sqlite_database(
1946
+ db_path: str,
1947
+ tablename: str,
1948
+ df: pd.DataFrame) -> None:
1738
1949
  """
1739
1950
  Processes and saves a DataFrame to an SQLite database, adding a timestamp column
1740
1951
  and replacing the existing table if needed. Creates the table if it does not exist.
@@ -1770,8 +1981,11 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataF
1770
1981
  if cursor.fetchall() == []: # Table does not exist
1771
1982
  # Create a table using the DataFrame's column names and types
1772
1983
  columns_with_types = ', '.join(
1773
- f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes)
1774
- )
1984
+ f'"{col}" {
1985
+ map_dtype(dtype)}' for col,
1986
+ dtype in zip(
1987
+ df.columns,
1988
+ df.dtypes))
1775
1989
  create_table_query = f'CREATE TABLE "{new_table_name}" ({columns_with_types})'
1776
1990
  conn.execute(create_table_query)
1777
1991
 
@@ -0,0 +1,104 @@
1
+ import os
2
+ import json
3
+ import requests
4
+ import inspect
5
+ from typing import Tuple, Optional, Dict, Callable
6
+ import warnings
7
+
8
+ # Suppress all FutureWarnings
9
+ warnings.filterwarnings("ignore", category=FutureWarning)
10
+
11
+
12
+ def str_docs(method_type_filter: Optional[str] = None) -> None:
13
+ """
14
+ Print a list of function names in alphabetical order. If method_type_filter
15
+ is specified, print the docstrings of the functions that match the filter.
16
+ Using '*' as a filter will print the docstrings for all functions.
17
+
18
+ Parameters:
19
+ method_type_filter: Optional filter string representing a function name,
20
+ or '*' to display docstrings for all functions.
21
+ """
22
+ # Get the current module's namespace
23
+ current_module = __name__
24
+
25
+ local_functions: Dict[str, Callable] = {
26
+ name: obj for name, obj in globals().items()
27
+ if inspect.isfunction(obj) and obj.__module__ == current_module
28
+ }
29
+
30
+ # List of function names sorted alphabetically
31
+ function_names = sorted(local_functions.keys())
32
+
33
+ # Print function names
34
+ print("Functions in alphabetical order:")
35
+ for name in function_names:
36
+ print(name)
37
+
38
+ # If a filter is provided or '*', print the docstrings of functions
39
+ if method_type_filter:
40
+ # print("\nFiltered function documentation:")
41
+ for name, func in local_functions.items():
42
+ docstring: Optional[str] = func.__doc__
43
+ if docstring:
44
+ if method_type_filter == '*' or method_type_filter == name:
45
+ # Print the entire docstring for the matching function
46
+ print(f"\n{name}:\n{docstring}")
47
+
48
+
49
+ def send_telegram_message(preset_name: str, message: str) -> None:
50
+ """Send a Telegram message using the specified preset.
51
+
52
+ Args:
53
+ preset_name (str): The name of the preset to use for sending the message.
54
+ message (str): The message to send.
55
+
56
+ Raises:
57
+ RuntimeError: If the preset is not found or necessary details are missing.
58
+ """
59
+
60
+ # Set the config path to ~/.rgwfuncsrc
61
+ config_path = os.path.expanduser("~/.rgwfuncsrc")
62
+
63
+ def load_config() -> dict:
64
+ """Load the configuration from the .rgwfuncsrc file."""
65
+ with open(config_path, 'r') as file:
66
+ return json.load(file)
67
+
68
+ def get_telegram_preset(config: dict, preset_name: str) -> dict:
69
+ """Get the Telegram preset configuration."""
70
+ presets = config.get("telegram_bot_presets", [])
71
+ for preset in presets:
72
+ if preset.get("name") == preset_name:
73
+ return preset
74
+ return None
75
+
76
+ def get_telegram_bot_details(config: dict, preset_name: str) -> Tuple[str, str]:
77
+ """Retrieve the Telegram bot token and chat ID from the preset."""
78
+ preset = get_telegram_preset(config, preset_name)
79
+ if not preset:
80
+ raise RuntimeError(f"Telegram bot preset '{preset_name}' not found in the configuration file")
81
+
82
+ bot_token = preset.get("bot_token")
83
+ chat_id = preset.get("chat_id")
84
+
85
+ if not bot_token or not chat_id:
86
+ raise RuntimeError(
87
+ f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file"
88
+ )
89
+
90
+ return bot_token, chat_id
91
+
92
+ # Load the configuration
93
+ config = load_config()
94
+
95
+ # Get bot details from the configuration
96
+ bot_token, chat_id = get_telegram_bot_details(config, preset_name)
97
+
98
+ # Prepare the request
99
+ url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
100
+ payload = {"chat_id": chat_id, "text": message}
101
+
102
+ # Send the message
103
+ response = requests.post(url, json=payload)
104
+ response.raise_for_status()
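
`send_telegram_message` reads its presets from `~/.rgwfuncsrc`, which `load_config` above parses as JSON. A minimal sketch of a config that satisfies `get_telegram_bot_details` — the token and chat ID below are placeholders, not real values:

    import json
    import os

    # Sketch only: this overwrites any existing ~/.rgwfuncsrc.
    config = {
        "telegram_bot_presets": [
            {
                "name": "daily_updates",
                "bot_token": "123456:ABC-EXAMPLE",  # placeholder token
                "chat_id": "-1001234567890"         # placeholder chat ID
            }
        ]
    }
    with open(os.path.expanduser("~/.rgwfuncsrc"), "w") as f:
        json.dump(config, f, indent=2)

    from rgwfuncs import send_telegram_message
    send_telegram_message("daily_updates", "Here is your daily update!")
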
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rgwfuncs
3
- Version: 0.0.17
3
+ Version: 0.0.19
4
4
  Summary: A functional programming paradigm for mathematical modelling and data science
5
5
  Home-page: https://github.com/ryangerardwilson/rgwfunc
6
6
  Author: Ryan Gerard Wilson
@@ -135,11 +135,48 @@ To display all docstrings, use:
135
135
 
136
136
  --------------------------------------------------------------------------------
137
137
 
138
- ## Function References and Syntax Examples
138
+ ## String Based Functions
139
+
140
+ ### 1. str_docs
141
+ Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
142
+
143
+ • Parameters:
144
+ - `method_type_filter` (str): Optional; a comma-separated list of function names whose docstrings to print, or '*' for all.
145
+
146
+ • Example:
147
+
148
+ import rgwfuncs
149
+ rgwfuncs.str_docs(method_type_filter='numeric_clean,limit_dataframe')
150
+
151
+ --------------------------------------------------------------------------------
152
+
153
+ ### 2. send_telegram_message
154
+
155
+ Send a message to a Telegram chat using a specified preset from your configuration file.
156
+
157
+ • Parameters:
158
+ - `preset_name` (str): The name of the preset to use for sending the message. This should match a preset in the configuration file.
159
+ - `message` (str): The message text that you want to send to the Telegram chat.
160
+
161
+ • Raises:
162
+ - `RuntimeError`: If the preset is not found in the configuration file or if necessary details (bot token or chat ID) are missing.
163
+
164
+ • Example:
165
+
166
+ from rgwfuncs import send_telegram_message
167
+
168
+ preset_name = "daily_updates"
169
+ message = "Here is your daily update!"
170
+
171
+ send_telegram_message(preset_name, message)
172
+
173
+ --------------------------------------------------------------------------------
174
+
175
+ ## Dataframe Based Functions
139
176
 
140
177
  Below is a quick reference of available functions, their purpose, and basic usage examples.
141
178
 
142
- ### 1. docs
179
+ ### 1. df_docs
143
180
  Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
144
181
 
145
182
  • Parameters:
@@ -148,7 +185,7 @@ Print a list of available function names in alphabetical order. If a filter is p
148
185
  • Example:
149
186
 
150
187
  import rgwfuncs
151
- rgwfuncs.docs(method_type_filter='numeric_clean,limit_dataframe')
188
+ rgwfuncs.df_docs(method_type_filter='numeric_clean,limit_dataframe')
152
189
 
153
190
  --------------------------------------------------------------------------------
154
191
 
@@ -4,6 +4,7 @@ pyproject.toml
4
4
  setup.cfg
5
5
  src/rgwfuncs/__init__.py
6
6
  src/rgwfuncs/df_lib.py
7
+ src/rgwfuncs/str_lib.py
7
8
  src/rgwfuncs.egg-info/PKG-INFO
8
9
  src/rgwfuncs.egg-info/SOURCES.txt
9
10
  src/rgwfuncs.egg-info/dependency_links.txt
@@ -1,4 +0,0 @@
1
- # This file is automatically generated
2
- # Dynamically importing functions from modules
3
-
4
- from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
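
The removed 0.0.17 `__init__.py` above exported a single `docs` entry point; 0.0.19 splits it into `df_docs` and `str_docs`. A before/after sketch of the call sites this rename affects:

    # 0.0.17
    import rgwfuncs
    rgwfuncs.docs(method_type_filter='*')

    # 0.0.19
    import rgwfuncs
    rgwfuncs.df_docs(method_type_filter='*')   # DataFrame helpers
    rgwfuncs.str_docs(method_type_filter='*')  # string/Telegram helpers
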