rgwfuncs 0.0.17__tar.gz → 0.0.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rgwfuncs-0.0.17/src/rgwfuncs.egg-info → rgwfuncs-0.0.19}/PKG-INFO +41 -4
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/README.md +40 -3
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/pyproject.toml +1 -1
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/setup.cfg +1 -1
- rgwfuncs-0.0.19/src/rgwfuncs/__init__.py +5 -0
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/src/rgwfuncs/df_lib.py +304 -90
- rgwfuncs-0.0.19/src/rgwfuncs/str_lib.py +104 -0
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19/src/rgwfuncs.egg-info}/PKG-INFO +41 -4
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/src/rgwfuncs.egg-info/SOURCES.txt +1 -0
- rgwfuncs-0.0.17/src/rgwfuncs/__init__.py +0 -4
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/LICENSE +0 -0
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/src/rgwfuncs.egg-info/dependency_links.txt +0 -0
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/src/rgwfuncs.egg-info/entry_points.txt +0 -0
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/src/rgwfuncs.egg-info/requires.txt +0 -0
- {rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/src/rgwfuncs.egg-info/top_level.txt +0 -0
{rgwfuncs-0.0.17/src/rgwfuncs.egg-info → rgwfuncs-0.0.19}/PKG-INFO +41 -4

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rgwfuncs
-Version: 0.0.17
+Version: 0.0.19
 Summary: A functional programming paradigm for mathematical modelling and data science
 Home-page: https://github.com/ryangerardwilson/rgwfunc
 Author: Ryan Gerard Wilson
@@ -135,11 +135,48 @@ To display all docstrings, use:
 
 --------------------------------------------------------------------------------
 
-##
+## String Based Functions
+
+### 1. str_docs
+Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
+
+• Parameters:
+- `method_type_filter` (str): Optional, comma-separated to select docstring types, or '*' for all.
+
+• Example:
+
+    import rgwfuncs
+    rgwfuncs.str_docs(method_type_filter='numeric_clean,limit_dataframe')
+
+--------------------------------------------------------------------------------
+
+### 2. send_telegram_message
+
+Send a message to a Telegram chat using a specified preset from your configuration file.
+
+• Parameters:
+- `preset_name` (str): The name of the preset to use for sending the message. This should match a preset in the configuration file.
+- `message` (str): The message text that you want to send to the Telegram chat.
+
+• Raises:
+- `RuntimeError`: If the preset is not found in the configuration file or if necessary details (bot token or chat ID) are missing.
+
+• Example:
+
+    from rgwfuncs import send_telegram_message
+
+    preset_name = "daily_updates"
+    message = "Here is your daily update!"
+
+    send_telegram_message(preset_name, message)
+
+--------------------------------------------------------------------------------
+
+## Dataframe Based Functions
 
 Below is a quick reference of available functions, their purpose, and basic usage examples.
 
-### 1. docs
+### 1. df_docs
 Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
 
 • Parameters:
@@ -148,7 +185,7 @@ Print a list of available function names in alphabetical order. If a filter is p
 • Example:
 
     import rgwfuncs
-    rgwfuncs.docs(method_type_filter='numeric_clean,limit_dataframe')
+    rgwfuncs.df_docs(method_type_filter='numeric_clean,limit_dataframe')
 
 --------------------------------------------------------------------------------
 
{rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/README.md +40 -3

@@ -109,11 +109,48 @@ To display all docstrings, use:
 
 --------------------------------------------------------------------------------
 
-##
+## String Based Functions
+
+### 1. str_docs
+Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
+
+• Parameters:
+- `method_type_filter` (str): Optional, comma-separated to select docstring types, or '*' for all.
+
+• Example:
+
+    import rgwfuncs
+    rgwfuncs.str_docs(method_type_filter='numeric_clean,limit_dataframe')
+
+--------------------------------------------------------------------------------
+
+### 2. send_telegram_message
+
+Send a message to a Telegram chat using a specified preset from your configuration file.
+
+• Parameters:
+- `preset_name` (str): The name of the preset to use for sending the message. This should match a preset in the configuration file.
+- `message` (str): The message text that you want to send to the Telegram chat.
+
+• Raises:
+- `RuntimeError`: If the preset is not found in the configuration file or if necessary details (bot token or chat ID) are missing.
+
+• Example:
+
+    from rgwfuncs import send_telegram_message
+
+    preset_name = "daily_updates"
+    message = "Here is your daily update!"
+
+    send_telegram_message(preset_name, message)
+
+--------------------------------------------------------------------------------
+
+## Dataframe Based Functions
 
 Below is a quick reference of available functions, their purpose, and basic usage examples.
 
-### 1. docs
+### 1. df_docs
 Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
 
 • Parameters:
@@ -122,7 +159,7 @@ Print a list of available function names in alphabetical order. If a filter is p
 • Example:
 
     import rgwfuncs
-    rgwfuncs.docs(method_type_filter='numeric_clean,limit_dataframe')
+    rgwfuncs.df_docs(method_type_filter='numeric_clean,limit_dataframe')
 
 --------------------------------------------------------------------------------
 
rgwfuncs-0.0.19/src/rgwfuncs/__init__.py +5 -0

@@ -0,0 +1,5 @@
+# This file is automatically generated
+# Dynamically importing functions from modules
+
+from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, df_docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
+from .str_lib import send_telegram_message, str_docs
{rgwfuncs-0.0.17 → rgwfuncs-0.0.19}/src/rgwfuncs/df_lib.py +304 -90

@@ -29,7 +29,7 @@ import warnings
 warnings.filterwarnings("ignore", category=FutureWarning)
 
 
-def docs(method_type_filter: Optional[str] = None) -> None:
+def df_docs(method_type_filter: Optional[str] = None) -> None:
     """
     Print a list of function names in alphabetical order. If method_type_filter
     is specified, print the docstrings of the functions that match the filter.
@@ -66,7 +66,11 @@ def docs(method_type_filter: Optional[str] = None) -> None:
                     print(f"\n{name}:\n{docstring}")
 
 
-def numeric_clean(df: pd.DataFrame, column_names: str, column_type: str, irregular_value_treatment: str) -> pd.DataFrame:
+def numeric_clean(
+        df: pd.DataFrame,
+        column_names: str,
+        column_type: str,
+        irregular_value_treatment: str) -> pd.DataFrame:
     """
     Cleans the numeric columns based on specified treatments.
 
@@ -297,7 +301,9 @@ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
     return df.drop_duplicates(keep='first')
 
 
-def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
+def drop_duplicates_retain_first(
+        df: pd.DataFrame,
+        columns: Optional[str] = None) -> pd.DataFrame:
     """
     Drop duplicate rows in the DataFrame based on specified columns, retaining the first occurrence.
 
@@ -319,7 +325,9 @@ def drop_duplicates_retain_first(df: pd.DataFrame, columns: Optional[str] = None
     return df.drop_duplicates(subset=columns_list, keep='first')
 
 
-def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None) -> pd.DataFrame:
+def drop_duplicates_retain_last(
+        df: pd.DataFrame,
+        columns: Optional[str] = None) -> pd.DataFrame:
     """
     Drop duplicate rows in the DataFrame based on specified columns, retaining the last occurrence.
 
@@ -336,7 +344,8 @@ def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None)
     if df is None:
         raise ValueError("DataFrame is not initialized.")
 
-    columns_list = [col.strip() for col in columns.split(',')] if columns else None
+    columns_list = [col.strip()
+                    for col in columns.split(',')] if columns else None
     return df.drop_duplicates(subset=columns_list, keep='last')
 
 
@@ -380,11 +389,13 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
         with conn.cursor() as cursor:
             cursor.execute(query)
             rows = cursor.fetchall()
-            columns = ([desc[0] for desc in cursor.description] if cursor.description else [])
+            columns = ([desc[0] for desc in cursor.description]
+                       if cursor.description else [])
 
         return pd.DataFrame(rows, columns=columns)
 
-    def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
+    def query_clickhouse(
+            db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
         host = db_preset['host']
         user = db_preset['username']
         password = db_preset['password']
@@ -395,7 +406,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
 
         for attempt in range(max_retries):
             try:
-                client = clickhouse_connect.get_client(host=host, port='8123', username=user, password=password, database=database)
+                client = clickhouse_connect.get_client(
+                    host=host, port='8123', username=user, password=password, database=database)
                 data = client.query(query)
                 rows = data.result_rows
                 columns = data.column_names
@@ -409,11 +421,13 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
             raise ConnectionError(
                 "All attempts to connect to ClickHouse failed.")
 
-    def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
+    def query_google_big_query(
+            db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
         json_file_path = db_preset['json_file_path']
         project_id = db_preset['project_id']
 
-        credentials = service_account.Credentials.from_service_account_file(json_file_path)
+        credentials = service_account.Credentials.from_service_account_file(
+            json_file_path)
         client = bigquery.Client(credentials=credentials, project=project_id)
 
         query_job = client.query(query)
@@ -429,7 +443,9 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
         config = json.load(f)
 
     db_presets = config.get('db_presets', [])
-    db_preset = next((preset for preset in db_presets if preset['name'] == db_preset_name), None)
+    db_preset = next(
+        (preset for preset in db_presets if preset['name'] == db_preset_name),
+        None)
     if not db_preset:
         raise ValueError(f"No matching db_preset found for {db_preset_name}")
 
@@ -447,7 +463,6 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
         raise ValueError(f"Unsupported db_type: {db_type}")
 
 
-
 def load_data_from_path(file_path: str) -> pd.DataFrame:
     """
     Load data from a file into a DataFrame based on the file extension.
@@ -608,10 +623,20 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
         for column in columns:
             if column in df.columns:
                 frequency = df[column].astype(str).value_counts(dropna=False)
-                frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+                frequency = frequency.rename(
+                    index={
+                        'nan': 'NaN',
+                        'NaT': 'NaT',
+                        'None': 'None',
+                        '': 'Empty'})
                 top_n_values = frequency.nlargest(n)
-                report[column] = {str(value): str(count)
-                                  for value, count in top_n_values.items()}
+                report[column] = {str(value): str(count)
+                                  for value, count in top_n_values.items()}
+                print(
+                    f"Top {n} unique values for column '{column}':\n{
+                        json.dumps(
+                            report[column],
+                            indent=2)}\n")
             else:
                 print(f"Column '{column}' does not exist in the DataFrame.")
     else:
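A quick sketch of calling the reporter reformatted above — the column name and data are invented for illustration:

    import pandas as pd
    from rgwfuncs import top_n_unique_values

    df = pd.DataFrame({'city': ['Pune', 'Pune', 'Mumbai', None]})
    top_n_unique_values(df, 2, ['city'])   # prints a small JSON frequency report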
@@ -621,7 +646,10 @@ def top_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
     gc.collect()
 
 
-def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None:
+def bottom_n_unique_values(
+        df: pd.DataFrame,
+        n: int,
+        columns: List[str]) -> None:
     """
     Print the bottom `n` unique values for specified columns in the DataFrame.
 
@@ -641,12 +669,21 @@ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
         for column in columns:
             if column in df.columns:
                 frequency = df[column].astype(str).value_counts(dropna=False)
-                frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+                frequency = frequency.rename(
+                    index={
+                        'nan': 'NaN',
+                        'NaT': 'NaT',
+                        'None': 'None',
+                        '': 'Empty'})
                 bottom_n_values = frequency.nsmallest(n)
                 report[column] = {
                     str(value): str(count) for value,
                     count in bottom_n_values.items()}
-                print(f"Bottom {n} unique values for column '{column}':\n{json.dumps(report[column], indent=2)}\n")
+                print(
+                    f"Bottom {n} unique values for column '{column}':\n{
+                        json.dumps(
+                            report[column],
+                            indent=2)}\n")
             else:
                 print(f"Column '{column}' does not exist in the DataFrame.")
     else:
@@ -656,7 +693,8 @@ def bottom_n_unique_values(df: pd.DataFrame, n: int, columns: List[str]) -> None
     gc.collect()
 
 
-def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
+def print_correlation(
+        df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) -> None:
     """
     Print correlation for multiple pairs of columns in the DataFrame.
 
@@ -675,13 +713,16 @@ def print_correlation(df: pd.DataFrame, column_pairs: List[Tuple[str, str]]) ->
 
                 correlation = numeric_col1.corr(numeric_col2)
                 if pd.notnull(correlation):
-                    print(f"The correlation between '{col1}' and '{col2}' is {correlation}.")
+                    print(
+                        f"The correlation between '{col1}' and '{col2}' is {correlation}.")
                 else:
-                    print(f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
+                    print(
+                        f"Cannot calculate correlation between '{col1}' and '{col2}' due to insufficient numeric data.")
             except Exception as e:
                 print(f"Error processing cols '{col1}' and '{col2}': {e}")
         else:
-            print(f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
+            print(
+                f"One or both of the specified cols ('{col1}', '{col2}') do not exist in the DataFrame.")
     else:
         print("The DataFrame is empty.")
 
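A short usage sketch of `print_correlation`, with invented column names:

    import pandas as pd
    from rgwfuncs import print_correlation

    df = pd.DataFrame({'height': [160, 170, 180], 'weight': [55, 65, 80]})
    print_correlation(df, [('height', 'weight')])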
@@ -701,7 +742,8 @@ def print_memory_usage(df: pd.DataFrame) -> None:
     - ValueError: If the DataFrame is `None`.
     """
     if df is not None:
-        memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)  # Convert bytes to MB
+        memory_usage = df.memory_usage(deep=True).sum(
+        ) / (1024 * 1024)  # Convert bytes to MB
         print(f"Memory usage of DataFrame: {memory_usage:.2f} MB")
     else:
         raise ValueError("No DataFrame to print. Please provide a DataFrame.")
@@ -782,7 +824,8 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
     """
     if df is not None:
         print(df)
-        columns_with_types = [f"{col} ({df[col].dtypes})" for col in df.columns]
+        columns_with_types = [
+            f"{col} ({df[col].dtypes})" for col in df.columns]
         print("Columns:", columns_with_types)
         if source:
             print(f"Source: {source}")
@@ -820,7 +863,9 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
     config_path = os.path.expanduser('~/.rgwfuncsrc')
     config = get_config(config_path)
 
-    bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
+    bot_config = next(
+        (bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name),
+        None)
     if not bot_config:
         raise ValueError(f"No bot found with the name {bot_name}")
 
@@ -834,9 +879,15 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
         df.to_csv(file_name, index=False)
         try:
             with open(file_name, 'rb') as file:
-                payload = {'chat_id': bot_config['chat_id'], 'caption': message or ''}
+                payload = {
+                    'chat_id': bot_config['chat_id'],
+                    'caption': message or ''}
                 files = {'document': file}
-                response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendDocument", data=payload, files=files)
+                response = requests.post(
+                    f"https://api.telegram.org/bot{
+                        bot_config['bot_token']}/sendDocument",
+                    data=payload,
+                    files=files)
             if remove_after_send and os.path.exists(file_name):
                 os.remove(file_name)
         except Exception as e:
@@ -849,7 +900,8 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
             'text': (message + "\n\n" + df_str) if message else df_str,
             'parse_mode': 'HTML'
         }
-        response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
+        response = requests.post(
+            f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
 
     if response and not response.ok:
         raise Exception(f"Error sending message: {response.text}")
@@ -857,7 +909,14 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
         print("Message sent successfully.")
 
 
-def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subject: Optional[str] = None, body: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
+def send_data_to_email(
+        df: pd.DataFrame,
+        preset_name: str,
+        to_email: str,
+        subject: Optional[str] = None,
+        body: Optional[str] = None,
+        as_file: bool = True,
+        remove_after_send: bool = True) -> None:
     """
     Send an email with an optional DataFrame attachment using the Gmail API via a specified preset.
 
@@ -885,7 +944,9 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
     except json.JSONDecodeError as e:
         raise ValueError(f"Invalid JSON format in config file: {e}")
 
-    def authenticate_service_account(service_account_credentials_path: str, sender_email_id: str) -> Any:
+    def authenticate_service_account(
+            service_account_credentials_path: str,
+            sender_email_id: str) -> Any:
         credentials = service_account.Credentials.from_service_account_file(
             service_account_credentials_path,
             scopes=['https://mail.google.com/'],
@@ -898,7 +959,9 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
     config = get_config(config_path)
 
     # Retrieve Gmail preset configuration
-    gmail_config = next((preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name), None)
+    gmail_config = next(
+        (preset for preset in config['gmail_bot_presets'] if preset['name'] == preset_name),
+        None)
 
     if not gmail_config:
         raise ValueError(f"No preset found with the name {preset_name}")
@@ -921,13 +984,18 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
     message['to'] = to_email
     message['from'] = sender_email
     message['subject'] = subject if subject else 'DataFrame CSV File'
-    message.attach(MIMEText(body if body else 'Please find the CSV file attached.'))
+    message.attach(
+        MIMEText(
+            body if body else 'Please find the CSV file attached.'))
 
     with open(tmp_file_name, 'rb') as file:
         part = MIMEBase('application', 'octet-stream')
         part.set_payload(file.read())
         encoders.encode_base64(part)
-        part.add_header('Content-Disposition', f'attachment; filename={os.path.basename(tmp_file_name)}')
+        part.add_header(
+            'Content-Disposition',
+            f'attachment; filename={
+                os.path.basename(tmp_file_name)}')
         message.attach(part)
 
     if remove_after_send and os.path.exists(tmp_file_name):
@@ -949,13 +1017,19 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
     try:
         raw = base64.urlsafe_b64encode(message.as_bytes()).decode()
        email_body = {'raw': raw}
-        sent_message = service.users().messages().send(userId="me", body=email_body).execute()
+        sent_message = service.users().messages().send(
+            userId="me", body=email_body).execute()
         print(f"Email with Message Id {sent_message['id']} successfully sent.")
     except Exception as error:
         raise Exception(f"Error sending email: {error}")
 
 
-def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
+def send_data_to_slack(
+        df: pd.DataFrame,
+        bot_name: str,
+        message: Optional[str] = None,
+        as_file: bool = True,
+        remove_after_send: bool = True) -> None:
     """
     Send a DataFrame or message to Slack using a specified bot configuration.
 
@@ -983,7 +1057,9 @@ def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] =
     config_path = os.path.expanduser('~/.rgwfuncsrc')
     config = get_config(config_path)
 
-    bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
+    bot_config = next(
+        (bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name),
+        None)
 
     if not bot_config:
         raise ValueError(f"No bot found with the name {bot_name}")
@@ -1070,7 +1146,11 @@ def order_columns(df: pd.DataFrame, column_order_str: str) -> pd.DataFrame:
     return df[new_order]
 
 
-def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+def append_ranged_classification_column(
+        df: pd.DataFrame,
+        ranges: str,
+        target_col: str,
+        new_col_name: str) -> pd.DataFrame:
     """
     Append a ranged classification column to the DataFrame.
 
@@ -1138,16 +1218,27 @@ def append_ranged_classification_column(df: pd.DataFrame, ranges: str, target_co
         for r in range_list
     )
 
-    labels = [f"{pad_number(range_list[i], max_integer_length)} to {pad_number(range_list[i + 1], max_integer_length)}" for i in range(len(range_list) - 1)]
+    labels = [f"{pad_number(range_list[i],
+                            max_integer_length)} to {pad_number(range_list[i + 1],
+                                                                max_integer_length)}" for i in range(len(range_list) - 1)]
 
     # Ensure the target column is numeric
     df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
-    df[new_col_name] = pd.cut(df[target_col], bins=range_list, labels=labels, right=False, include_lowest=True)
+    df[new_col_name] = pd.cut(
+        df[target_col],
+        bins=range_list,
+        labels=labels,
+        right=False,
+        include_lowest=True)
 
     return df
 
 
-def append_percentile_classification_column(df: pd.DataFrame, percentiles: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+def append_percentile_classification_column(
+        df: pd.DataFrame,
+        percentiles: str,
+        target_col: str,
+        new_col_name: str) -> pd.DataFrame:
     """
     Append a percentile classification column to the DataFrame.
 
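A hedged usage sketch of the ranged classifier shown above — the `ranges` argument is a string parsed into numeric bin edges, and the comma-separated format here is an assumption based on the `range_list` handling, as are the column names:

    import pandas as pd
    from rgwfuncs import append_ranged_classification_column

    df = pd.DataFrame({'age': [5, 25, 40, 70]})
    # Assumed format: comma-separated bin edges; adds an 'age_band' label column.
    df = append_ranged_classification_column(df, '0,18,65,100', 'age', 'age_band')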
@@ -1175,14 +1266,21 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str,
 
     if has_decimals:
         percentiles_list = [float(p) for p in percentiles_list]
-        max_decimal_length = max(len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
-        max_integer_length = max(len(str(int(float(p)))) for p in percentiles_list)
+        max_decimal_length = max(
+            len(str(p).split('.')[1]) for p in percentiles_list if '.' in str(p))
+        max_integer_length = max(len(str(int(float(p))))
+                                 for p in percentiles_list)
 
         labels = []
 
         for i in range(len(percentiles_list) - 1):
-            start = pad_number(percentiles_list[i], max_integer_length, max_decimal_length, decimal=True)
-            end = pad_number(percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)
+            start = pad_number(
+                percentiles_list[i],
+                max_integer_length,
+                max_decimal_length,
+                decimal=True)
+            end = pad_number(
+                percentiles_list[i + 1], max_integer_length, max_decimal_length, decimal=True)
 
             label = f"{start} to {end}"
             labels.append(label)
@@ -1205,12 +1303,20 @@ def append_percentile_classification_column(df: pd.DataFrame, percentiles: str,
     df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
     quantiles = [df[target_col].quantile(p / 100) for p in percentiles_list]
 
-    df[new_col_name] = pd.cut(df[target_col], bins=quantiles, labels=labels, include_lowest=True)
+    df[new_col_name] = pd.cut(
+        df[target_col],
+        bins=quantiles,
+        labels=labels,
+        include_lowest=True)
 
     return df
 
 
-def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str, target_col: str, new_col_name: str) -> pd.DataFrame:
+def append_ranged_date_classification_column(
+        df: pd.DataFrame,
+        date_ranges: str,
+        target_col: str,
+        new_col_name: str) -> pd.DataFrame:
     """
     Append a ranged date classification column to the DataFrame.
 
@@ -1243,7 +1349,9 @@ def append_ranged_date_classification_column(df: pd.DataFrame, date_ranges: str,
     return df
 
 
-def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFrame:
+def rename_columns(df: pd.DataFrame,
+                   rename_pairs: Dict[str,
+                                      str]) -> pd.DataFrame:
     """
     Rename columns in the DataFrame.
 
@@ -1255,7 +1363,8 @@ def rename_columns(df: pd.DataFrame, rename_pairs: Dict[str, str]) -> pd.DataFra
     A new DataFrame with columns renamed.
     """
     if df is None:
-        raise ValueError("No DataFrame to rename columns. Please provide a valid DataFrame.")
+        raise ValueError(
+            "No DataFrame to rename columns. Please provide a valid DataFrame.")
 
     return df.rename(columns=rename_pairs)
 
@@ -1273,7 +1382,8 @@ def cascade_sort(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
     A new DataFrame sorted by specified columns.
     """
     if df is None:
-        raise ValueError("No DataFrame to sort. Please provide a valid DataFrame.")
+        raise ValueError(
+            "No DataFrame to sort. Please provide a valid DataFrame.")
 
     col_names = []
     asc_order = []
@@ -1308,7 +1418,8 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
     A new DataFrame with XGB_TYPE labels appended.
     """
     if df is None:
-        raise ValueError("No DataFrame to add labels. Please provide a valid DataFrame.")
+        raise ValueError(
+            "No DataFrame to add labels. Please provide a valid DataFrame.")
 
     ratios = list(map(int, ratio_str.split(':')))
     total_ratio = sum(ratios)
@@ -1325,7 +1436,8 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
         labels = ['TRAIN'] * train_rows + ['VALIDATE'] * \
             validate_rows + ['TEST'] * test_rows
     else:
-        raise ValueError("Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
+        raise ValueError(
+            "Invalid ratio string format. Use 'TRAIN:TEST' or 'TRAIN:VALIDATE:TEST'.")
 
     df_with_labels = df.copy()
     df_with_labels['XGB_TYPE'] = labels
@@ -1333,7 +1445,13 @@ def append_xgb_labels(df: pd.DataFrame, ratio_str: str) -> pd.DataFrame:
     return df_with_labels
 
 
-def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
+def append_xgb_regression_predictions(
+        df: pd.DataFrame,
+        target_col: str,
+        feature_cols: str,
+        pred_col: str,
+        boosting_rounds: int = 100,
+        model_path: Optional[str] = None) -> pd.DataFrame:
     """
     Append XGB regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
 
@@ -1349,7 +1467,8 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
     DataFrame with predictions appended.
     """
     if df is None or 'XGB_TYPE' not in df.columns:
-        raise ValueError("DataFrame is not initialized or 'XGB_TYPE' column is missing.")
+        raise ValueError(
+            "DataFrame is not initialized or 'XGB_TYPE' column is missing.")
 
     features = feature_cols.replace(' ', '').split(',')
 
@@ -1365,16 +1484,27 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
     else:
         validate_data = None
 
-    dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+    dtrain = xgb.DMatrix(
+        train_data[features],
+        label=train_data[target_col],
+        enable_categorical=True)
     evals = [(dtrain, 'train')]
 
     if validate_data is not None:
-        dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+        dvalidate = xgb.DMatrix(
+            validate_data[features],
+            label=validate_data[target_col],
+            enable_categorical=True)
         evals.append((dvalidate, 'validate'))
 
     params = {'objective': 'reg:squarederror', 'eval_metric': 'rmse'}
 
-    model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=boosting_rounds,
+        evals=evals,
+        early_stopping_rounds=10 if validate_data is not None else None)
 
     # Make predictions for all data
     dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1383,13 +1513,20 @@ def append_xgb_regression_predictions(df: pd.DataFrame, target_col: str, feature
     if model_path:
         model.save_model(model_path)
 
-    columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+    columns_order = [col for col in df.columns if col not in [
+        'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
     df = df[columns_order]
 
     return df
 
 
-def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str, feature_cols: str, pred_col: str, boosting_rounds: int = 100, model_path: Optional[str] = None) -> pd.DataFrame:
+def append_xgb_logistic_regression_predictions(
+        df: pd.DataFrame,
+        target_col: str,
+        feature_cols: str,
+        pred_col: str,
+        boosting_rounds: int = 100,
+        model_path: Optional[str] = None) -> pd.DataFrame:
     """
     Append XGB logistic regression predictions to DataFrame. Assumes data is labeled by an 'XGB_TYPE' column.
 
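A hedged end-to-end sketch of the XGB helpers above: label rows, then append regression predictions. The column names and the 70:20:10 split are illustrative; the ratio and comma-separated feature formats come from the code itself:

    import pandas as pd
    from rgwfuncs import append_xgb_labels, append_xgb_regression_predictions

    df = pd.DataFrame({'x1': range(100), 'x2': range(100, 200),
                       'y': [2 * i for i in range(100)]})
    df = append_xgb_labels(df, '70:20:10')   # adds the XGB_TYPE column
    df = append_xgb_regression_predictions(
        df, target_col='y', feature_cols='x1,x2', pred_col='y_pred')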
@@ -1421,16 +1558,27 @@ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str
     if 'VALIDATE' in df['XGB_TYPE'].values:
         validate_data = df[df['XGB_TYPE'] == 'VALIDATE']
 
-    dtrain = xgb.DMatrix(train_data[features], label=train_data[target_col], enable_categorical=True)
+    dtrain = xgb.DMatrix(
+        train_data[features],
+        label=train_data[target_col],
+        enable_categorical=True)
     evals = [(dtrain, 'train')]
 
     if validate_data is not None:
-        dvalidate = xgb.DMatrix(validate_data[features], label=validate_data[target_col], enable_categorical=True)
+        dvalidate = xgb.DMatrix(
+            validate_data[features],
+            label=validate_data[target_col],
+            enable_categorical=True)
         evals.append((dvalidate, 'validate'))
 
     params = {'objective': 'binary:logistic', 'eval_metric': 'auc'}
 
-    model = xgb.train(params, dtrain, num_boost_round=boosting_rounds, evals=evals, early_stopping_rounds=10 if validate_data is not None else None)
+    model = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=boosting_rounds,
+        evals=evals,
+        early_stopping_rounds=10 if validate_data is not None else None)
 
     # Make predictions for all data
     dall = xgb.DMatrix(df[features], enable_categorical=True)
@@ -1439,13 +1587,18 @@ def append_xgb_logistic_regression_predictions(df: pd.DataFrame, target_col: str
     if model_path:
         model.save_model(model_path)
 
-    columns_order = [col for col in df.columns if col not in ['XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
+    columns_order = [col for col in df.columns if col not in [
+        'XGB_TYPE', target_col, pred_col]] + ['XGB_TYPE', target_col, pred_col]
     df = df[columns_order]
 
     return df
 
 
-def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
+def print_n_frequency_cascading(
+        df: pd.DataFrame,
+        n: int,
+        columns: str,
+        order_by: str = "FREQ_DESC") -> None:
     """
     Print the cascading frequency of top n values for specified columns.
 
@@ -1468,7 +1621,12 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
         # Convert the column to string representation
         df[current_col] = df[current_col].astype(str)
         frequency = df[current_col].value_counts(dropna=False)
-        frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+        frequency = frequency.rename(
+            index={
+                'nan': 'NaN',
+                'NaT': 'NaT',
+                'None': 'None',
+                '': 'Empty'})
 
         if limit is not None:
             frequency = frequency.nlargest(limit)
@@ -1483,8 +1641,11 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
             filtered_df = df[df[current_col] == value]
 
             if len(columns) > 1:
-                sub_report = generate_cascade_report(filtered_df, columns[1:], limit, order_by)
-                report[value] = {"count": str(count), f"sub_distribution({columns[1]})": sub_report if sub_report else {}}
+                sub_report = generate_cascade_report(
+                    filtered_df, columns[1:], limit, order_by)
+                report[value] = {
+                    "count": str(count), f"sub_distribution({
+                        columns[1]})": sub_report if sub_report else {}}
             else:
                 report[value] = {"count": str(count)}
 
@@ -1494,17 +1655,29 @@ def print_n_frequency_cascading(df: pd.DataFrame, n: int, columns: str, order_by
         if order_by == "ASC":
             return dict(sorted(frequency.items(), key=lambda item: item[0]))
         elif order_by == "DESC":
-            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[0],
+                    reverse=True))
         elif order_by == "FREQ_ASC":
             return dict(sorted(frequency.items(), key=lambda item: item[1]))
         else:  # Default to "FREQ_DESC"
-            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[1],
+                    reverse=True))
 
     report = generate_cascade_report(df, columns, n, order_by)
     print(json.dumps(report, indent=2))
 
 
-def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: str = "FREQ_DESC") -> None:
+def print_n_frequency_linear(
+        df: pd.DataFrame,
+        n: int,
+        columns: str,
+        order_by: str = "FREQ_DESC") -> None:
     """
     Print the linear frequency of top n values for specified columns.
 
@@ -1524,13 +1697,19 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: s
             continue
 
         frequency = df[current_col].astype(str).value_counts(dropna=False)
-        frequency = frequency.rename(index={'nan': 'NaN', 'NaT': 'NaT', 'None': 'None', '': 'Empty'})
+        frequency = frequency.rename(
+            index={
+                'nan': 'NaN',
+                'NaT': 'NaT',
+                'None': 'None',
+                '': 'Empty'})
 
         if limit is not None:
             frequency = frequency.nlargest(limit)
 
         sorted_frequency = sort_frequency(frequency, order_by)
-        col_report = {str(value): str(count) for value, count in sorted_frequency.items()}
+        col_report = {str(value): str(count)
+                      for value, count in sorted_frequency.items()}
         report[current_col] = col_report
 
     return report
@@ -1539,17 +1718,27 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: str, order_by: s
         if order_by == "ASC":
             return dict(sorted(frequency.items(), key=lambda item: item[0]))
         elif order_by == "DESC":
-            return dict(sorted(frequency.items(), key=lambda item: item[0], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[0],
+                    reverse=True))
         elif order_by == "FREQ_ASC":
            return dict(sorted(frequency.items(), key=lambda item: item[1]))
         else:  # Default to "FREQ_DESC"
-            return dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True))
+            return dict(
+                sorted(
+                    frequency.items(),
+                    key=lambda item: item[1],
+                    reverse=True))
 
     report = generate_linear_report(df, columns, n, order_by)
     print(json.dumps(report, indent=2))
 
 
-def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFrame:
+def retain_columns(
+        df: pd.DataFrame,
+        columns_to_retain: List[str]) -> pd.DataFrame:
     """
     Retain specified columns in the DataFrame and drop the others.
 
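A short sketch of calling the linear frequency printer above — the column name and data are invented, and the `order_by` values come from the branches in `sort_frequency`:

    import pandas as pd
    from rgwfuncs import print_n_frequency_linear

    df = pd.DataFrame({'state': ['KA', 'KA', 'MH', 'MH', 'MH', 'TN']})
    print_n_frequency_linear(df, 2, 'state', order_by='FREQ_DESC')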
@@ -1565,7 +1754,10 @@ def retain_columns(df: pd.DataFrame, columns_to_retain: List[str]) -> pd.DataFra
     return df[columns_to_retain]
 
 
-def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+def mask_against_dataframe(
+        df: pd.DataFrame,
+        other_df: pd.DataFrame,
+        column_name: str) -> pd.DataFrame:
     """
     Retain only rows with common column values between two DataFrames.
 
@@ -1582,7 +1774,10 @@ def mask_against_dataframe(df: pd.DataFrame, other_df: pd.DataFrame, column_name
     return df[df[column_name].isin(other_df[column_name])]
 
 
-def mask_against_dataframe_converse(df: pd.DataFrame, other_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
+def mask_against_dataframe_converse(
+        df: pd.DataFrame,
+        other_df: pd.DataFrame,
+        column_name: str) -> pd.DataFrame:
     """
     Retain only rows with uncommon column values between two DataFrames.
 
@@ -1616,7 +1811,8 @@ def union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
     ValueError: If the DataFrames do not have the same columns.
     """
     if set(df1.columns) != set(df2.columns):
-        raise ValueError("Both DataFrames must have the same columns for a union join")
+        raise ValueError(
+            "Both DataFrames must have the same columns for a union join")
 
     result_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
     return result_df
@@ -1637,13 +1833,18 @@ def bag_union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
     ValueError: If the DataFrames do not have the same columns.
     """
     if set(df1.columns) != set(df2.columns):
-        raise ValueError("Both DataFrames must have the same columns for a bag union join")
+        raise ValueError(
+            "Both DataFrames must have the same columns for a bag union join")
 
     result_df = pd.concat([df1, df2], ignore_index=True)
     return result_df
 
 
-def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
+def left_join(
+        df1: pd.DataFrame,
+        df2: pd.DataFrame,
+        left_on: str,
+        right_on: str) -> pd.DataFrame:
     """
     Perform a left join on two DataFrames.
 
@@ -1659,7 +1860,11 @@ def left_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str)
     return df1.merge(df2, how='left', left_on=left_on, right_on=right_on)
 
 
-def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str) -> pd.DataFrame:
+def right_join(
+        df1: pd.DataFrame,
+        df2: pd.DataFrame,
+        left_on: str,
+        right_on: str) -> pd.DataFrame:
     """
     Perform a right join on two DataFrames.
 
@@ -1683,7 +1888,7 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
     db_path: str
         The file path to the SQLite database. If the database does not exist,
         it will be created.
-
+
     tablename: str
         The name of the table where the data will be inserted. If the table does
         not exist, it will be created based on the DataFrame's columns and types.
@@ -1697,8 +1902,8 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
     - Inserts the DataFrame's data into the table, appending to any existing data.
 
     Data Type Mapping:
-    - Converts Pandas data types to SQLite types: 'int64' to 'INTEGER',
-      'float64' to 'REAL', 'object' to 'TEXT', 'datetime64[ns]' to 'TEXT',
+    - Converts Pandas data types to SQLite types: 'int64' to 'INTEGER',
+      'float64' to 'REAL', 'object' to 'TEXT', 'datetime64[ns]' to 'TEXT',
       and 'bool' to 'INTEGER'.
 
     Returns:
@@ -1706,10 +1911,10 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
     """
 
     def table_exists(cursor, table_name):
-        cursor.execute(f"SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{table_name}'")
+        cursor.execute(
+            f"SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{table_name}'")
         return cursor.fetchone()[0] == 1
 
-
     dtype_mapping = {
         'int64': 'INTEGER',
         'float64': 'REAL',
@@ -1726,15 +1931,21 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
 
     if not table_exists(cursor, tablename):
         columns_with_types = ', '.join(
-            f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(
-                df.columns, df.dtypes))
+            f'"{col}" {
+                map_dtype(dtype)}' for col,
+            dtype in zip(
+                df.columns,
+                df.dtypes))
         create_table_query = f'CREATE TABLE "{tablename}" ({columns_with_types})'
         conn.execute(create_table_query)
 
     df.to_sql(tablename, conn, if_exists='append', index=False)
 
 
-def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
+def sync_dataframe_to_sqlite_database(
+        db_path: str,
+        tablename: str,
+        df: pd.DataFrame) -> None:
     """
     Processes and saves a DataFrame to an SQLite database, adding a timestamp column
     and replacing the existing table if needed. Creates the table if it does not exist.
@@ -1770,8 +1981,11 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataF
         if cursor.fetchall() == []:  # Table does not exist
             # Create a table using the DataFrame's column names and types
             columns_with_types = ', '.join(
-                f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(
-                    df.columns, df.dtypes))
+                f'"{col}" {
+                    map_dtype(dtype)}' for col,
+                dtype in zip(
+                    df.columns,
+                    df.dtypes))
             create_table_query = f'CREATE TABLE "{new_table_name}" ({columns_with_types})'
             conn.execute(create_table_query)
 
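A quick sketch of the two SQLite helpers, using an invented path and table name:

    import pandas as pd
    from rgwfuncs import (insert_dataframe_in_sqlite_database,
                          sync_dataframe_to_sqlite_database)

    df = pd.DataFrame({'id': [1, 2], 'score': [0.5, 0.9]})
    insert_dataframe_in_sqlite_database('/tmp/demo.db', 'scores', df)  # appends rows
    sync_dataframe_to_sqlite_database('/tmp/demo.db', 'scores', df)    # replaces the table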
rgwfuncs-0.0.19/src/rgwfuncs/str_lib.py +104 -0

@@ -0,0 +1,104 @@
+import os
+import json
+import requests
+import inspect
+from typing import Tuple, Optional, Dict, Callable
+import warnings
+
+# Suppress all FutureWarnings
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+
+def str_docs(method_type_filter: Optional[str] = None) -> None:
+    """
+    Print a list of function names in alphabetical order. If method_type_filter
+    is specified, print the docstrings of the functions that match the filter.
+    Using '*' as a filter will print the docstrings for all functions.
+
+    Parameters:
+        method_type_filter: Optional filter string representing a function name,
+        or '*' to display docstrings for all functions.
+    """
+    # Get the current module's namespace
+    current_module = __name__
+
+    local_functions: Dict[str, Callable] = {
+        name: obj for name, obj in globals().items()
+        if inspect.isfunction(obj) and obj.__module__ == current_module
+    }
+
+    # List of function names sorted alphabetically
+    function_names = sorted(local_functions.keys())
+
+    # Print function names
+    print("Functions in alphabetical order:")
+    for name in function_names:
+        print(name)
+
+    # If a filter is provided or '*', print the docstrings of functions
+    if method_type_filter:
+        # print("\nFiltered function documentation:")
+        for name, func in local_functions.items():
+            docstring: Optional[str] = func.__doc__
+            if docstring:
+                if method_type_filter == '*' or method_type_filter == name:
+                    # Print the entire docstring for the matching function
+                    print(f"\n{name}:\n{docstring}")
+
+
+def send_telegram_message(preset_name: str, message: str) -> None:
+    """Send a Telegram message using the specified preset.
+
+    Args:
+        preset_name (str): The name of the preset to use for sending the message.
+        message (str): The message to send.
+
+    Raises:
+        RuntimeError: If the preset is not found or necessary details are missing.
+    """
+
+    # Set the config path to ~/.rgwfuncsrc
+    config_path = os.path.expanduser("~/.rgwfuncsrc")
+
+    def load_config() -> dict:
+        """Load the configuration from the .rgwfuncsrc file."""
+        with open(config_path, 'r') as file:
+            return json.load(file)
+
+    def get_telegram_preset(config: dict, preset_name: str) -> dict:
+        """Get the Telegram preset configuration."""
+        presets = config.get("telegram_bot_presets", [])
+        for preset in presets:
+            if preset.get("name") == preset_name:
+                return preset
+        return None
+
+    def get_telegram_bot_details(config: dict, preset_name: str) -> Tuple[str, str]:
+        """Retrieve the Telegram bot token and chat ID from the preset."""
+        preset = get_telegram_preset(config, preset_name)
+        if not preset:
+            raise RuntimeError(f"Telegram bot preset '{preset_name}' not found in the configuration file")
+
+        bot_token = preset.get("bot_token")
+        chat_id = preset.get("chat_id")
+
+        if not bot_token or not chat_id:
+            raise RuntimeError(
+                f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file"
+            )
+
+        return bot_token, chat_id
+
+    # Load the configuration
+    config = load_config()
+
+    # Get bot details from the configuration
+    bot_token, chat_id = get_telegram_bot_details(config, preset_name)
+
+    # Prepare the request
+    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
+    payload = {"chat_id": chat_id, "text": message}
+
+    # Send the message
+    response = requests.post(url, json=payload)
+    response.raise_for_status()
{rgwfuncs-0.0.17 → rgwfuncs-0.0.19/src/rgwfuncs.egg-info}/PKG-INFO +41 -4

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: rgwfuncs
-Version: 0.0.17
+Version: 0.0.19
 Summary: A functional programming paradigm for mathematical modelling and data science
 Home-page: https://github.com/ryangerardwilson/rgwfunc
 Author: Ryan Gerard Wilson
@@ -135,11 +135,48 @@ To display all docstrings, use:
 
 --------------------------------------------------------------------------------
 
-##
+## String Based Functions
+
+### 1. str_docs
+Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
+
+• Parameters:
+- `method_type_filter` (str): Optional, comma-separated to select docstring types, or '*' for all.
+
+• Example:
+
+    import rgwfuncs
+    rgwfuncs.str_docs(method_type_filter='numeric_clean,limit_dataframe')
+
+--------------------------------------------------------------------------------
+
+### 2. send_telegram_message
+
+Send a message to a Telegram chat using a specified preset from your configuration file.
+
+• Parameters:
+- `preset_name` (str): The name of the preset to use for sending the message. This should match a preset in the configuration file.
+- `message` (str): The message text that you want to send to the Telegram chat.
+
+• Raises:
+- `RuntimeError`: If the preset is not found in the configuration file or if necessary details (bot token or chat ID) are missing.
+
+• Example:
+
+    from rgwfuncs import send_telegram_message
+
+    preset_name = "daily_updates"
+    message = "Here is your daily update!"
+
+    send_telegram_message(preset_name, message)
+
+--------------------------------------------------------------------------------
+
+## Dataframe Based Functions
 
 Below is a quick reference of available functions, their purpose, and basic usage examples.
 
-### 1. docs
+### 1. df_docs
 Print a list of available function names in alphabetical order. If a filter is provided, print the matching docstrings.
 
 • Parameters:
@@ -148,7 +185,7 @@ Print a list of available function names in alphabetical order. If a filter is p
 • Example:
 
     import rgwfuncs
-    rgwfuncs.docs(method_type_filter='numeric_clean,limit_dataframe')
+    rgwfuncs.df_docs(method_type_filter='numeric_clean,limit_dataframe')
 
 --------------------------------------------------------------------------------
 
rgwfuncs-0.0.17/src/rgwfuncs/__init__.py +0 -4

@@ -1,4 +0,0 @@
-# This file is automatically generated
-# Dynamically importing functions from modules
-
-from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows