rgwfuncs 0.0.15__tar.gz → 0.0.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rgwfuncs-0.0.15/src/rgwfuncs.egg-info → rgwfuncs-0.0.17}/PKG-INFO +59 -20
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/README.md +58 -19
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/pyproject.toml +1 -1
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/setup.cfg +1 -1
- rgwfuncs-0.0.17/src/rgwfuncs/__init__.py +4 -0
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/src/rgwfuncs/df_lib.py +131 -120
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17/src/rgwfuncs.egg-info}/PKG-INFO +59 -20
- rgwfuncs-0.0.15/src/rgwfuncs/__init__.py +0 -4
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/LICENSE +0 -0
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/src/rgwfuncs.egg-info/SOURCES.txt +0 -0
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/src/rgwfuncs.egg-info/dependency_links.txt +0 -0
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/src/rgwfuncs.egg-info/entry_points.txt +0 -0
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/src/rgwfuncs.egg-info/requires.txt +0 -0
- {rgwfuncs-0.0.15 → rgwfuncs-0.0.17}/src/rgwfuncs.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: rgwfuncs
|
3
|
-
Version: 0.0.15
|
3
|
+
Version: 0.0.17
|
4
4
|
Summary: A functional programming paradigm for mathematical modelling and data science
|
5
5
|
Home-page: https://github.com/ryangerardwilson/rgwfunc
|
6
6
|
Author: Ryan Gerard Wilson
|
@@ -40,9 +40,9 @@ Install the package using:
|
|
40
40
|
|
41
41
|
--------------------------------------------------------------------------------
|
42
42
|
|
43
|
-
## Create a `
|
43
|
+
## Create a `.rgwfuncsrc` File
|
44
44
|
|
45
|
-
A `
|
45
|
+
A `.rgwfuncsrc` file (located at `~/.rgwfuncsrc`) is required for MSSQL, CLICKHOUSE, MYSQL, GOOGLE BIG QUERY, SLACK, TELEGRAM, and GMAIL integrations.
|
46
46
|
|
47
47
|
{
|
48
48
|
"db_presets" : [
|
@@ -381,28 +381,30 @@ Drop duplicate rows based on specified columns, retaining the last occurrence.
|
|
381
381
|
--------------------------------------------------------------------------------
|
382
382
|
|
383
383
|
### 12. `load_data_from_query`
|
384
|
+
|
384
385
|
Load data from a database query into a DataFrame based on a configuration preset.
|
385
386
|
|
386
|
-
|
387
|
-
- `db_preset_name` (str): Name of the database preset in the
|
388
|
-
- query (str): The SQL query to execute.
|
389
|
-
- `config_file_name` (str): Name of the configuration file (default: "rgwml.config").
|
387
|
+
- **Parameters:**
|
388
|
+
- `db_preset_name` (str): Name of the database preset in the configuration file.
|
389
|
+
- `query` (str): The SQL query to execute.
|
390
390
|
|
391
|
-
|
392
|
-
- pd.DataFrame
|
391
|
+
- **Returns:**
|
392
|
+
- `pd.DataFrame`: A DataFrame containing the query result.
|
393
393
|
|
394
|
-
|
395
|
-
|
396
|
-
from rgwfuncs import load_data_from_query
|
394
|
+
- **Notes:**
|
395
|
+
- The configuration file is assumed to be located at `~/.rgwfuncsrc`.
|
397
396
|
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
397
|
+
- **Example:**
|
398
|
+
|
399
|
+
from rgwfuncs import load_data_from_query
|
400
|
+
|
401
|
+
df = load_data_from_query(
|
402
|
+
db_preset_name="MyDBPreset",
|
403
|
+
query="SELECT * FROM my_table"
|
404
|
+
)
|
405
|
+
print(df)
|
405
406
|
|
407
|
+
|
406
408
|
--------------------------------------------------------------------------------
|
407
409
|
|
408
410
|
### 13. `load_data_from_path`
|
@@ -1148,10 +1150,47 @@ Perform a right join on two DataFrames.
|
|
1148
1150
|
df_right_join = right_join(df1, df2, 'ID', 'ID')
|
1149
1151
|
print(df_right_join)
|
1150
1152
|
|
1153
|
+
--------------------------------------------------------------------------------
|
1154
|
+
|
1155
|
+
### 45. `insert_dataframe_in_sqlite_database`
|
1156
|
+
|
1157
|
+
Inserts a Pandas DataFrame into a SQLite database table. If the specified table does not exist, it will be created with column types automatically inferred from the DataFrame's data types.
|
1158
|
+
|
1159
|
+
- **Parameters:**
|
1160
|
+
- `db_path` (str): The path to the SQLite database file. If the database does not exist, it will be created.
|
1161
|
+
- `tablename` (str): The name of the table in the database. If the table does not exist, it is created with the DataFrame's columns and data types.
|
1162
|
+
- `df` (pd.DataFrame): The DataFrame containing the data to be inserted into the database table.
|
1163
|
+
|
1164
|
+
- **Returns:**
|
1165
|
+
- `None`
|
1166
|
+
|
1167
|
+
- **Notes:**
|
1168
|
+
- Data types in the DataFrame are converted to SQLite-compatible types:
|
1169
|
+
- `int64` is mapped to `INTEGER`
|
1170
|
+
- `float64` is mapped to `REAL`
|
1171
|
+
- `object` is mapped to `TEXT`
|
1172
|
+
- `datetime64[ns]` is mapped to `TEXT` (dates are stored as text)
|
1173
|
+
- `bool` is mapped to `INTEGER` (SQLite does not have a separate Boolean type)
|
1174
|
+
|
1175
|
+
- **Example:**
|
1176
|
+
|
1177
|
+
from rgwfuncs import insert_dataframe_in_sqlite_database
|
1178
|
+
import pandas as pd
|
1179
|
+
|
1180
|
+
df = pd.DataFrame({
|
1181
|
+
'ID': [1, 2, 3],
|
1182
|
+
'Name': ['Alice', 'Bob', 'Charlie'],
|
1183
|
+
'Score': [88.5, 92.3, 85.0]
|
1184
|
+
})
|
1185
|
+
|
1186
|
+
db_path = 'my_database.db'
|
1187
|
+
tablename = 'students'
|
1188
|
+
|
1189
|
+
insert_dataframe_in_sqlite_database(db_path, tablename, df)
|
1151
1190
|
|
1152
1191
|
--------------------------------------------------------------------------------
|
1153
1192
|
|
1154
|
-
###
|
1193
|
+
### 46. `sync_dataframe_to_sqlite_database`
|
1155
1194
|
Processes and saves a DataFrame to an SQLite database, adding a timestamp column and replacing the existing table if needed. Creates the table if it does not exist.
|
1156
1195
|
|
1157
1196
|
• Parameters:
|
@@ -14,9 +14,9 @@ Install the package using:
|
|
14
14
|
|
15
15
|
--------------------------------------------------------------------------------
|
16
16
|
|
17
|
-
## Create a `
|
17
|
+
## Create a `.rgwfuncsrc` File
|
18
18
|
|
19
|
-
A `
|
19
|
+
A `.rgwfuncsrc` file (located at `~/.rgwfuncsrc`) is required for MSSQL, CLICKHOUSE, MYSQL, GOOGLE BIG QUERY, SLACK, TELEGRAM, and GMAIL integrations.
|
20
20
|
|
21
21
|
{
|
22
22
|
"db_presets" : [
|
@@ -355,28 +355,30 @@ Drop duplicate rows based on specified columns, retaining the last occurrence.
|
|
355
355
|
--------------------------------------------------------------------------------
|
356
356
|
|
357
357
|
### 12. `load_data_from_query`
|
358
|
+
|
358
359
|
Load data from a database query into a DataFrame based on a configuration preset.
|
359
360
|
|
360
|
-
|
361
|
-
- `db_preset_name` (str): Name of the database preset in the
|
362
|
-
- query (str): The SQL query to execute.
|
363
|
-
- `config_file_name` (str): Name of the configuration file (default: "rgwml.config").
|
361
|
+
- **Parameters:**
|
362
|
+
- `db_preset_name` (str): Name of the database preset in the configuration file.
|
363
|
+
- `query` (str): The SQL query to execute.
|
364
364
|
|
365
|
-
|
366
|
-
- pd.DataFrame
|
365
|
+
- **Returns:**
|
366
|
+
- `pd.DataFrame`: A DataFrame containing the query result.
|
367
367
|
|
368
|
-
|
369
|
-
|
370
|
-
from rgwfuncs import load_data_from_query
|
368
|
+
- **Notes:**
|
369
|
+
- The configuration file is assumed to be located at `~/.rgwfuncsrc`.
|
371
370
|
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
371
|
+
- **Example:**
|
372
|
+
|
373
|
+
from rgwfuncs import load_data_from_query
|
374
|
+
|
375
|
+
df = load_data_from_query(
|
376
|
+
db_preset_name="MyDBPreset",
|
377
|
+
query="SELECT * FROM my_table"
|
378
|
+
)
|
379
|
+
print(df)
|
379
380
|
|
381
|
+
|
380
382
|
--------------------------------------------------------------------------------
|
381
383
|
|
382
384
|
### 13. `load_data_from_path`
|
@@ -1122,10 +1124,47 @@ Perform a right join on two DataFrames.
|
|
1122
1124
|
df_right_join = right_join(df1, df2, 'ID', 'ID')
|
1123
1125
|
print(df_right_join)
|
1124
1126
|
|
1127
|
+
--------------------------------------------------------------------------------
|
1128
|
+
|
1129
|
+
### 45. `insert_dataframe_in_sqlite_database`
|
1130
|
+
|
1131
|
+
Inserts a Pandas DataFrame into a SQLite database table. If the specified table does not exist, it will be created with column types automatically inferred from the DataFrame's data types.
|
1132
|
+
|
1133
|
+
- **Parameters:**
|
1134
|
+
- `db_path` (str): The path to the SQLite database file. If the database does not exist, it will be created.
|
1135
|
+
- `tablename` (str): The name of the table in the database. If the table does not exist, it is created with the DataFrame's columns and data types.
|
1136
|
+
- `df` (pd.DataFrame): The DataFrame containing the data to be inserted into the database table.
|
1137
|
+
|
1138
|
+
- **Returns:**
|
1139
|
+
- `None`
|
1140
|
+
|
1141
|
+
- **Notes:**
|
1142
|
+
- Data types in the DataFrame are converted to SQLite-compatible types:
|
1143
|
+
- `int64` is mapped to `INTEGER`
|
1144
|
+
- `float64` is mapped to `REAL`
|
1145
|
+
- `object` is mapped to `TEXT`
|
1146
|
+
- `datetime64[ns]` is mapped to `TEXT` (dates are stored as text)
|
1147
|
+
- `bool` is mapped to `INTEGER` (SQLite does not have a separate Boolean type)
|
1148
|
+
|
1149
|
+
- **Example:**
|
1150
|
+
|
1151
|
+
from rgwfuncs import insert_dataframe_in_sqlite_database
|
1152
|
+
import pandas as pd
|
1153
|
+
|
1154
|
+
df = pd.DataFrame({
|
1155
|
+
'ID': [1, 2, 3],
|
1156
|
+
'Name': ['Alice', 'Bob', 'Charlie'],
|
1157
|
+
'Score': [88.5, 92.3, 85.0]
|
1158
|
+
})
|
1159
|
+
|
1160
|
+
db_path = 'my_database.db'
|
1161
|
+
tablename = 'students'
|
1162
|
+
|
1163
|
+
insert_dataframe_in_sqlite_database(db_path, tablename, df)
|
1125
1164
|
|
1126
1165
|
--------------------------------------------------------------------------------
|
1127
1166
|
|
1128
|
-
###
|
1167
|
+
### 46. `sync_dataframe_to_sqlite_database`
|
1129
1168
|
Processes and saves a DataFrame to an SQLite database, adding a timestamp column and replacing the existing table if needed. Creates the table if it does not exist.
|
1130
1169
|
|
1131
1170
|
• Parameters:
|
@@ -0,0 +1,4 @@
|
|
1
|
+
# This file is automatically generated
|
2
|
+
# Dynamically importing functions from modules
|
3
|
+
|
4
|
+
from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, insert_dataframe_in_sqlite_database, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
|
@@ -23,6 +23,10 @@ from googleapiclient.discovery import build
|
|
23
23
|
import base64
|
24
24
|
import inspect
|
25
25
|
from typing import Optional, Callable, Dict, List, Tuple, Any
|
26
|
+
import warnings
|
27
|
+
|
28
|
+
# Suppress all FutureWarnings
|
29
|
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
26
30
|
|
27
31
|
|
28
32
|
def docs(method_type_filter: Optional[str] = None) -> None:
|
@@ -336,16 +340,13 @@ def drop_duplicates_retain_last(df: pd.DataFrame, columns: Optional[str] = None)
|
|
336
340
|
return df.drop_duplicates(subset=columns_list, keep='last')
|
337
341
|
|
338
342
|
|
339
|
-
def load_data_from_query(db_preset_name: str, query: str
|
343
|
+
def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
340
344
|
"""
|
341
|
-
Load data from a database query into a DataFrame based on a configuration
|
342
|
-
preset.
|
345
|
+
Load data from a database query into a DataFrame based on a configuration preset.
|
343
346
|
|
344
347
|
Parameters:
|
345
348
|
db_preset_name: The name of the database preset in the configuration file.
|
346
349
|
query: The SQL query to execute.
|
347
|
-
config_file_name: Name of the configuration file
|
348
|
-
(default: 'rgwml.config').
|
349
350
|
|
350
351
|
Returns:
|
351
352
|
A DataFrame containing the query result.
|
@@ -355,17 +356,6 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
|
|
355
356
|
ValueError: If the database preset or db_type is invalid.
|
356
357
|
"""
|
357
358
|
|
358
|
-
def locate_config_file(filename: str = config_file_name) -> str:
|
359
|
-
home_dir = os.path.expanduser("~")
|
360
|
-
search_paths = [os.path.join(home_dir, "Desktop"), os.path.join(home_dir, "Documents"), os.path.join(home_dir, "Downloads"),]
|
361
|
-
|
362
|
-
for path in search_paths:
|
363
|
-
for root, dirs, files in os.walk(path):
|
364
|
-
if filename in files:
|
365
|
-
return os.path.join(root, filename)
|
366
|
-
raise FileNotFoundError(
|
367
|
-
f"{filename} not found in Desktop, Documents, or Downloads folders")
|
368
|
-
|
369
359
|
def query_mssql(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
370
360
|
server = db_preset['host']
|
371
361
|
user = db_preset['username']
|
@@ -395,7 +385,6 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
|
|
395
385
|
return pd.DataFrame(rows, columns=columns)
|
396
386
|
|
397
387
|
def query_clickhouse(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
398
|
-
|
399
388
|
host = db_preset['host']
|
400
389
|
user = db_preset['username']
|
401
390
|
password = db_preset['password']
|
@@ -434,8 +423,8 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
|
|
434
423
|
|
435
424
|
return pd.DataFrame(rows, columns=columns)
|
436
425
|
|
437
|
-
#
|
438
|
-
config_path =
|
426
|
+
# Assume the configuration file is located at ~/.rgwfuncsrc
|
427
|
+
config_path = os.path.expanduser('~/.rgwfuncsrc')
|
439
428
|
with open(config_path, 'r') as f:
|
440
429
|
config = json.load(f)
|
441
430
|
|
@@ -458,6 +447,7 @@ def load_data_from_query(db_preset_name: str, query: str, config_file_name: str
|
|
458
447
|
raise ValueError(f"Unsupported db_type: {db_type}")
|
459
448
|
|
460
449
|
|
450
|
+
|
461
451
|
def load_data_from_path(file_path: str) -> pd.DataFrame:
|
462
452
|
"""
|
463
453
|
Load data from a file into a DataFrame based on the file extension.
|
@@ -808,39 +798,36 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
|
|
808
798
|
|
809
799
|
Parameters:
|
810
800
|
df: The DataFrame to send.
|
811
|
-
bot_name: The name of the Telegram bot as specified in the configuration.
|
812
|
-
message: Custom message to send along with the DataFrame or file.
|
813
|
-
as_file: Boolean flag to
|
814
|
-
remove_after_send: If True, removes the file after sending.
|
815
|
-
"""
|
801
|
+
bot_name: The name of the Telegram bot as specified in the configuration file.
|
802
|
+
message: Custom message to send along with the DataFrame or file. Defaults to None.
|
803
|
+
as_file: Boolean flag to indicate whether the DataFrame should be sent as a file (True) or as text (False). Defaults to True.
|
804
|
+
remove_after_send: If True, removes the CSV file after sending. Defaults to True.
|
816
805
|
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
|
806
|
+
Raises:
|
807
|
+
ValueError: If the specified bot is not found or if no DataFrame is provided.
|
808
|
+
Exception: If the message sending fails.
|
821
809
|
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
return os.path.join(root, filename)
|
826
|
-
raise FileNotFoundError(
|
827
|
-
f"{filename} not found in Desktop, Documents, or Downloads")
|
810
|
+
Notes:
|
811
|
+
The configuration file is assumed to be located at `~/.rgwfuncsrc`.
|
812
|
+
"""
|
828
813
|
|
829
814
|
def get_config(config_path: str) -> dict:
|
830
|
-
"""Load configuration from a
|
815
|
+
"""Load configuration from a JSON file."""
|
831
816
|
with open(config_path, 'r') as file:
|
832
817
|
return json.load(file)
|
833
818
|
|
834
|
-
|
819
|
+
# Assume the configuration file is located at ~/.rgwfuncsrc
|
820
|
+
config_path = os.path.expanduser('~/.rgwfuncsrc')
|
835
821
|
config = get_config(config_path)
|
836
|
-
bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
|
837
822
|
|
823
|
+
bot_config = next((bot for bot in config['telegram_bot_presets'] if bot['name'] == bot_name), None)
|
838
824
|
if not bot_config:
|
839
825
|
raise ValueError(f"No bot found with the name {bot_name}")
|
840
826
|
|
841
827
|
if df is None:
|
842
828
|
raise ValueError("No DataFrame to send. Please provide a DataFrame.")
|
843
829
|
|
830
|
+
response = None
|
844
831
|
if as_file:
|
845
832
|
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
|
846
833
|
file_name = f"df_{timestamp}.csv"
|
@@ -859,11 +846,12 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
|
|
859
846
|
df_str = df.to_string()
|
860
847
|
payload = {
|
861
848
|
'chat_id': bot_config['chat_id'],
|
862
|
-
'text': message + "\n\n" + df_str if message else df_str,
|
863
|
-
'parse_mode': 'HTML'
|
849
|
+
'text': (message + "\n\n" + df_str) if message else df_str,
|
850
|
+
'parse_mode': 'HTML'
|
851
|
+
}
|
864
852
|
response = requests.post(f"https://api.telegram.org/bot{bot_config['bot_token']}/sendMessage", data=payload)
|
865
853
|
|
866
|
-
if not response.ok:
|
854
|
+
if response and not response.ok:
|
867
855
|
raise Exception(f"Error sending message: {response.text}")
|
868
856
|
|
869
857
|
print("Message sent successfully.")
|
@@ -871,28 +859,24 @@ def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Option
|
|
871
859
|
|
872
860
|
def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subject: Optional[str] = None, body: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
|
873
861
|
"""
|
874
|
-
Send an email with optional DataFrame attachment using Gmail API via a specified preset.
|
862
|
+
Send an email with an optional DataFrame attachment using the Gmail API via a specified preset.
|
875
863
|
|
876
864
|
Parameters:
|
877
865
|
df: The DataFrame to send.
|
878
866
|
preset_name: The configuration preset name to use for sending the email.
|
879
867
|
to_email: The recipient email address.
|
880
|
-
subject: Optional subject of the email.
|
881
|
-
body: Optional message body of the email.
|
882
|
-
as_file: Boolean flag to decide whether to send the DataFrame as a file.
|
883
|
-
remove_after_send: If True, removes the CSV file after sending.
|
884
|
-
"""
|
868
|
+
subject: Optional subject of the email. Defaults to 'DataFrame CSV File' if not given.
|
869
|
+
body: Optional message body of the email. Defaults to 'Please find the CSV file attached.' if not given.
|
870
|
+
as_file: Boolean flag to decide whether to send the DataFrame as a file (True) or embed it in the email (False). Defaults to True.
|
871
|
+
remove_after_send: If True, removes the CSV file after sending. Defaults to True.
|
885
872
|
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
|
873
|
+
Raises:
|
874
|
+
ValueError: If the preset is not found in the configuration.
|
875
|
+
Exception: If the email preparation or sending fails.
|
890
876
|
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
return os.path.join(root, filename)
|
895
|
-
raise FileNotFoundError(f"{filename} not found in Desktop, Documents, or Downloads folders")
|
877
|
+
Notes:
|
878
|
+
The configuration file is assumed to be located at `~/.rgwfuncsrc`.
|
879
|
+
"""
|
896
880
|
|
897
881
|
def get_config(config_path: str) -> dict:
|
898
882
|
with open(config_path, 'r') as file:
|
@@ -901,9 +885,7 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
|
|
901
885
|
except json.JSONDecodeError as e:
|
902
886
|
raise ValueError(f"Invalid JSON format in config file: {e}")
|
903
887
|
|
904
|
-
def authenticate_service_account(
|
905
|
-
service_account_credentials_path: str,
|
906
|
-
sender_email_id: str) -> Any:
|
888
|
+
def authenticate_service_account(service_account_credentials_path: str, sender_email_id: str) -> Any:
|
907
889
|
credentials = service_account.Credentials.from_service_account_file(
|
908
890
|
service_account_credentials_path,
|
909
891
|
scopes=['https://mail.google.com/'],
|
@@ -911,8 +893,8 @@ def send_data_to_email(df: pd.DataFrame, preset_name: str, to_email: str, subjec
|
|
911
893
|
)
|
912
894
|
return build('gmail', 'v1', credentials=credentials)
|
913
895
|
|
914
|
-
# Load configuration
|
915
|
-
config_path =
|
896
|
+
# Load configuration from ~/.rgwfuncsrc
|
897
|
+
config_path = os.path.expanduser('~/.rgwfuncsrc')
|
916
898
|
config = get_config(config_path)
|
917
899
|
|
918
900
|
# Retrieve Gmail preset configuration
|
@@ -980,30 +962,25 @@ def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] =
|
|
980
962
|
Parameters:
|
981
963
|
df: The DataFrame to send.
|
982
964
|
bot_name: The Slack bot configuration preset name.
|
983
|
-
message: Custom message to send along with the DataFrame or file.
|
984
|
-
as_file: Boolean flag to decide whether to send the DataFrame as a file.
|
985
|
-
remove_after_send: If True, removes the CSV file after sending.
|
986
|
-
"""
|
965
|
+
message: Custom message to send along with the DataFrame or file. Defaults to None.
|
966
|
+
as_file: Boolean flag to decide whether to send the DataFrame as a file (True) or as text (False). Defaults to True.
|
967
|
+
remove_after_send: If True, removes the CSV file after sending. Defaults to True.
|
987
968
|
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
search_paths = [os.path.join(home_dir, folder) for folder in ["Desktop", "Documents", "Downloads"]]
|
969
|
+
Raises:
|
970
|
+
ValueError: If the specified bot is not found in the configuration.
|
971
|
+
Exception: If the message sending fails.
|
992
972
|
|
993
|
-
|
994
|
-
|
995
|
-
|
996
|
-
return os.path.join(root, filename)
|
997
|
-
raise FileNotFoundError(
|
998
|
-
f"{filename} not found in Desktop, Documents, or Downloads folders")
|
973
|
+
Notes:
|
974
|
+
The configuration file is assumed to be located at `~/.rgwfuncsrc`.
|
975
|
+
"""
|
999
976
|
|
1000
977
|
def get_config(config_path: str) -> dict:
|
1001
978
|
"""Load configuration from a JSON file."""
|
1002
979
|
with open(config_path, 'r') as file:
|
1003
980
|
return json.load(file)
|
1004
981
|
|
1005
|
-
# Load the Slack configuration
|
1006
|
-
config_path =
|
982
|
+
# Load the Slack configuration from ~/.rgwfuncsrc
|
983
|
+
config_path = os.path.expanduser('~/.rgwfuncsrc')
|
1007
984
|
config = get_config(config_path)
|
1008
985
|
|
1009
986
|
bot_config = next((bot for bot in config['slack_bot_presets'] if bot['name'] == bot_name), None)
|
@@ -1021,13 +998,22 @@ def send_data_to_slack(df: pd.DataFrame, bot_name: str, message: Optional[str] =
|
|
1021
998
|
|
1022
999
|
try:
|
1023
1000
|
with open(file_name, 'rb') as file:
|
1024
|
-
response = client.files_upload(
|
1001
|
+
response = client.files_upload(
|
1002
|
+
channels=bot_config['channel_id'],
|
1003
|
+
file=file,
|
1004
|
+
filename=os.path.basename(file_name),
|
1005
|
+
title="DataFrame Upload",
|
1006
|
+
initial_comment=message or ''
|
1007
|
+
)
|
1025
1008
|
finally:
|
1026
1009
|
if remove_after_send and os.path.exists(file_name):
|
1027
1010
|
os.remove(file_name)
|
1028
1011
|
else:
|
1029
1012
|
df_str = df.to_string()
|
1030
|
-
response = client.chat_postMessage(
|
1013
|
+
response = client.chat_postMessage(
|
1014
|
+
channel=bot_config['channel_id'],
|
1015
|
+
text=(message + "\n\n" + df_str) if message else df_str
|
1016
|
+
)
|
1031
1017
|
|
1032
1018
|
# Check if the message was sent successfully
|
1033
1019
|
if not response["ok"]:
|
@@ -1614,6 +1600,7 @@ def mask_against_dataframe_converse(df: pd.DataFrame, other_df: pd.DataFrame, co
|
|
1614
1600
|
|
1615
1601
|
return df[~df[column_name].isin(other_df[column_name])]
|
1616
1602
|
|
1603
|
+
|
1617
1604
|
def union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
|
1618
1605
|
"""
|
1619
1606
|
Perform a union join, concatenating the two DataFrames and dropping duplicates.
|
@@ -1628,30 +1615,13 @@ def union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
|
|
1628
1615
|
Raises:
|
1629
1616
|
ValueError: If the DataFrames do not have the same columns.
|
1630
1617
|
"""
|
1631
|
-
# Inspect initial columns
|
1632
|
-
# print("Initial df1 columns:", df1.columns)
|
1633
|
-
# print("Initial df2 columns:", df2.columns)
|
1634
|
-
|
1635
|
-
# Standardize columns by adding missing columns filled with empty strings
|
1636
|
-
for col in df2.columns:
|
1637
|
-
if col not in df1:
|
1638
|
-
df1[col] = ""
|
1639
|
-
|
1640
|
-
for col in df1.columns:
|
1641
|
-
if col not in df2:
|
1642
|
-
df2[col] = ""
|
1643
|
-
|
1644
|
-
# print("Standardized df1 columns:", df1.columns)
|
1645
|
-
# print("Standardized df2 columns:", df2.columns)
|
1646
|
-
|
1647
|
-
# Ensure they have the same columns after standardizing
|
1648
1618
|
if set(df1.columns) != set(df2.columns):
|
1649
|
-
raise ValueError("Both DataFrames must have the same columns
|
1619
|
+
raise ValueError("Both DataFrames must have the same columns for a union join")
|
1650
1620
|
|
1651
|
-
# Concatenate and drop duplicates
|
1652
1621
|
result_df = pd.concat([df1, df2], ignore_index=True).drop_duplicates()
|
1653
1622
|
return result_df
|
1654
1623
|
|
1624
|
+
|
1655
1625
|
def bag_union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
|
1656
1626
|
"""
|
1657
1627
|
Perform a bag union join, concatenating the two DataFrames without dropping duplicates.
|
@@ -1666,27 +1636,9 @@ def bag_union_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
|
|
1666
1636
|
Raises:
|
1667
1637
|
ValueError: If the DataFrames do not have the same columns.
|
1668
1638
|
"""
|
1669
|
-
# Inspect initial columns
|
1670
|
-
# print("Initial df1 columns:", df1.columns)
|
1671
|
-
# print("Initial df2 columns:", df2.columns)
|
1672
|
-
|
1673
|
-
# Standardize columns by adding missing columns filled with empty strings
|
1674
|
-
for col in df2.columns:
|
1675
|
-
if col not in df1:
|
1676
|
-
df1[col] = ""
|
1677
|
-
|
1678
|
-
for col in df1.columns:
|
1679
|
-
if col not in df2:
|
1680
|
-
df2[col] = ""
|
1681
|
-
|
1682
|
-
# print("Standardized df1 columns:", df1.columns)
|
1683
|
-
# print("Standardized df2 columns:", df2.columns)
|
1684
|
-
|
1685
|
-
# Ensure they have the same columns after standardizing
|
1686
1639
|
if set(df1.columns) != set(df2.columns):
|
1687
|
-
raise ValueError("Both DataFrames must have the same columns
|
1640
|
+
raise ValueError("Both DataFrames must have the same columns for a bag union join")
|
1688
1641
|
|
1689
|
-
# Concatenate without dropping duplicates
|
1690
1642
|
result_df = pd.concat([df1, df2], ignore_index=True)
|
1691
1643
|
return result_df
|
1692
1644
|
|
@@ -1723,6 +1675,65 @@ def right_join(df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str
|
|
1723
1675
|
return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)
|
1724
1676
|
|
1725
1677
|
|
1678
|
+
def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
|
1679
|
+
"""
|
1680
|
+
Inserts a Pandas DataFrame into a SQLite database table.
|
1681
|
+
|
1682
|
+
Parameters:
|
1683
|
+
db_path: str
|
1684
|
+
The file path to the SQLite database. If the database does not exist,
|
1685
|
+
it will be created.
|
1686
|
+
|
1687
|
+
tablename: str
|
1688
|
+
The name of the table where the data will be inserted. If the table does
|
1689
|
+
not exist, it will be created based on the DataFrame's columns and types.
|
1690
|
+
|
1691
|
+
df: pd.DataFrame
|
1692
|
+
The DataFrame containing the data to be inserted into the database.
|
1693
|
+
|
1694
|
+
Functionality:
|
1695
|
+
- Checks if the specified table exists in the database.
|
1696
|
+
- Creates the table with appropriate column types if it doesn't exist.
|
1697
|
+
- Inserts the DataFrame's data into the table, appending to any existing data.
|
1698
|
+
|
1699
|
+
Data Type Mapping:
|
1700
|
+
- Converts Pandas data types to SQLite types: 'int64' to 'INTEGER',
|
1701
|
+
'float64' to 'REAL', 'object' to 'TEXT', 'datetime64[ns]' to 'TEXT',
|
1702
|
+
and 'bool' to 'INTEGER'.
|
1703
|
+
|
1704
|
+
Returns:
|
1705
|
+
None
|
1706
|
+
"""
|
1707
|
+
|
1708
|
+
def table_exists(cursor, table_name):
|
1709
|
+
cursor.execute(f"SELECT count(name) FROM sqlite_master WHERE type='table' AND name='{table_name}'")
|
1710
|
+
return cursor.fetchone()[0] == 1
|
1711
|
+
|
1712
|
+
|
1713
|
+
dtype_mapping = {
|
1714
|
+
'int64': 'INTEGER',
|
1715
|
+
'float64': 'REAL',
|
1716
|
+
'object': 'TEXT',
|
1717
|
+
'datetime64[ns]': 'TEXT',
|
1718
|
+
'bool': 'INTEGER',
|
1719
|
+
}
|
1720
|
+
|
1721
|
+
def map_dtype(dtype):
|
1722
|
+
return dtype_mapping.get(str(dtype), 'TEXT')
|
1723
|
+
|
1724
|
+
with sqlite3.connect(db_path) as conn:
|
1725
|
+
cursor = conn.cursor()
|
1726
|
+
|
1727
|
+
if not table_exists(cursor, tablename):
|
1728
|
+
columns_with_types = ', '.join(
|
1729
|
+
f'"{col}" {map_dtype(dtype)}' for col, dtype in zip(df.columns, df.dtypes)
|
1730
|
+
)
|
1731
|
+
create_table_query = f'CREATE TABLE "{tablename}" ({columns_with_types})'
|
1732
|
+
conn.execute(create_table_query)
|
1733
|
+
|
1734
|
+
df.to_sql(tablename, conn, if_exists='append', index=False)
|
1735
|
+
|
1736
|
+
|
1726
1737
|
def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
|
1727
1738
|
"""
|
1728
1739
|
Processes and saves a DataFrame to an SQLite database, adding a timestamp column
|
@@ -1733,6 +1744,10 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataF
|
|
1733
1744
|
- tablename (str): The name of the table in the database.
|
1734
1745
|
- df (pd.DataFrame): The DataFrame to be processed and saved.
|
1735
1746
|
"""
|
1747
|
+
# Helper function to map pandas dtype to SQLite type
|
1748
|
+
def map_dtype(dtype):
|
1749
|
+
return dtype_mapping.get(str(dtype), 'TEXT')
|
1750
|
+
|
1736
1751
|
# Step 1: Add a timestamp column to the dataframe
|
1737
1752
|
df['rgwfuncs_sync_timestamp'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
|
1738
1753
|
|
@@ -1745,10 +1760,6 @@ def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataF
|
|
1745
1760
|
'bool': 'INTEGER', # SQLite does not have a separate Boolean storage class
|
1746
1761
|
}
|
1747
1762
|
|
1748
|
-
# Helper function to map pandas dtype to SQLite type
|
1749
|
-
def map_dtype(dtype):
|
1750
|
-
return dtype_mapping.get(str(dtype), 'TEXT')
|
1751
|
-
|
1752
1763
|
# Step 2: Save df in SQLite3 db as '{tablename}_new'
|
1753
1764
|
with sqlite3.connect(db_path) as conn:
|
1754
1765
|
new_table_name = f"{tablename}_new"
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: rgwfuncs
|
3
|
-
Version: 0.0.15
|
3
|
+
Version: 0.0.17
|
4
4
|
Summary: A functional programming paradigm for mathematical modelling and data science
|
5
5
|
Home-page: https://github.com/ryangerardwilson/rgwfunc
|
6
6
|
Author: Ryan Gerard Wilson
|
@@ -40,9 +40,9 @@ Install the package using:
|
|
40
40
|
|
41
41
|
--------------------------------------------------------------------------------
|
42
42
|
|
43
|
-
## Create a `
|
43
|
+
## Create a `.rgwfuncsrc` File
|
44
44
|
|
45
|
-
A `
|
45
|
+
A `.rgwfuncsrc` file (located at `~/.rgwfuncsrc`) is required for MSSQL, CLICKHOUSE, MYSQL, GOOGLE BIG QUERY, SLACK, TELEGRAM, and GMAIL integrations.
|
46
46
|
|
47
47
|
{
|
48
48
|
"db_presets" : [
|
@@ -381,28 +381,30 @@ Drop duplicate rows based on specified columns, retaining the last occurrence.
|
|
381
381
|
--------------------------------------------------------------------------------
|
382
382
|
|
383
383
|
### 12. `load_data_from_query`
|
384
|
+
|
384
385
|
Load data from a database query into a DataFrame based on a configuration preset.
|
385
386
|
|
386
|
-
|
387
|
-
- `db_preset_name` (str): Name of the database preset in the
|
388
|
-
- query (str): The SQL query to execute.
|
389
|
-
- `config_file_name` (str): Name of the configuration file (default: "rgwml.config").
|
387
|
+
- **Parameters:**
|
388
|
+
- `db_preset_name` (str): Name of the database preset in the configuration file.
|
389
|
+
- `query` (str): The SQL query to execute.
|
390
390
|
|
391
|
-
|
392
|
-
- pd.DataFrame
|
391
|
+
- **Returns:**
|
392
|
+
- `pd.DataFrame`: A DataFrame containing the query result.
|
393
393
|
|
394
|
-
|
395
|
-
|
396
|
-
from rgwfuncs import load_data_from_query
|
394
|
+
- **Notes:**
|
395
|
+
- The configuration file is assumed to be located at `~/.rgwfuncsrc`.
|
397
396
|
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
397
|
+
- **Example:**
|
398
|
+
|
399
|
+
from rgwfuncs import load_data_from_query
|
400
|
+
|
401
|
+
df = load_data_from_query(
|
402
|
+
db_preset_name="MyDBPreset",
|
403
|
+
query="SELECT * FROM my_table"
|
404
|
+
)
|
405
|
+
print(df)
|
405
406
|
|
407
|
+
|
406
408
|
--------------------------------------------------------------------------------
|
407
409
|
|
408
410
|
### 13. `load_data_from_path`
|
@@ -1148,10 +1150,47 @@ Perform a right join on two DataFrames.
|
|
1148
1150
|
df_right_join = right_join(df1, df2, 'ID', 'ID')
|
1149
1151
|
print(df_right_join)
|
1150
1152
|
|
1153
|
+
--------------------------------------------------------------------------------
|
1154
|
+
|
1155
|
+
### 45. `insert_dataframe_in_sqlite_database`
|
1156
|
+
|
1157
|
+
Inserts a Pandas DataFrame into a SQLite database table. If the specified table does not exist, it will be created with column types automatically inferred from the DataFrame's data types.
|
1158
|
+
|
1159
|
+
- **Parameters:**
|
1160
|
+
- `db_path` (str): The path to the SQLite database file. If the database does not exist, it will be created.
|
1161
|
+
- `tablename` (str): The name of the table in the database. If the table does not exist, it is created with the DataFrame's columns and data types.
|
1162
|
+
- `df` (pd.DataFrame): The DataFrame containing the data to be inserted into the database table.
|
1163
|
+
|
1164
|
+
- **Returns:**
|
1165
|
+
- `None`
|
1166
|
+
|
1167
|
+
- **Notes:**
|
1168
|
+
- Data types in the DataFrame are converted to SQLite-compatible types:
|
1169
|
+
- `int64` is mapped to `INTEGER`
|
1170
|
+
- `float64` is mapped to `REAL`
|
1171
|
+
- `object` is mapped to `TEXT`
|
1172
|
+
- `datetime64[ns]` is mapped to `TEXT` (dates are stored as text)
|
1173
|
+
- `bool` is mapped to `INTEGER` (SQLite does not have a separate Boolean type)
|
1174
|
+
|
1175
|
+
- **Example:**
|
1176
|
+
|
1177
|
+
from rgwfuncs import insert_dataframe_in_sqlite_database
|
1178
|
+
import pandas as pd
|
1179
|
+
|
1180
|
+
df = pd.DataFrame({
|
1181
|
+
'ID': [1, 2, 3],
|
1182
|
+
'Name': ['Alice', 'Bob', 'Charlie'],
|
1183
|
+
'Score': [88.5, 92.3, 85.0]
|
1184
|
+
})
|
1185
|
+
|
1186
|
+
db_path = 'my_database.db'
|
1187
|
+
tablename = 'students'
|
1188
|
+
|
1189
|
+
insert_dataframe_in_sqlite_database(db_path, tablename, df)
|
1151
1190
|
|
1152
1191
|
--------------------------------------------------------------------------------
|
1153
1192
|
|
1154
|
-
### 45. `sync_dataframe_to_sqlite_database`
|
1193
|
+
### 46. `sync_dataframe_to_sqlite_database`
|
1155
1194
|
Processes and saves a DataFrame to an SQLite database, adding a timestamp column and replacing the existing table if needed. Creates the table if it does not exist.
|
1156
1195
|
|
1157
1196
|
• Parameters:
|
@@ -1,4 +0,0 @@
|
|
1
|
-
# This file is automatically generated
|
2
|
-
# Dynamically importing functions from modules
|
3
|
-
|
4
|
-
from .df_lib import append_columns, append_percentile_classification_column, append_ranged_classification_column, append_ranged_date_classification_column, append_rows, append_xgb_labels, append_xgb_logistic_regression_predictions, append_xgb_regression_predictions, bag_union_join, bottom_n_unique_values, cascade_sort, delete_rows, docs, drop_duplicates, drop_duplicates_retain_first, drop_duplicates_retain_last, filter_dataframe, filter_indian_mobiles, first_n_rows, from_raw_data, last_n_rows, left_join, limit_dataframe, load_data_from_path, load_data_from_query, load_data_from_sqlite_path, mask_against_dataframe, mask_against_dataframe_converse, numeric_clean, order_columns, print_correlation, print_dataframe, print_memory_usage, print_n_frequency_cascading, print_n_frequency_linear, rename_columns, retain_columns, right_join, send_data_to_email, send_data_to_slack, send_dataframe_via_telegram, sync_dataframe_to_sqlite_database, top_n_unique_values, union_join, update_rows
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|