rgwfuncs 0.0.21__py3-none-any.whl → 0.0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rgwfuncs/__init__.py +5 -2
- rgwfuncs/algebra_lib.py +901 -0
- rgwfuncs/df_lib.py +111 -61
- rgwfuncs/docs_lib.py +51 -0
- rgwfuncs/interactive_shell_lib.py +32 -0
- rgwfuncs/str_lib.py +8 -44
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/METADATA +517 -92
- rgwfuncs-0.0.54.dist-info/RECORD +12 -0
- rgwfuncs-0.0.21.dist-info/RECORD +0 -9
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/LICENSE +0 -0
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/WHEEL +0 -0
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/entry_points.txt +0 -0
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/top_level.txt +0 -0
rgwfuncs/df_lib.py
CHANGED
@@ -21,51 +21,15 @@ from email.mime.base import MIMEBase
|
|
21
21
|
from email import encoders
|
22
22
|
from googleapiclient.discovery import build
|
23
23
|
import base64
|
24
|
-
import inspect
|
25
|
-
|
24
|
+
import boto3
|
25
|
+
# import inspect
|
26
|
+
from typing import Optional, Dict, List, Tuple, Any
|
26
27
|
import warnings
|
27
28
|
|
28
29
|
# Suppress all FutureWarnings
|
29
30
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
30
31
|
|
31
32
|
|
32
|
-
def df_docs(method_type_filter: Optional[str] = None) -> None:
|
33
|
-
"""
|
34
|
-
Print a list of function names in alphabetical order. If method_type_filter
|
35
|
-
is specified, print the docstrings of the functions that match the filter.
|
36
|
-
Using '*' as a filter will print the docstrings for all functions.
|
37
|
-
|
38
|
-
Parameters:
|
39
|
-
method_type_filter: Optional filter string representing a function name,
|
40
|
-
or '*' to display docstrings for all functions.
|
41
|
-
"""
|
42
|
-
# Get the current module's namespace
|
43
|
-
current_module = __name__
|
44
|
-
|
45
|
-
local_functions: Dict[str, Callable] = {
|
46
|
-
name: obj for name, obj in globals().items()
|
47
|
-
if inspect.isfunction(obj) and obj.__module__ == current_module
|
48
|
-
}
|
49
|
-
|
50
|
-
# List of function names sorted alphabetically
|
51
|
-
function_names = sorted(local_functions.keys())
|
52
|
-
|
53
|
-
# Print function names
|
54
|
-
print("Functions in alphabetical order:")
|
55
|
-
for name in function_names:
|
56
|
-
print(name)
|
57
|
-
|
58
|
-
# If a filter is provided or '*', print the docstrings of functions
|
59
|
-
if method_type_filter:
|
60
|
-
# print("\nFiltered function documentation:")
|
61
|
-
for name, func in local_functions.items():
|
62
|
-
docstring: Optional[str] = func.__doc__
|
63
|
-
if docstring:
|
64
|
-
if method_type_filter == '*' or method_type_filter == name:
|
65
|
-
# Print the entire docstring for the matching function
|
66
|
-
print(f"\n{name}:\n{docstring}")
|
67
|
-
|
68
|
-
|
69
33
|
def numeric_clean(
|
70
34
|
df: pd.DataFrame,
|
71
35
|
column_names: str,
|
@@ -421,8 +385,7 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
421
385
|
raise ConnectionError(
|
422
386
|
"All attempts to connect to ClickHouse failed.")
|
423
387
|
|
424
|
-
def query_google_big_query(
|
425
|
-
db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
388
|
+
def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
426
389
|
json_file_path = db_preset['json_file_path']
|
427
390
|
project_id = db_preset['project_id']
|
428
391
|
|
@@ -437,6 +400,54 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
437
400
|
|
438
401
|
return pd.DataFrame(rows, columns=columns)
|
439
402
|
|
403
|
+
def query_athena(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
404
|
+
|
405
|
+
def execute_athena_query(athena_client, query: str, database: str, output_bucket: str) -> str:
|
406
|
+
response = athena_client.start_query_execution(
|
407
|
+
QueryString=query,
|
408
|
+
QueryExecutionContext={"Database": database},
|
409
|
+
ResultConfiguration={"OutputLocation": output_bucket}
|
410
|
+
)
|
411
|
+
return response["QueryExecutionId"]
|
412
|
+
|
413
|
+
def wait_for_athena_query_to_complete(athena_client, query_execution_id: str):
|
414
|
+
while True:
|
415
|
+
response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
|
416
|
+
state = response["QueryExecution"]["Status"]["State"]
|
417
|
+
if state == "SUCCEEDED":
|
418
|
+
break
|
419
|
+
elif state in ("FAILED", "CANCELLED"):
|
420
|
+
raise Exception(f"Query failed with state: {state}")
|
421
|
+
time.sleep(1)
|
422
|
+
|
423
|
+
def download_athena_query_results(athena_client, query_execution_id: str) -> pd.DataFrame:
|
424
|
+
paginator = athena_client.get_paginator("get_query_results")
|
425
|
+
result_pages = paginator.paginate(QueryExecutionId=query_execution_id)
|
426
|
+
rows = []
|
427
|
+
columns = []
|
428
|
+
for page in result_pages:
|
429
|
+
if not columns:
|
430
|
+
columns = [col["Name"] for col in page["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]
|
431
|
+
rows.extend(page["ResultSet"]["Rows"])
|
432
|
+
|
433
|
+
data = [[col.get("VarCharValue", None) for col in row["Data"]] for row in rows[1:]]
|
434
|
+
return pd.DataFrame(data, columns=columns)
|
435
|
+
|
436
|
+
aws_region = db_preset['aws_region']
|
437
|
+
database = db_preset['database']
|
438
|
+
output_bucket = db_preset['output_bucket']
|
439
|
+
|
440
|
+
athena_client = boto3.client(
|
441
|
+
'athena',
|
442
|
+
region_name=aws_region,
|
443
|
+
aws_access_key_id=db_preset['aws_access_key'],
|
444
|
+
aws_secret_access_key=db_preset['aws_secret_key']
|
445
|
+
)
|
446
|
+
|
447
|
+
query_execution_id = execute_athena_query(athena_client, query, database, output_bucket)
|
448
|
+
wait_for_athena_query_to_complete(athena_client, query_execution_id)
|
449
|
+
return download_athena_query_results(athena_client, query_execution_id)
|
450
|
+
|
440
451
|
# Assume the configuration file is located at ~/.rgwfuncsrc
|
441
452
|
config_path = os.path.expanduser('~/.rgwfuncsrc')
|
442
453
|
with open(config_path, 'r') as f:
|
@@ -459,6 +470,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
459
470
|
return query_clickhouse(db_preset, query)
|
460
471
|
elif db_type == 'google_big_query':
|
461
472
|
return query_google_big_query(db_preset, query)
|
473
|
+
elif db_type == 'aws_athena':
|
474
|
+
return query_athena(db_preset, query)
|
462
475
|
else:
|
463
476
|
raise ValueError(f"Unsupported db_type: {db_type}")
|
464
477
|
|
@@ -835,7 +848,12 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
|
|
835
848
|
gc.collect()
|
836
849
|
|
837
850
|
|
838
|
-
def send_dataframe_via_telegram(
|
851
|
+
def send_dataframe_via_telegram(
|
852
|
+
df: pd.DataFrame,
|
853
|
+
bot_name: str,
|
854
|
+
message: Optional[str] = None,
|
855
|
+
as_file: bool = True,
|
856
|
+
remove_after_send: bool = True) -> None:
|
839
857
|
"""
|
840
858
|
Send a DataFrame via Telegram using a specified bot configuration.
|
841
859
|
|
@@ -1673,7 +1691,11 @@ def print_n_frequency_cascading(
|
|
1673
1691
|
print(json.dumps(report, indent=2))
|
1674
1692
|
|
1675
1693
|
|
1676
|
-
def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: list, order_by: str = "FREQ_DESC") -> None:
|
1694
|
+
def print_n_frequency_linear(
|
1695
|
+
df: pd.DataFrame,
|
1696
|
+
n: int,
|
1697
|
+
columns: list,
|
1698
|
+
order_by: str = "FREQ_DESC") -> None:
|
1677
1699
|
"""
|
1678
1700
|
Print the linear frequency of top n values for specified columns.
|
1679
1701
|
|
@@ -1709,27 +1731,49 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: list, order_by:
|
|
1709
1731
|
|
1710
1732
|
return report
|
1711
1733
|
|
1734
|
+
def try_parse_numeric(val):
|
1735
|
+
"""Attempt to parse a value as an integer or float."""
|
1736
|
+
try:
|
1737
|
+
return int(val)
|
1738
|
+
except ValueError:
|
1739
|
+
try:
|
1740
|
+
return float(val)
|
1741
|
+
except ValueError:
|
1742
|
+
return val
|
1743
|
+
|
1712
1744
|
def sort_frequency(frequency, order_by):
|
1713
|
-
|
1714
|
-
|
1715
|
-
|
1716
|
-
|
1717
|
-
|
1745
|
+
# keys = frequency.keys()
|
1746
|
+
|
1747
|
+
# Convert keys to numerical values where possible, leaving `NaN` as a
|
1748
|
+
# special string
|
1749
|
+
# parsed_keys = [(try_parse_numeric(key), key) for key in keys]
|
1750
|
+
|
1751
|
+
if order_by in {"BY_KEYS_ASC", "BY_KEYS_DESC"}:
|
1752
|
+
reverse = order_by == "BY_KEYS_DESC"
|
1753
|
+
sorted_items = sorted(
|
1754
|
+
frequency.items(),
|
1755
|
+
key=lambda item: try_parse_numeric(
|
1756
|
+
item[0]),
|
1757
|
+
reverse=reverse)
|
1758
|
+
else:
|
1759
|
+
if order_by == "ASC":
|
1760
|
+
sorted_items = sorted(
|
1761
|
+
frequency.items(), key=lambda item: item[0])
|
1762
|
+
elif order_by == "DESC":
|
1763
|
+
sorted_items = sorted(
|
1718
1764
|
frequency.items(),
|
1719
1765
|
key=lambda item: item[0],
|
1720
|
-
reverse=True)
|
1721
|
-
|
1722
|
-
|
1723
|
-
|
1724
|
-
|
1725
|
-
|
1726
|
-
return dict(sorted(frequency.items(), reverse=True))
|
1727
|
-
else: # Default to "FREQ_DESC"
|
1728
|
-
return dict(
|
1729
|
-
sorted(
|
1766
|
+
reverse=True)
|
1767
|
+
elif order_by == "FREQ_ASC":
|
1768
|
+
sorted_items = sorted(
|
1769
|
+
frequency.items(), key=lambda item: item[1])
|
1770
|
+
else: # Default to "FREQ_DESC"
|
1771
|
+
sorted_items = sorted(
|
1730
1772
|
frequency.items(),
|
1731
1773
|
key=lambda item: item[1],
|
1732
|
-
reverse=True)
|
1774
|
+
reverse=True)
|
1775
|
+
|
1776
|
+
return dict(sorted_items)
|
1733
1777
|
|
1734
1778
|
report = generate_linear_report(df, columns, n, order_by)
|
1735
1779
|
print(json.dumps(report, indent=2))
|
@@ -1879,7 +1923,10 @@ def right_join(
|
|
1879
1923
|
return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)
|
1880
1924
|
|
1881
1925
|
|
1882
|
-
def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
|
1926
|
+
def insert_dataframe_in_sqlite_database(
|
1927
|
+
db_path: str,
|
1928
|
+
tablename: str,
|
1929
|
+
df: pd.DataFrame) -> None:
|
1883
1930
|
"""
|
1884
1931
|
Inserts a Pandas DataFrame into a SQLite database table.
|
1885
1932
|
|
@@ -1941,7 +1988,10 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
|
|
1941
1988
|
df.to_sql(tablename, conn, if_exists='append', index=False)
|
1942
1989
|
|
1943
1990
|
|
1944
|
-
def sync_dataframe_to_sqlite_database(
|
1991
|
+
def sync_dataframe_to_sqlite_database(
|
1992
|
+
db_path: str,
|
1993
|
+
tablename: str,
|
1994
|
+
df: pd.DataFrame) -> None:
|
1945
1995
|
"""
|
1946
1996
|
Processes and saves a DataFrame to an SQLite database, adding a timestamp column
|
1947
1997
|
and replacing the existing table if needed. Creates the table if it does not exist.
|
rgwfuncs/docs_lib.py
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
import os
|
2
|
+
import inspect
|
3
|
+
from typing import Optional
|
4
|
+
import warnings
|
5
|
+
|
6
|
+
# Suppress all FutureWarnings
|
7
|
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
8
|
+
|
9
|
+
|
10
|
+
def docs(method_type_filter: Optional[str] = None) -> None:
|
11
|
+
"""
|
12
|
+
Print a list of function names in alphabetical order from all modules.
|
13
|
+
If method_type_filter is specified, print the docstrings of the functions
|
14
|
+
that match the filter based on a substring. Using '*' as a filter will print
|
15
|
+
the docstrings for all functions.
|
16
|
+
|
17
|
+
Parameters:
|
18
|
+
method_type_filter: Optional filter string representing a filter for
|
19
|
+
function names, or '*' to display docstrings for all functions.
|
20
|
+
"""
|
21
|
+
|
22
|
+
# Directory containing your modules
|
23
|
+
module_dir = os.path.dirname(__file__)
|
24
|
+
|
25
|
+
# Iterate over each file in the module directory
|
26
|
+
for filename in sorted(os.listdir(module_dir)):
|
27
|
+
if filename.endswith('.py') and filename != '__init__.py':
|
28
|
+
module_name, _ = os.path.splitext(filename)
|
29
|
+
print(f"\n# {module_name}.py")
|
30
|
+
|
31
|
+
# Import the module
|
32
|
+
module_path = f"rgwfuncs.{module_name}"
|
33
|
+
module = __import__(module_path, fromlist=[module_name])
|
34
|
+
|
35
|
+
# Get all functions from the module
|
36
|
+
functions = {
|
37
|
+
name: obj for name, obj
|
38
|
+
in inspect.getmembers(module, inspect.isfunction)
|
39
|
+
if obj.__module__ == module_path
|
40
|
+
}
|
41
|
+
|
42
|
+
# List function names
|
43
|
+
function_names = sorted(functions.keys())
|
44
|
+
for name in function_names:
|
45
|
+
# If a filter is provided or '*', check if the function name
|
46
|
+
# contains the filter
|
47
|
+
if method_type_filter and (
|
48
|
+
method_type_filter == '*' or method_type_filter in name):
|
49
|
+
docstring: Optional[str] = functions[name].__doc__
|
50
|
+
if docstring:
|
51
|
+
print(f"\n{name}:\n{docstring}")
|
rgwfuncs/interactive_shell_lib.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
import code
|
2
|
+
import readline
|
3
|
+
import rlcompleter # noqa: F401
|
4
|
+
import sys # noqa: F401
|
5
|
+
from typing import Dict, Any
|
6
|
+
from .df_lib import * # noqa: F401, F403, E402
|
7
|
+
from .algebra_lib import * # noqa: F401, F403, E402
|
8
|
+
from .str_lib import * # noqa: F401, F403, E402
|
9
|
+
from .docs_lib import * # noqa: F401, F403, E402
|
10
|
+
|
11
|
+
|
12
|
+
def interactive_shell(local_vars: Dict[str, Any]) -> None:
|
13
|
+
"""
|
14
|
+
Launches an interactive prompt for inspecting and modifying local variables, making all methods
|
15
|
+
in the rgwfuncs library available by default.
|
16
|
+
|
17
|
+
Parameters:
|
18
|
+
local_vars (dict): Dictionary of local variables to be available in the interactive shell.
|
19
|
+
"""
|
20
|
+
if not isinstance(local_vars, dict):
|
21
|
+
raise TypeError("local_vars must be a dictionary")
|
22
|
+
|
23
|
+
readline.parse_and_bind("tab: complete")
|
24
|
+
|
25
|
+
# Make imported functions available in the REPL
|
26
|
+
local_vars.update(globals())
|
27
|
+
|
28
|
+
# Create interactive console with local context
|
29
|
+
console = code.InteractiveConsole(locals=local_vars)
|
30
|
+
|
31
|
+
# Start interactive session
|
32
|
+
console.interact(banner="Welcome to the rgwfuncs interactive shell.")
|
rgwfuncs/str_lib.py
CHANGED
@@ -1,53 +1,16 @@
|
|
1
1
|
import os
|
2
2
|
import json
|
3
3
|
import requests
|
4
|
-
import inspect
|
5
|
-
from typing import Tuple, Optional, Dict, Callable
|
4
|
+
from typing import Tuple
|
6
5
|
import warnings
|
7
6
|
|
8
7
|
# Suppress all FutureWarnings
|
9
8
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
10
9
|
|
11
10
|
|
12
|
-
def str_docs(method_type_filter: Optional[str] = None) -> None:
|
13
|
-
"""
|
14
|
-
Print a list of function names in alphabetical order. If method_type_filter
|
15
|
-
is specified, print the docstrings of the functions that match the filter.
|
16
|
-
Using '*' as a filter will print the docstrings for all functions.
|
17
|
-
|
18
|
-
Parameters:
|
19
|
-
method_type_filter: Optional filter string representing a function name,
|
20
|
-
or '*' to display docstrings for all functions.
|
21
|
-
"""
|
22
|
-
# Get the current module's namespace
|
23
|
-
current_module = __name__
|
24
|
-
|
25
|
-
local_functions: Dict[str, Callable] = {
|
26
|
-
name: obj for name, obj in globals().items()
|
27
|
-
if inspect.isfunction(obj) and obj.__module__ == current_module
|
28
|
-
}
|
29
|
-
|
30
|
-
# List of function names sorted alphabetically
|
31
|
-
function_names = sorted(local_functions.keys())
|
32
|
-
|
33
|
-
# Print function names
|
34
|
-
print("Functions in alphabetical order:")
|
35
|
-
for name in function_names:
|
36
|
-
print(name)
|
37
|
-
|
38
|
-
# If a filter is provided or '*', print the docstrings of functions
|
39
|
-
if method_type_filter:
|
40
|
-
# print("\nFiltered function documentation:")
|
41
|
-
for name, func in local_functions.items():
|
42
|
-
docstring: Optional[str] = func.__doc__
|
43
|
-
if docstring:
|
44
|
-
if method_type_filter == '*' or method_type_filter == name:
|
45
|
-
# Print the entire docstring for the matching function
|
46
|
-
print(f"\n{name}:\n{docstring}")
|
47
|
-
|
48
|
-
|
49
11
|
def send_telegram_message(preset_name: str, message: str) -> None:
|
50
|
-
"""
|
12
|
+
"""
|
13
|
+
Send a Telegram message using the specified preset.
|
51
14
|
|
52
15
|
Args:
|
53
16
|
preset_name (str): The name of the preset to use for sending the message.
|
@@ -73,19 +36,20 @@ def send_telegram_message(preset_name: str, message: str) -> None:
|
|
73
36
|
return preset
|
74
37
|
return None
|
75
38
|
|
76
|
-
def get_telegram_bot_details(
|
39
|
+
def get_telegram_bot_details(
|
40
|
+
config: dict, preset_name: str) -> Tuple[str, str]:
|
77
41
|
"""Retrieve the Telegram bot token and chat ID from the preset."""
|
78
42
|
preset = get_telegram_preset(config, preset_name)
|
79
43
|
if not preset:
|
80
|
-
raise RuntimeError(
|
44
|
+
raise RuntimeError(
|
45
|
+
f"Telegram bot preset '{preset_name}' not found in the configuration file")
|
81
46
|
|
82
47
|
bot_token = preset.get("bot_token")
|
83
48
|
chat_id = preset.get("chat_id")
|
84
49
|
|
85
50
|
if not bot_token or not chat_id:
|
86
51
|
raise RuntimeError(
|
87
|
-
f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file"
|
88
|
-
)
|
52
|
+
f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file")
|
89
53
|
|
90
54
|
return bot_token, chat_id
|
91
55
|
|