rgwfuncs 0.0.21__py3-none-any.whl → 0.0.54__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- rgwfuncs/__init__.py +5 -2
- rgwfuncs/algebra_lib.py +901 -0
- rgwfuncs/df_lib.py +111 -61
- rgwfuncs/docs_lib.py +51 -0
- rgwfuncs/interactive_shell_lib.py +32 -0
- rgwfuncs/str_lib.py +8 -44
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/METADATA +517 -92
- rgwfuncs-0.0.54.dist-info/RECORD +12 -0
- rgwfuncs-0.0.21.dist-info/RECORD +0 -9
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/LICENSE +0 -0
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/WHEEL +0 -0
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/entry_points.txt +0 -0
- {rgwfuncs-0.0.21.dist-info → rgwfuncs-0.0.54.dist-info}/top_level.txt +0 -0
rgwfuncs/df_lib.py
CHANGED
@@ -21,51 +21,15 @@ from email.mime.base import MIMEBase
|
|
21
21
|
from email import encoders
|
22
22
|
from googleapiclient.discovery import build
|
23
23
|
import base64
|
24
|
-
import inspect
|
25
|
-
|
24
|
+
import boto3
|
25
|
+
# import inspect
|
26
|
+
from typing import Optional, Dict, List, Tuple, Any
|
26
27
|
import warnings
|
27
28
|
|
28
29
|
# Suppress all FutureWarnings
|
29
30
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
30
31
|
|
31
32
|
|
32
|
-
def df_docs(method_type_filter: Optional[str] = None) -> None:
|
33
|
-
"""
|
34
|
-
Print a list of function names in alphabetical order. If method_type_filter
|
35
|
-
is specified, print the docstrings of the functions that match the filter.
|
36
|
-
Using '*' as a filter will print the docstrings for all functions.
|
37
|
-
|
38
|
-
Parameters:
|
39
|
-
method_type_filter: Optional filter string representing a function name,
|
40
|
-
or '*' to display docstrings for all functions.
|
41
|
-
"""
|
42
|
-
# Get the current module's namespace
|
43
|
-
current_module = __name__
|
44
|
-
|
45
|
-
local_functions: Dict[str, Callable] = {
|
46
|
-
name: obj for name, obj in globals().items()
|
47
|
-
if inspect.isfunction(obj) and obj.__module__ == current_module
|
48
|
-
}
|
49
|
-
|
50
|
-
# List of function names sorted alphabetically
|
51
|
-
function_names = sorted(local_functions.keys())
|
52
|
-
|
53
|
-
# Print function names
|
54
|
-
print("Functions in alphabetical order:")
|
55
|
-
for name in function_names:
|
56
|
-
print(name)
|
57
|
-
|
58
|
-
# If a filter is provided or '*', print the docstrings of functions
|
59
|
-
if method_type_filter:
|
60
|
-
# print("\nFiltered function documentation:")
|
61
|
-
for name, func in local_functions.items():
|
62
|
-
docstring: Optional[str] = func.__doc__
|
63
|
-
if docstring:
|
64
|
-
if method_type_filter == '*' or method_type_filter == name:
|
65
|
-
# Print the entire docstring for the matching function
|
66
|
-
print(f"\n{name}:\n{docstring}")
|
67
|
-
|
68
|
-
|
69
33
|
def numeric_clean(
|
70
34
|
df: pd.DataFrame,
|
71
35
|
column_names: str,
|
@@ -421,8 +385,7 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
421
385
|
raise ConnectionError(
|
422
386
|
"All attempts to connect to ClickHouse failed.")
|
423
387
|
|
424
|
-
def query_google_big_query(
|
425
|
-
db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
388
|
+
def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
426
389
|
json_file_path = db_preset['json_file_path']
|
427
390
|
project_id = db_preset['project_id']
|
428
391
|
|
@@ -437,6 +400,54 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
437
400
|
|
438
401
|
return pd.DataFrame(rows, columns=columns)
|
439
402
|
|
403
|
+
def query_athena(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
    """
    Run a SQL query on AWS Athena and return the result as a DataFrame.

    Parameters:
        db_preset: Preset dictionary. The keys read here are 'aws_region',
            'database', 'output_bucket', 'aws_access_key' and
            'aws_secret_key'.
        query: The SQL text to execute.

    Returns:
        A DataFrame whose column names come from the result-set metadata and
        whose cells are the "VarCharValue" entries returned by Athena (None
        when a cell has no such entry).

    Raises:
        KeyError: If one of the preset keys listed above is missing.
        RuntimeError: If the query finishes in the FAILED or CANCELLED state.
    """

    def execute_athena_query(athena_client, query: str, database: str, output_bucket: str) -> str:
        # Submit the query and hand back the execution id used for polling.
        response = athena_client.start_query_execution(
            QueryString=query,
            QueryExecutionContext={"Database": database},
            ResultConfiguration={"OutputLocation": output_bucket}
        )
        return response["QueryExecutionId"]

    def wait_for_athena_query_to_complete(athena_client, query_execution_id: str):
        # Poll once per second until the query reaches a terminal state.
        while True:
            response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
            status = response["QueryExecution"]["Status"]
            state = status["State"]
            if state == "SUCCEEDED":
                return
            if state in ("FAILED", "CANCELLED"):
                # Surface Athena's own explanation when it provides one;
                # without it the caller cannot tell why the query failed.
                reason = status.get("StateChangeReason")
                detail = f" ({reason})" if reason else ""
                raise RuntimeError(f"Query failed with state: {state}{detail}")
            time.sleep(1)

    def download_athena_query_results(athena_client, query_execution_id: str) -> pd.DataFrame:
        # Walk every result page; column names are read from the first page.
        paginator = athena_client.get_paginator("get_query_results")
        result_pages = paginator.paginate(QueryExecutionId=query_execution_id)
        rows = []
        columns = []
        for page in result_pages:
            if not columns:
                columns = [col["Name"] for col in page["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]
            rows.extend(page["ResultSet"]["Rows"])

        # The first accumulated row is skipped - presumably the header row
        # Athena returns for SELECT statements; verify for other statements.
        data = [[col.get("VarCharValue", None) for col in row["Data"]] for row in rows[1:]]
        return pd.DataFrame(data, columns=columns)

    aws_region = db_preset['aws_region']
    database = db_preset['database']
    output_bucket = db_preset['output_bucket']

    athena_client = boto3.client(
        'athena',
        region_name=aws_region,
        aws_access_key_id=db_preset['aws_access_key'],
        aws_secret_access_key=db_preset['aws_secret_key']
    )

    query_execution_id = execute_athena_query(athena_client, query, database, output_bucket)
    wait_for_athena_query_to_complete(athena_client, query_execution_id)
    return download_athena_query_results(athena_client, query_execution_id)
|
450
|
+
|
440
451
|
# Assume the configuration file is located at ~/.rgwfuncsrc
|
441
452
|
config_path = os.path.expanduser('~/.rgwfuncsrc')
|
442
453
|
with open(config_path, 'r') as f:
|
@@ -459,6 +470,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
459
470
|
return query_clickhouse(db_preset, query)
|
460
471
|
elif db_type == 'google_big_query':
|
461
472
|
return query_google_big_query(db_preset, query)
|
473
|
+
elif db_type == 'aws_athena':
|
474
|
+
return query_athena(db_preset, query)
|
462
475
|
else:
|
463
476
|
raise ValueError(f"Unsupported db_type: {db_type}")
|
464
477
|
|
@@ -835,7 +848,12 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
|
|
835
848
|
gc.collect()
|
836
849
|
|
837
850
|
|
838
|
-
def send_dataframe_via_telegram(
|
851
|
+
def send_dataframe_via_telegram(
|
852
|
+
df: pd.DataFrame,
|
853
|
+
bot_name: str,
|
854
|
+
message: Optional[str] = None,
|
855
|
+
as_file: bool = True,
|
856
|
+
remove_after_send: bool = True) -> None:
|
839
857
|
"""
|
840
858
|
Send a DataFrame via Telegram using a specified bot configuration.
|
841
859
|
|
@@ -1673,7 +1691,11 @@ def print_n_frequency_cascading(
|
|
1673
1691
|
print(json.dumps(report, indent=2))
|
1674
1692
|
|
1675
1693
|
|
1676
|
-
def print_n_frequency_linear(
|
1694
|
+
def print_n_frequency_linear(
|
1695
|
+
df: pd.DataFrame,
|
1696
|
+
n: int,
|
1697
|
+
columns: list,
|
1698
|
+
order_by: str = "FREQ_DESC") -> None:
|
1677
1699
|
"""
|
1678
1700
|
Print the linear frequency of top n values for specified columns.
|
1679
1701
|
|
@@ -1709,27 +1731,49 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: list, order_by:
|
|
1709
1731
|
|
1710
1732
|
return report
|
1711
1733
|
|
1734
|
+
def try_parse_numeric(val):
    """
    Attempt to parse a value as an integer or float.

    Values that are already int or float are returned unchanged, so a float
    is not truncated by int(). Otherwise int(val) is tried, then float(val);
    if neither conversion works (including for None) the value is returned
    as-is.
    """
    if isinstance(val, (int, float)):
        return val
    try:
        return int(val)
    except (ValueError, TypeError):
        try:
            return float(val)
        except (ValueError, TypeError):
            return val


def sort_frequency(frequency, order_by):
    """
    Return a new dict with the items of `frequency` in the requested order.

    Parameters:
        frequency: Mapping of key -> count.
        order_by: One of
            "BY_KEYS_ASC" / "BY_KEYS_DESC" - by key, comparing keys
                numerically where they parse as numbers;
            "ASC" / "DESC" - by raw key;
            "FREQ_ASC" - by count, ascending;
            anything else - by count, descending ("FREQ_DESC").
    """

    def numeric_aware_key(item):
        # Keys that parse as real numbers sort numerically and come before
        # all other keys, which sort by their string form. Tagging the two
        # groups keeps ints and strs from ever being compared directly
        # (that raises TypeError), and NaN - which is not equal to itself
        # and would make the ordering unpredictable - is treated as text.
        parsed = try_parse_numeric(item[0])
        if isinstance(parsed, (int, float)) and parsed == parsed:
            return (0, parsed, "")
        return (1, 0, str(item[0]))

    if order_by in {"BY_KEYS_ASC", "BY_KEYS_DESC"}:
        sorted_items = sorted(
            frequency.items(),
            key=numeric_aware_key,
            reverse=order_by == "BY_KEYS_DESC")
    elif order_by == "ASC":
        sorted_items = sorted(frequency.items(), key=lambda item: item[0])
    elif order_by == "DESC":
        sorted_items = sorted(
            frequency.items(),
            key=lambda item: item[0],
            reverse=True)
    elif order_by == "FREQ_ASC":
        sorted_items = sorted(frequency.items(), key=lambda item: item[1])
    else:  # Default to "FREQ_DESC"
        sorted_items = sorted(
            frequency.items(),
            key=lambda item: item[1],
            reverse=True)

    return dict(sorted_items)
|
1733
1777
|
|
1734
1778
|
report = generate_linear_report(df, columns, n, order_by)
|
1735
1779
|
print(json.dumps(report, indent=2))
|
@@ -1879,7 +1923,10 @@ def right_join(
|
|
1879
1923
|
return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)
|
1880
1924
|
|
1881
1925
|
|
1882
|
-
def insert_dataframe_in_sqlite_database(
|
1926
|
+
def insert_dataframe_in_sqlite_database(
|
1927
|
+
db_path: str,
|
1928
|
+
tablename: str,
|
1929
|
+
df: pd.DataFrame) -> None:
|
1883
1930
|
"""
|
1884
1931
|
Inserts a Pandas DataFrame into a SQLite database table.
|
1885
1932
|
|
@@ -1941,7 +1988,10 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
|
|
1941
1988
|
df.to_sql(tablename, conn, if_exists='append', index=False)
|
1942
1989
|
|
1943
1990
|
|
1944
|
-
def sync_dataframe_to_sqlite_database(
|
1991
|
+
def sync_dataframe_to_sqlite_database(
|
1992
|
+
db_path: str,
|
1993
|
+
tablename: str,
|
1994
|
+
df: pd.DataFrame) -> None:
|
1945
1995
|
"""
|
1946
1996
|
Processes and saves a DataFrame to an SQLite database, adding a timestamp column
|
1947
1997
|
and replacing the existing table if needed. Creates the table if it does not exist.
|
rgwfuncs/docs_lib.py
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
import os
|
2
|
+
import inspect
|
3
|
+
from typing import Optional
|
4
|
+
import warnings
|
5
|
+
|
6
|
+
# Suppress all FutureWarnings
|
7
|
+
warnings.filterwarnings("ignore", category=FutureWarning)
|
8
|
+
|
9
|
+
|
10
|
+
def docs(method_type_filter: Optional[str] = None) -> None:
    """
    Print the function names of every module in this package, in
    alphabetical order, grouped under a "# <module>.py" heading.

    If method_type_filter is given, also print the docstrings of the
    functions whose name contains the filter as a substring. Using '*' as
    the filter prints the docstrings of all functions. Functions without a
    docstring are listed by name only.

    Parameters:
        method_type_filter: Optional substring to match against function
            names, or '*' to display docstrings for all functions.
    """
    import importlib

    # Directory containing the package's modules
    module_dir = os.path.dirname(__file__)

    for filename in sorted(os.listdir(module_dir)):
        if not filename.endswith('.py') or filename == '__init__.py':
            continue

        module_name, _ = os.path.splitext(filename)
        print(f"\n# {module_name}.py")

        module_path = f"rgwfuncs.{module_name}"
        module = importlib.import_module(module_path)

        # Keep only functions defined in the module itself, not ones it
        # merely imported from elsewhere.
        functions = {
            name: obj for name, obj
            in inspect.getmembers(module, inspect.isfunction)
            if obj.__module__ == module_path
        }
        function_names = sorted(functions)

        # Always list the names, so a call without a filter shows what is
        # available rather than just the module headings.
        for name in function_names:
            print(name)

        if not method_type_filter:
            continue

        for name in function_names:
            if method_type_filter == '*' or method_type_filter in name:
                docstring: Optional[str] = functions[name].__doc__
                if docstring:
                    print(f"\n{name}:\n{docstring}")
|
rgwfuncs/interactive_shell_lib.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
import code
|
2
|
+
import readline
|
3
|
+
import rlcompleter # noqa: F401
|
4
|
+
import sys # noqa: F401
|
5
|
+
from typing import Dict, Any
|
6
|
+
from .df_lib import * # noqa: F401, F403, E402
|
7
|
+
from .algebra_lib import * # noqa: F401, F403, E402
|
8
|
+
from .str_lib import * # noqa: F401, F403, E402
|
9
|
+
from .docs_lib import * # noqa: F401, F403, E402
|
10
|
+
|
11
|
+
|
12
|
+
def interactive_shell(local_vars: Dict[str, Any]) -> None:
    """
    Launch an interactive prompt for inspecting and modifying the given
    variables, with every name from this module's global namespace (which
    includes the star-imported rgwfuncs functions) available as well.

    The passed dictionary itself is used as the console namespace, so it is
    extended with the module-level names and reflects any assignments made
    during the session. Names already present in local_vars are kept as
    they are and are never replaced by a module-level name.

    Parameters:
        local_vars (dict): Dictionary of variables to expose in the shell.

    Raises:
        TypeError: If local_vars is not a dictionary.
    """
    if not isinstance(local_vars, dict):
        raise TypeError("local_vars must be a dictionary")

    readline.parse_and_bind("tab: complete")

    # Make imported functions available in the REPL. setdefault (rather than
    # update) lets the caller's own variables win on a name clash, so they
    # are not silently replaced by a same-named module global.
    for name, value in globals().items():
        local_vars.setdefault(name, value)

    # Create the interactive console on the caller's namespace and start it
    console = code.InteractiveConsole(locals=local_vars)
    console.interact(banner="Welcome to the rgwfuncs interactive shell.")
|
rgwfuncs/str_lib.py
CHANGED
@@ -1,53 +1,16 @@
|
|
1
1
|
import os
|
2
2
|
import json
|
3
3
|
import requests
|
4
|
-
import inspect
|
5
|
-
from typing import Tuple, Optional, Dict, Callable
|
4
|
+
from typing import Tuple
|
6
5
|
import warnings
|
7
6
|
|
8
7
|
# Suppress all FutureWarnings
|
9
8
|
warnings.filterwarnings("ignore", category=FutureWarning)
|
10
9
|
|
11
10
|
|
12
|
-
def str_docs(method_type_filter: Optional[str] = None) -> None:
|
13
|
-
"""
|
14
|
-
Print a list of function names in alphabetical order. If method_type_filter
|
15
|
-
is specified, print the docstrings of the functions that match the filter.
|
16
|
-
Using '*' as a filter will print the docstrings for all functions.
|
17
|
-
|
18
|
-
Parameters:
|
19
|
-
method_type_filter: Optional filter string representing a function name,
|
20
|
-
or '*' to display docstrings for all functions.
|
21
|
-
"""
|
22
|
-
# Get the current module's namespace
|
23
|
-
current_module = __name__
|
24
|
-
|
25
|
-
local_functions: Dict[str, Callable] = {
|
26
|
-
name: obj for name, obj in globals().items()
|
27
|
-
if inspect.isfunction(obj) and obj.__module__ == current_module
|
28
|
-
}
|
29
|
-
|
30
|
-
# List of function names sorted alphabetically
|
31
|
-
function_names = sorted(local_functions.keys())
|
32
|
-
|
33
|
-
# Print function names
|
34
|
-
print("Functions in alphabetical order:")
|
35
|
-
for name in function_names:
|
36
|
-
print(name)
|
37
|
-
|
38
|
-
# If a filter is provided or '*', print the docstrings of functions
|
39
|
-
if method_type_filter:
|
40
|
-
# print("\nFiltered function documentation:")
|
41
|
-
for name, func in local_functions.items():
|
42
|
-
docstring: Optional[str] = func.__doc__
|
43
|
-
if docstring:
|
44
|
-
if method_type_filter == '*' or method_type_filter == name:
|
45
|
-
# Print the entire docstring for the matching function
|
46
|
-
print(f"\n{name}:\n{docstring}")
|
47
|
-
|
48
|
-
|
49
11
|
def send_telegram_message(preset_name: str, message: str) -> None:
|
50
|
-
"""
|
12
|
+
"""
|
13
|
+
Send a Telegram message using the specified preset.
|
51
14
|
|
52
15
|
Args:
|
53
16
|
preset_name (str): The name of the preset to use for sending the message.
|
@@ -73,19 +36,20 @@ def send_telegram_message(preset_name: str, message: str) -> None:
|
|
73
36
|
return preset
|
74
37
|
return None
|
75
38
|
|
76
|
-
def get_telegram_bot_details(
|
39
|
+
def get_telegram_bot_details(
        config: dict, preset_name: str) -> Tuple[str, str]:
    """
    Look up the named Telegram preset and return (bot_token, chat_id).

    Raises RuntimeError when the preset cannot be found, or when its
    "bot_token" or "chat_id" entry is missing or empty.
    """
    selected = get_telegram_preset(config, preset_name)
    if selected:
        token = selected.get("bot_token")
        chat = selected.get("chat_id")
        if token and chat:
            return token, chat
        raise RuntimeError(
            f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file")
    raise RuntimeError(
        f"Telegram bot preset '{preset_name}' not found in the configuration file")
|
91
55
|
|