rgwfuncs 0.0.21__py3-none-any.whl → 0.0.54__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
rgwfuncs/df_lib.py CHANGED
@@ -21,51 +21,15 @@ from email.mime.base import MIMEBase
21
21
  from email import encoders
22
22
  from googleapiclient.discovery import build
23
23
  import base64
24
- import inspect
25
- from typing import Optional, Callable, Dict, List, Tuple, Any
24
+ import boto3
25
+ # import inspect
26
+ from typing import Optional, Dict, List, Tuple, Any
26
27
  import warnings
27
28
 
28
29
  # Suppress all FutureWarnings
29
30
  warnings.filterwarnings("ignore", category=FutureWarning)
30
31
 
31
32
 
32
- def df_docs(method_type_filter: Optional[str] = None) -> None:
33
- """
34
- Print a list of function names in alphabetical order. If method_type_filter
35
- is specified, print the docstrings of the functions that match the filter.
36
- Using '*' as a filter will print the docstrings for all functions.
37
-
38
- Parameters:
39
- method_type_filter: Optional filter string representing a function name,
40
- or '*' to display docstrings for all functions.
41
- """
42
- # Get the current module's namespace
43
- current_module = __name__
44
-
45
- local_functions: Dict[str, Callable] = {
46
- name: obj for name, obj in globals().items()
47
- if inspect.isfunction(obj) and obj.__module__ == current_module
48
- }
49
-
50
- # List of function names sorted alphabetically
51
- function_names = sorted(local_functions.keys())
52
-
53
- # Print function names
54
- print("Functions in alphabetical order:")
55
- for name in function_names:
56
- print(name)
57
-
58
- # If a filter is provided or '*', print the docstrings of functions
59
- if method_type_filter:
60
- # print("\nFiltered function documentation:")
61
- for name, func in local_functions.items():
62
- docstring: Optional[str] = func.__doc__
63
- if docstring:
64
- if method_type_filter == '*' or method_type_filter == name:
65
- # Print the entire docstring for the matching function
66
- print(f"\n{name}:\n{docstring}")
67
-
68
-
69
33
  def numeric_clean(
70
34
  df: pd.DataFrame,
71
35
  column_names: str,
@@ -421,8 +385,7 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
421
385
  raise ConnectionError(
422
386
  "All attempts to connect to ClickHouse failed.")
423
387
 
424
- def query_google_big_query(
425
- db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
388
+ def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
426
389
  json_file_path = db_preset['json_file_path']
427
390
  project_id = db_preset['project_id']
428
391
 
@@ -437,6 +400,54 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
437
400
 
438
401
  return pd.DataFrame(rows, columns=columns)
439
402
 
403
+ def query_athena(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
404
+
405
+ def execute_athena_query(athena_client, query: str, database: str, output_bucket: str) -> str:
406
+ response = athena_client.start_query_execution(
407
+ QueryString=query,
408
+ QueryExecutionContext={"Database": database},
409
+ ResultConfiguration={"OutputLocation": output_bucket}
410
+ )
411
+ return response["QueryExecutionId"]
412
+
413
+ def wait_for_athena_query_to_complete(athena_client, query_execution_id: str):
414
+ while True:
415
+ response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
416
+ state = response["QueryExecution"]["Status"]["State"]
417
+ if state == "SUCCEEDED":
418
+ break
419
+ elif state in ("FAILED", "CANCELLED"):
420
+ raise Exception(f"Query failed with state: {state}")
421
+ time.sleep(1)
422
+
423
+ def download_athena_query_results(athena_client, query_execution_id: str) -> pd.DataFrame:
424
+ paginator = athena_client.get_paginator("get_query_results")
425
+ result_pages = paginator.paginate(QueryExecutionId=query_execution_id)
426
+ rows = []
427
+ columns = []
428
+ for page in result_pages:
429
+ if not columns:
430
+ columns = [col["Name"] for col in page["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]
431
+ rows.extend(page["ResultSet"]["Rows"])
432
+
433
+ data = [[col.get("VarCharValue", None) for col in row["Data"]] for row in rows[1:]]
434
+ return pd.DataFrame(data, columns=columns)
435
+
436
+ aws_region = db_preset['aws_region']
437
+ database = db_preset['database']
438
+ output_bucket = db_preset['output_bucket']
439
+
440
+ athena_client = boto3.client(
441
+ 'athena',
442
+ region_name=aws_region,
443
+ aws_access_key_id=db_preset['aws_access_key'],
444
+ aws_secret_access_key=db_preset['aws_secret_key']
445
+ )
446
+
447
+ query_execution_id = execute_athena_query(athena_client, query, database, output_bucket)
448
+ wait_for_athena_query_to_complete(athena_client, query_execution_id)
449
+ return download_athena_query_results(athena_client, query_execution_id)
450
+
440
451
  # Assume the configuration file is located at ~/.rgwfuncsrc
441
452
  config_path = os.path.expanduser('~/.rgwfuncsrc')
442
453
  with open(config_path, 'r') as f:
@@ -459,6 +470,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
459
470
  return query_clickhouse(db_preset, query)
460
471
  elif db_type == 'google_big_query':
461
472
  return query_google_big_query(db_preset, query)
473
+ elif db_type == 'aws_athena':
474
+ return query_athena(db_preset, query)
462
475
  else:
463
476
  raise ValueError(f"Unsupported db_type: {db_type}")
464
477
 
@@ -835,7 +848,12 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
835
848
  gc.collect()
836
849
 
837
850
 
838
- def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
851
+ def send_dataframe_via_telegram(
852
+ df: pd.DataFrame,
853
+ bot_name: str,
854
+ message: Optional[str] = None,
855
+ as_file: bool = True,
856
+ remove_after_send: bool = True) -> None:
839
857
  """
840
858
  Send a DataFrame via Telegram using a specified bot configuration.
841
859
 
@@ -1673,7 +1691,11 @@ def print_n_frequency_cascading(
1673
1691
  print(json.dumps(report, indent=2))
1674
1692
 
1675
1693
 
1676
- def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: list, order_by: str = "FREQ_DESC") -> None:
1694
+ def print_n_frequency_linear(
1695
+ df: pd.DataFrame,
1696
+ n: int,
1697
+ columns: list,
1698
+ order_by: str = "FREQ_DESC") -> None:
1677
1699
  """
1678
1700
  Print the linear frequency of top n values for specified columns.
1679
1701
 
@@ -1709,27 +1731,49 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: list, order_by:
1709
1731
 
1710
1732
  return report
1711
1733
 
1734
+ def try_parse_numeric(val):
1735
+ """Attempt to parse a value as an integer or float."""
1736
+ try:
1737
+ return int(val)
1738
+ except ValueError:
1739
+ try:
1740
+ return float(val)
1741
+ except ValueError:
1742
+ return val
1743
+
1712
1744
  def sort_frequency(frequency, order_by):
1713
- if order_by == "ASC":
1714
- return dict(sorted(frequency.items(), key=lambda item: item[0]))
1715
- elif order_by == "DESC":
1716
- return dict(
1717
- sorted(
1745
+ # keys = frequency.keys()
1746
+
1747
+ # Convert keys to numerical values where possible, leaving `NaN` as a
1748
+ # special string
1749
+ # parsed_keys = [(try_parse_numeric(key), key) for key in keys]
1750
+
1751
+ if order_by in {"BY_KEYS_ASC", "BY_KEYS_DESC"}:
1752
+ reverse = order_by == "BY_KEYS_DESC"
1753
+ sorted_items = sorted(
1754
+ frequency.items(),
1755
+ key=lambda item: try_parse_numeric(
1756
+ item[0]),
1757
+ reverse=reverse)
1758
+ else:
1759
+ if order_by == "ASC":
1760
+ sorted_items = sorted(
1761
+ frequency.items(), key=lambda item: item[0])
1762
+ elif order_by == "DESC":
1763
+ sorted_items = sorted(
1718
1764
  frequency.items(),
1719
1765
  key=lambda item: item[0],
1720
- reverse=True))
1721
- elif order_by == "FREQ_ASC":
1722
- return dict(sorted(frequency.items(), key=lambda item: item[1]))
1723
- elif order_by == "BY_KEYS_ASC":
1724
- return dict(sorted(frequency.items()))
1725
- elif order_by == "BY_KEYS_DESC":
1726
- return dict(sorted(frequency.items(), reverse=True))
1727
- else: # Default to "FREQ_DESC"
1728
- return dict(
1729
- sorted(
1766
+ reverse=True)
1767
+ elif order_by == "FREQ_ASC":
1768
+ sorted_items = sorted(
1769
+ frequency.items(), key=lambda item: item[1])
1770
+ else: # Default to "FREQ_DESC"
1771
+ sorted_items = sorted(
1730
1772
  frequency.items(),
1731
1773
  key=lambda item: item[1],
1732
- reverse=True))
1774
+ reverse=True)
1775
+
1776
+ return dict(sorted_items)
1733
1777
 
1734
1778
  report = generate_linear_report(df, columns, n, order_by)
1735
1779
  print(json.dumps(report, indent=2))
@@ -1879,7 +1923,10 @@ def right_join(
1879
1923
  return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)
1880
1924
 
1881
1925
 
1882
- def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
1926
+ def insert_dataframe_in_sqlite_database(
1927
+ db_path: str,
1928
+ tablename: str,
1929
+ df: pd.DataFrame) -> None:
1883
1930
  """
1884
1931
  Inserts a Pandas DataFrame into a SQLite database table.
1885
1932
 
@@ -1941,7 +1988,10 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
1941
1988
  df.to_sql(tablename, conn, if_exists='append', index=False)
1942
1989
 
1943
1990
 
1944
- def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
1991
+ def sync_dataframe_to_sqlite_database(
1992
+ db_path: str,
1993
+ tablename: str,
1994
+ df: pd.DataFrame) -> None:
1945
1995
  """
1946
1996
  Processes and saves a DataFrame to an SQLite database, adding a timestamp column
1947
1997
  and replacing the existing table if needed. Creates the table if it does not exist.
rgwfuncs/docs_lib.py ADDED
@@ -0,0 +1,51 @@
1
+ import os
2
+ import inspect
3
+ from typing import Optional
4
+ import warnings
5
+
6
+ # Suppress all FutureWarnings
7
+ warnings.filterwarnings("ignore", category=FutureWarning)
8
+
9
+
10
+ def docs(method_type_filter: Optional[str] = None) -> None:
11
+ """
12
+ Print a list of function names in alphabetical order from all modules.
13
+ If method_type_filter is specified, print the docstrings of the functions
14
+ that match the filter based on a substring. Using '*' as a filter will print
15
+ the docstrings for all functions.
16
+
17
+ Parameters:
18
+ method_type_filter: Optional filter string representing a filter for
19
+ function names, or '*' to display docstrings for all functions.
20
+ """
21
+
22
+ # Directory containing your modules
23
+ module_dir = os.path.dirname(__file__)
24
+
25
+ # Iterate over each file in the module directory
26
+ for filename in sorted(os.listdir(module_dir)):
27
+ if filename.endswith('.py') and filename != '__init__.py':
28
+ module_name, _ = os.path.splitext(filename)
29
+ print(f"\n# {module_name}.py")
30
+
31
+ # Import the module
32
+ module_path = f"rgwfuncs.{module_name}"
33
+ module = __import__(module_path, fromlist=[module_name])
34
+
35
+ # Get all functions from the module
36
+ functions = {
37
+ name: obj for name, obj
38
+ in inspect.getmembers(module, inspect.isfunction)
39
+ if obj.__module__ == module_path
40
+ }
41
+
42
+ # List function names
43
+ function_names = sorted(functions.keys())
44
+ for name in function_names:
45
+ # If a filter is provided or '*', check if the function name
46
+ # contains the filter
47
+ if method_type_filter and (
48
+ method_type_filter == '*' or method_type_filter in name):
49
+ docstring: Optional[str] = functions[name].__doc__
50
+ if docstring:
51
+ print(f"\n{name}:\n{docstring}")
@@ -0,0 +1,32 @@
1
+ import code
2
+ import readline
3
+ import rlcompleter # noqa: F401
4
+ import sys # noqa: F401
5
+ from typing import Dict, Any
6
+ from .df_lib import * # noqa: F401, F403, E402
7
+ from .algebra_lib import * # noqa: F401, F403, E402
8
+ from .str_lib import * # noqa: F401, F403, E402
9
+ from .docs_lib import * # noqa: F401, F403, E402
10
+
11
+
12
+ def interactive_shell(local_vars: Dict[str, Any]) -> None:
13
+ """
14
+ Launches an interactive prompt for inspecting and modifying local variables, making all methods
15
+ in the rgwfuncs library available by default.
16
+
17
+ Parameters:
18
+ local_vars (dict): Dictionary of local variables to be available in the interactive shell.
19
+ """
20
+ if not isinstance(local_vars, dict):
21
+ raise TypeError("local_vars must be a dictionary")
22
+
23
+ readline.parse_and_bind("tab: complete")
24
+
25
+ # Make imported functions available in the REPL
26
+ local_vars.update(globals())
27
+
28
+ # Create interactive console with local context
29
+ console = code.InteractiveConsole(locals=local_vars)
30
+
31
+ # Start interactive session
32
+ console.interact(banner="Welcome to the rgwfuncs interactive shell.")
rgwfuncs/str_lib.py CHANGED
@@ -1,53 +1,16 @@
1
1
  import os
2
2
  import json
3
3
  import requests
4
- import inspect
5
- from typing import Tuple, Optional, Dict, Callable
4
+ from typing import Tuple
6
5
  import warnings
7
6
 
8
7
  # Suppress all FutureWarnings
9
8
  warnings.filterwarnings("ignore", category=FutureWarning)
10
9
 
11
10
 
12
- def str_docs(method_type_filter: Optional[str] = None) -> None:
13
- """
14
- Print a list of function names in alphabetical order. If method_type_filter
15
- is specified, print the docstrings of the functions that match the filter.
16
- Using '*' as a filter will print the docstrings for all functions.
17
-
18
- Parameters:
19
- method_type_filter: Optional filter string representing a function name,
20
- or '*' to display docstrings for all functions.
21
- """
22
- # Get the current module's namespace
23
- current_module = __name__
24
-
25
- local_functions: Dict[str, Callable] = {
26
- name: obj for name, obj in globals().items()
27
- if inspect.isfunction(obj) and obj.__module__ == current_module
28
- }
29
-
30
- # List of function names sorted alphabetically
31
- function_names = sorted(local_functions.keys())
32
-
33
- # Print function names
34
- print("Functions in alphabetical order:")
35
- for name in function_names:
36
- print(name)
37
-
38
- # If a filter is provided or '*', print the docstrings of functions
39
- if method_type_filter:
40
- # print("\nFiltered function documentation:")
41
- for name, func in local_functions.items():
42
- docstring: Optional[str] = func.__doc__
43
- if docstring:
44
- if method_type_filter == '*' or method_type_filter == name:
45
- # Print the entire docstring for the matching function
46
- print(f"\n{name}:\n{docstring}")
47
-
48
-
49
11
  def send_telegram_message(preset_name: str, message: str) -> None:
50
- """Send a Telegram message using the specified preset.
12
+ """
13
+ Send a Telegram message using the specified preset.
51
14
 
52
15
  Args:
53
16
  preset_name (str): The name of the preset to use for sending the message.
@@ -73,19 +36,20 @@ def send_telegram_message(preset_name: str, message: str) -> None:
73
36
  return preset
74
37
  return None
75
38
 
76
- def get_telegram_bot_details(config: dict, preset_name: str) -> Tuple[str, str]:
39
+ def get_telegram_bot_details(
40
+ config: dict, preset_name: str) -> Tuple[str, str]:
77
41
  """Retrieve the Telegram bot token and chat ID from the preset."""
78
42
  preset = get_telegram_preset(config, preset_name)
79
43
  if not preset:
80
- raise RuntimeError(f"Telegram bot preset '{preset_name}' not found in the configuration file")
44
+ raise RuntimeError(
45
+ f"Telegram bot preset '{preset_name}' not found in the configuration file")
81
46
 
82
47
  bot_token = preset.get("bot_token")
83
48
  chat_id = preset.get("chat_id")
84
49
 
85
50
  if not bot_token or not chat_id:
86
51
  raise RuntimeError(
87
- f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file"
88
- )
52
+ f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file")
89
53
 
90
54
  return bot_token, chat_id
91
55