rgwfuncs 0.0.21__py3-none-any.whl → 0.0.54__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
rgwfuncs/df_lib.py CHANGED
@@ -21,51 +21,15 @@ from email.mime.base import MIMEBase
21
21
  from email import encoders
22
22
  from googleapiclient.discovery import build
23
23
  import base64
24
- import inspect
25
- from typing import Optional, Callable, Dict, List, Tuple, Any
24
+ import boto3
25
+ # import inspect
26
+ from typing import Optional, Dict, List, Tuple, Any
26
27
  import warnings
27
28
 
28
29
  # Suppress all FutureWarnings
29
30
  warnings.filterwarnings("ignore", category=FutureWarning)
30
31
 
31
32
 
32
- def df_docs(method_type_filter: Optional[str] = None) -> None:
33
- """
34
- Print a list of function names in alphabetical order. If method_type_filter
35
- is specified, print the docstrings of the functions that match the filter.
36
- Using '*' as a filter will print the docstrings for all functions.
37
-
38
- Parameters:
39
- method_type_filter: Optional filter string representing a function name,
40
- or '*' to display docstrings for all functions.
41
- """
42
- # Get the current module's namespace
43
- current_module = __name__
44
-
45
- local_functions: Dict[str, Callable] = {
46
- name: obj for name, obj in globals().items()
47
- if inspect.isfunction(obj) and obj.__module__ == current_module
48
- }
49
-
50
- # List of function names sorted alphabetically
51
- function_names = sorted(local_functions.keys())
52
-
53
- # Print function names
54
- print("Functions in alphabetical order:")
55
- for name in function_names:
56
- print(name)
57
-
58
- # If a filter is provided or '*', print the docstrings of functions
59
- if method_type_filter:
60
- # print("\nFiltered function documentation:")
61
- for name, func in local_functions.items():
62
- docstring: Optional[str] = func.__doc__
63
- if docstring:
64
- if method_type_filter == '*' or method_type_filter == name:
65
- # Print the entire docstring for the matching function
66
- print(f"\n{name}:\n{docstring}")
67
-
68
-
69
33
  def numeric_clean(
70
34
  df: pd.DataFrame,
71
35
  column_names: str,
@@ -421,8 +385,7 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
421
385
  raise ConnectionError(
422
386
  "All attempts to connect to ClickHouse failed.")
423
387
 
424
- def query_google_big_query(
425
- db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
388
+ def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
426
389
  json_file_path = db_preset['json_file_path']
427
390
  project_id = db_preset['project_id']
428
391
 
@@ -437,6 +400,54 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
437
400
 
438
401
  return pd.DataFrame(rows, columns=columns)
439
402
 
403
+ def query_athena(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
404
+
405
+ def execute_athena_query(athena_client, query: str, database: str, output_bucket: str) -> str:
406
+ response = athena_client.start_query_execution(
407
+ QueryString=query,
408
+ QueryExecutionContext={"Database": database},
409
+ ResultConfiguration={"OutputLocation": output_bucket}
410
+ )
411
+ return response["QueryExecutionId"]
412
+
413
+ def wait_for_athena_query_to_complete(athena_client, query_execution_id: str):
414
+ while True:
415
+ response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
416
+ state = response["QueryExecution"]["Status"]["State"]
417
+ if state == "SUCCEEDED":
418
+ break
419
+ elif state in ("FAILED", "CANCELLED"):
420
+ raise Exception(f"Query failed with state: {state}")
421
+ time.sleep(1)
422
+
423
+ def download_athena_query_results(athena_client, query_execution_id: str) -> pd.DataFrame:
424
+ paginator = athena_client.get_paginator("get_query_results")
425
+ result_pages = paginator.paginate(QueryExecutionId=query_execution_id)
426
+ rows = []
427
+ columns = []
428
+ for page in result_pages:
429
+ if not columns:
430
+ columns = [col["Name"] for col in page["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]
431
+ rows.extend(page["ResultSet"]["Rows"])
432
+
433
+ data = [[col.get("VarCharValue", None) for col in row["Data"]] for row in rows[1:]]
434
+ return pd.DataFrame(data, columns=columns)
435
+
436
+ aws_region = db_preset['aws_region']
437
+ database = db_preset['database']
438
+ output_bucket = db_preset['output_bucket']
439
+
440
+ athena_client = boto3.client(
441
+ 'athena',
442
+ region_name=aws_region,
443
+ aws_access_key_id=db_preset['aws_access_key'],
444
+ aws_secret_access_key=db_preset['aws_secret_key']
445
+ )
446
+
447
+ query_execution_id = execute_athena_query(athena_client, query, database, output_bucket)
448
+ wait_for_athena_query_to_complete(athena_client, query_execution_id)
449
+ return download_athena_query_results(athena_client, query_execution_id)
450
+
440
451
  # Assume the configuration file is located at ~/.rgwfuncsrc
441
452
  config_path = os.path.expanduser('~/.rgwfuncsrc')
442
453
  with open(config_path, 'r') as f:
@@ -459,6 +470,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
459
470
  return query_clickhouse(db_preset, query)
460
471
  elif db_type == 'google_big_query':
461
472
  return query_google_big_query(db_preset, query)
473
+ elif db_type == 'aws_athena':
474
+ return query_athena(db_preset, query)
462
475
  else:
463
476
  raise ValueError(f"Unsupported db_type: {db_type}")
464
477
 
@@ -835,7 +848,12 @@ def print_dataframe(df: pd.DataFrame, source: Optional[str] = None) -> None:
835
848
  gc.collect()
836
849
 
837
850
 
838
- def send_dataframe_via_telegram(df: pd.DataFrame, bot_name: str, message: Optional[str] = None, as_file: bool = True, remove_after_send: bool = True) -> None:
851
+ def send_dataframe_via_telegram(
852
+ df: pd.DataFrame,
853
+ bot_name: str,
854
+ message: Optional[str] = None,
855
+ as_file: bool = True,
856
+ remove_after_send: bool = True) -> None:
839
857
  """
840
858
  Send a DataFrame via Telegram using a specified bot configuration.
841
859
 
@@ -1673,7 +1691,11 @@ def print_n_frequency_cascading(
1673
1691
  print(json.dumps(report, indent=2))
1674
1692
 
1675
1693
 
1676
- def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: list, order_by: str = "FREQ_DESC") -> None:
1694
+ def print_n_frequency_linear(
1695
+ df: pd.DataFrame,
1696
+ n: int,
1697
+ columns: list,
1698
+ order_by: str = "FREQ_DESC") -> None:
1677
1699
  """
1678
1700
  Print the linear frequency of top n values for specified columns.
1679
1701
 
@@ -1709,27 +1731,49 @@ def print_n_frequency_linear(df: pd.DataFrame, n: int, columns: list, order_by:
1709
1731
 
1710
1732
  return report
1711
1733
 
1734
+ def try_parse_numeric(val):
1735
+ """Attempt to parse a value as an integer or float."""
1736
+ try:
1737
+ return int(val)
1738
+ except ValueError:
1739
+ try:
1740
+ return float(val)
1741
+ except ValueError:
1742
+ return val
1743
+
1712
1744
  def sort_frequency(frequency, order_by):
1713
- if order_by == "ASC":
1714
- return dict(sorted(frequency.items(), key=lambda item: item[0]))
1715
- elif order_by == "DESC":
1716
- return dict(
1717
- sorted(
1745
+ # keys = frequency.keys()
1746
+
1747
+ # Convert keys to numerical values where possible, leaving `NaN` as a
1748
+ # special string
1749
+ # parsed_keys = [(try_parse_numeric(key), key) for key in keys]
1750
+
1751
+ if order_by in {"BY_KEYS_ASC", "BY_KEYS_DESC"}:
1752
+ reverse = order_by == "BY_KEYS_DESC"
1753
+ sorted_items = sorted(
1754
+ frequency.items(),
1755
+ key=lambda item: try_parse_numeric(
1756
+ item[0]),
1757
+ reverse=reverse)
1758
+ else:
1759
+ if order_by == "ASC":
1760
+ sorted_items = sorted(
1761
+ frequency.items(), key=lambda item: item[0])
1762
+ elif order_by == "DESC":
1763
+ sorted_items = sorted(
1718
1764
  frequency.items(),
1719
1765
  key=lambda item: item[0],
1720
- reverse=True))
1721
- elif order_by == "FREQ_ASC":
1722
- return dict(sorted(frequency.items(), key=lambda item: item[1]))
1723
- elif order_by == "BY_KEYS_ASC":
1724
- return dict(sorted(frequency.items()))
1725
- elif order_by == "BY_KEYS_DESC":
1726
- return dict(sorted(frequency.items(), reverse=True))
1727
- else: # Default to "FREQ_DESC"
1728
- return dict(
1729
- sorted(
1766
+ reverse=True)
1767
+ elif order_by == "FREQ_ASC":
1768
+ sorted_items = sorted(
1769
+ frequency.items(), key=lambda item: item[1])
1770
+ else: # Default to "FREQ_DESC"
1771
+ sorted_items = sorted(
1730
1772
  frequency.items(),
1731
1773
  key=lambda item: item[1],
1732
- reverse=True))
1774
+ reverse=True)
1775
+
1776
+ return dict(sorted_items)
1733
1777
 
1734
1778
  report = generate_linear_report(df, columns, n, order_by)
1735
1779
  print(json.dumps(report, indent=2))
@@ -1879,7 +1923,10 @@ def right_join(
1879
1923
  return df1.merge(df2, how='right', left_on=left_on, right_on=right_on)
1880
1924
 
1881
1925
 
1882
- def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
1926
+ def insert_dataframe_in_sqlite_database(
1927
+ db_path: str,
1928
+ tablename: str,
1929
+ df: pd.DataFrame) -> None:
1883
1930
  """
1884
1931
  Inserts a Pandas DataFrame into a SQLite database table.
1885
1932
 
@@ -1941,7 +1988,10 @@ def insert_dataframe_in_sqlite_database(db_path: str, tablename: str, df: pd.Dat
1941
1988
  df.to_sql(tablename, conn, if_exists='append', index=False)
1942
1989
 
1943
1990
 
1944
- def sync_dataframe_to_sqlite_database(db_path: str, tablename: str, df: pd.DataFrame) -> None:
1991
+ def sync_dataframe_to_sqlite_database(
1992
+ db_path: str,
1993
+ tablename: str,
1994
+ df: pd.DataFrame) -> None:
1945
1995
  """
1946
1996
  Processes and saves a DataFrame to an SQLite database, adding a timestamp column
1947
1997
  and replacing the existing table if needed. Creates the table if it does not exist.
rgwfuncs/docs_lib.py ADDED
@@ -0,0 +1,51 @@
1
+ import os
2
+ import inspect
3
+ from typing import Optional
4
+ import warnings
5
+
6
+ # Suppress all FutureWarnings
7
+ warnings.filterwarnings("ignore", category=FutureWarning)
8
+
9
+
10
+ def docs(method_type_filter: Optional[str] = None) -> None:
11
+ """
12
+ Print a list of function names in alphabetical order from all modules.
13
+ If method_type_filter is specified, print the docstrings of the functions
14
+ that match the filter based on a substring. Using '*' as a filter will print
15
+ the docstrings for all functions.
16
+
17
+ Parameters:
18
+ method_type_filter: Optional filter string representing a filter for
19
+ function names, or '*' to display docstrings for all functions.
20
+ """
21
+
22
+ # Directory containing your modules
23
+ module_dir = os.path.dirname(__file__)
24
+
25
+ # Iterate over each file in the module directory
26
+ for filename in sorted(os.listdir(module_dir)):
27
+ if filename.endswith('.py') and filename != '__init__.py':
28
+ module_name, _ = os.path.splitext(filename)
29
+ print(f"\n# {module_name}.py")
30
+
31
+ # Import the module
32
+ module_path = f"rgwfuncs.{module_name}"
33
+ module = __import__(module_path, fromlist=[module_name])
34
+
35
+ # Get all functions from the module
36
+ functions = {
37
+ name: obj for name, obj
38
+ in inspect.getmembers(module, inspect.isfunction)
39
+ if obj.__module__ == module_path
40
+ }
41
+
42
+ # List function names
43
+ function_names = sorted(functions.keys())
44
+ for name in function_names:
45
+ # If a filter is provided or '*', check if the function name
46
+ # contains the filter
47
+ if method_type_filter and (
48
+ method_type_filter == '*' or method_type_filter in name):
49
+ docstring: Optional[str] = functions[name].__doc__
50
+ if docstring:
51
+ print(f"\n{name}:\n{docstring}")
@@ -0,0 +1,32 @@
1
+ import code
2
+ import readline
3
+ import rlcompleter # noqa: F401
4
+ import sys # noqa: F401
5
+ from typing import Dict, Any
6
+ from .df_lib import * # noqa: F401, F403, E402
7
+ from .algebra_lib import * # noqa: F401, F403, E402
8
+ from .str_lib import * # noqa: F401, F403, E402
9
+ from .docs_lib import * # noqa: F401, F403, E402
10
+
11
+
12
+ def interactive_shell(local_vars: Dict[str, Any]) -> None:
13
+ """
14
+ Launches an interactive prompt for inspecting and modifying local variables, making all methods
15
+ in the rgwfuncs library available by default.
16
+
17
+ Parameters:
18
+ local_vars (dict): Dictionary of local variables to be available in the interactive shell.
19
+ """
20
+ if not isinstance(local_vars, dict):
21
+ raise TypeError("local_vars must be a dictionary")
22
+
23
+ readline.parse_and_bind("tab: complete")
24
+
25
+ # Make imported functions available in the REPL
26
+ local_vars.update(globals())
27
+
28
+ # Create interactive console with local context
29
+ console = code.InteractiveConsole(locals=local_vars)
30
+
31
+ # Start interactive session
32
+ console.interact(banner="Welcome to the rgwfuncs interactive shell.")
rgwfuncs/str_lib.py CHANGED
@@ -1,53 +1,16 @@
1
1
  import os
2
2
  import json
3
3
  import requests
4
- import inspect
5
- from typing import Tuple, Optional, Dict, Callable
4
+ from typing import Tuple
6
5
  import warnings
7
6
 
8
7
  # Suppress all FutureWarnings
9
8
  warnings.filterwarnings("ignore", category=FutureWarning)
10
9
 
11
10
 
12
- def str_docs(method_type_filter: Optional[str] = None) -> None:
13
- """
14
- Print a list of function names in alphabetical order. If method_type_filter
15
- is specified, print the docstrings of the functions that match the filter.
16
- Using '*' as a filter will print the docstrings for all functions.
17
-
18
- Parameters:
19
- method_type_filter: Optional filter string representing a function name,
20
- or '*' to display docstrings for all functions.
21
- """
22
- # Get the current module's namespace
23
- current_module = __name__
24
-
25
- local_functions: Dict[str, Callable] = {
26
- name: obj for name, obj in globals().items()
27
- if inspect.isfunction(obj) and obj.__module__ == current_module
28
- }
29
-
30
- # List of function names sorted alphabetically
31
- function_names = sorted(local_functions.keys())
32
-
33
- # Print function names
34
- print("Functions in alphabetical order:")
35
- for name in function_names:
36
- print(name)
37
-
38
- # If a filter is provided or '*', print the docstrings of functions
39
- if method_type_filter:
40
- # print("\nFiltered function documentation:")
41
- for name, func in local_functions.items():
42
- docstring: Optional[str] = func.__doc__
43
- if docstring:
44
- if method_type_filter == '*' or method_type_filter == name:
45
- # Print the entire docstring for the matching function
46
- print(f"\n{name}:\n{docstring}")
47
-
48
-
49
11
  def send_telegram_message(preset_name: str, message: str) -> None:
50
- """Send a Telegram message using the specified preset.
12
+ """
13
+ Send a Telegram message using the specified preset.
51
14
 
52
15
  Args:
53
16
  preset_name (str): The name of the preset to use for sending the message.
@@ -73,19 +36,20 @@ def send_telegram_message(preset_name: str, message: str) -> None:
73
36
  return preset
74
37
  return None
75
38
 
76
- def get_telegram_bot_details(config: dict, preset_name: str) -> Tuple[str, str]:
39
+ def get_telegram_bot_details(
40
+ config: dict, preset_name: str) -> Tuple[str, str]:
77
41
  """Retrieve the Telegram bot token and chat ID from the preset."""
78
42
  preset = get_telegram_preset(config, preset_name)
79
43
  if not preset:
80
- raise RuntimeError(f"Telegram bot preset '{preset_name}' not found in the configuration file")
44
+ raise RuntimeError(
45
+ f"Telegram bot preset '{preset_name}' not found in the configuration file")
81
46
 
82
47
  bot_token = preset.get("bot_token")
83
48
  chat_id = preset.get("chat_id")
84
49
 
85
50
  if not bot_token or not chat_id:
86
51
  raise RuntimeError(
87
- f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file"
88
- )
52
+ f"Telegram bot token or chat ID for '{preset_name}' not found in the configuration file")
89
53
 
90
54
  return bot_token, chat_id
91
55