rgwfuncs 0.0.34__py3-none-any.whl → 0.0.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rgwfuncs/df_lib.py CHANGED
@@ -384,8 +384,7 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
384
384
  raise ConnectionError(
385
385
  "All attempts to connect to ClickHouse failed.")
386
386
 
387
- def query_google_big_query(
388
- db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
387
+ def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
389
388
  json_file_path = db_preset['json_file_path']
390
389
  project_id = db_preset['project_id']
391
390
 
@@ -400,6 +399,56 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
400
399
 
401
400
  return pd.DataFrame(rows, columns=columns)
402
401
 
402
+
403
+ def query_athena(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
404
+
405
+ def execute_athena_query(athena_client, query: str, database: str, output_bucket: str) -> str:
406
+ response = athena_client.start_query_execution(
407
+ QueryString=query,
408
+ QueryExecutionContext={"Database": database},
409
+ ResultConfiguration={"OutputLocation": output_bucket}
410
+ )
411
+ return response["QueryExecutionId"]
412
+
413
+ def wait_for_athena_query_to_complete(athena_client, query_execution_id: str):
414
+ while True:
415
+ response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
416
+ state = response["QueryExecution"]["Status"]["State"]
417
+ if state == "SUCCEEDED":
418
+ break
419
+ elif state in ("FAILED", "CANCELLED"):
420
+ raise Exception(f"Query failed with state: {state}")
421
+ time.sleep(1)
422
+
423
+ def download_athena_query_results(athena_client, query_execution_id: str) -> pd.DataFrame:
424
+ paginator = athena_client.get_paginator("get_query_results")
425
+ result_pages = paginator.paginate(QueryExecutionId=query_execution_id)
426
+ rows = []
427
+ columns = []
428
+ for page in result_pages:
429
+ if not columns:
430
+ columns = [col["Name"] for col in page["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]
431
+ rows.extend(page["ResultSet"]["Rows"])
432
+
433
+ data = [[col.get("VarCharValue", None) for col in row["Data"]] for row in rows[1:]]
434
+ return pd.DataFrame(data, columns=columns)
435
+
436
+
437
+ aws_region = db_preset['region']
438
+ database = db_preset['database']
439
+ output_bucket = db_preset['output_bucket']
440
+
441
+ athena_client = boto3.client(
442
+ 'athena',
443
+ region_name=aws_region,
444
+ aws_access_key_id=db_preset['aws_access_key'],
445
+ aws_secret_access_key=db_preset['aws_secret_key']
446
+ )
447
+
448
+ query_execution_id = execute_athena_query(athena_client, query, database, output_bucket)
449
+ wait_for_athena_query_to_complete(athena_client, query_execution_id)
450
+ return download_athena_query_results(athena_client, query_execution_id)
451
+
403
452
  # Assume the configuration file is located at ~/.rgwfuncsrc
404
453
  config_path = os.path.expanduser('~/.rgwfuncsrc')
405
454
  with open(config_path, 'r') as f:
@@ -422,6 +471,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
422
471
  return query_clickhouse(db_preset, query)
423
472
  elif db_type == 'google_big_query':
424
473
  return query_google_big_query(db_preset, query)
474
+ elif db_type == 'athena':
475
+ return query_athena(db_preset, query)
425
476
  else:
426
477
  raise ValueError(f"Unsupported db_type: {db_type}")
427
478
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: rgwfuncs
3
- Version: 0.0.34
3
+ Version: 0.0.36
4
4
  Summary: A functional programming paradigm for mathematical modelling and data science
5
5
  Home-page: https://github.com/ryangerardwilson/rgwfunc
6
6
  Author: Ryan Gerard Wilson
@@ -135,7 +135,7 @@ To display all docstrings, use:
135
135
 
136
136
  --------------------------------------------------------------------------------
137
137
 
138
- ## Documentation Access Functions
138
+ ## Documentation Access
139
139
 
140
140
  ### 1. docs
141
141
  Print a list of available function names in alphabetical order. If a filter is provided, print the docstrings of functions containing the term.
@@ -150,15 +150,13 @@ Print a list of available function names in alphabetical order. If a filter is p
150
150
 
151
151
  --------------------------------------------------------------------------------
152
152
 
153
- ## Interactive Shell Function
153
+ ## Interactive Shell
154
154
 
155
155
  This section includes functions that facilitate launching an interactive Python shell to inspect and modify local variables within the user's environment.
156
156
 
157
157
  ### 1. `interactive_shell`
158
158
 
159
- #### Usage
160
-
161
- Launches an interactive prompt for inspecting and modifying local variables, making all methods in the rgwfuncs library available by default. This REPL (Read-Eval-Print Loop) environment supports command history and autocompletion, making it easier to interact with your Python code.
159
+ Launches an interactive prompt for inspecting and modifying local variables, making all methods in the rgwfuncs library available by default. This REPL (Read-Eval-Print Loop) environment supports command history and autocompletion, making it easier to interact with your Python code. This function is particularly useful for debugging purposes when you want real-time interaction with your program's execution environment.
162
160
 
163
161
  • Parameters:
164
162
  - `local_vars` (dict, optional): A dictionary of local variables to be accessible within the interactive shell. If not provided, defaults to an empty dictionary.
@@ -180,18 +178,10 @@ Launches an interactive prompt for inspecting and modifying local variables, mak
180
178
  'city': ['New York', 'Los Angeles', 'Chicago', 'San Francisco', 'Boston']
181
179
  })
182
180
 
183
- # Function to retrieve the first n rows of a DataFrame
184
- def first_n_rows(df, n):
185
- return df.head(n).to_dict('records')
186
-
187
181
  # Launch the interactive shell with local variables
188
182
  interactive_shell(locals())
189
183
 
190
- - Once in the interactive shell, you are greeted with a welcome message. You can access the variables defined in the local scope where `interactive_shell(locals())` was called, including any imported modules such as `pandas` (accessed as `pd`) and `numpy` (accessed as `np`). This means you can directly use these modules in the interactive session. Type `exit()` to quit the shell.
191
-
192
- #### Interactive Shell Example Sessions
193
-
194
- 1. DataFrame Inspection with Pandas:
184
+ Subsequently, in the interactive shell you can use any library imported in your Python file, as well as all rgwfuncs methods (even if they are not imported). Note that while pandas and numpy are available in the shell because the above script imports them, the rgwfuncs method `first_n_rows` was not imported, yet it is still available for use.
195
185
 
196
186
  Welcome to the rgwfuncs interactive shell.
197
187
  >>> pirst_n_rows(df, 2)
@@ -208,23 +198,10 @@ Launches an interactive prompt for inspecting and modifying local variables, mak
208
198
  2 3 Charlie 35 Chicago
209
199
  3 4 David 28 San Francisco
210
200
  4 5 Eva 22 Boston
211
-
212
- 2. Array Operations with NumPy:
213
-
214
- Welcome to the rgwfuncs interactive shell.
215
201
  >>> arr = np.array([1, 2, 3, 4, 5])
216
202
  >>> arr
217
203
  array([1, 2, 3, 4, 5])
218
204
 
219
- These examples illustrate how you can use functions and variables within the interactive shell, handle errors with meaningful suggestions, and perform operations using external libraries like pandas and numpy.
220
-
221
- #### Key Features
222
-
223
- - Autocompletion: Uses the `rlcompleter` library to provide tab-completion of variables and functions names, enhancing ease of use.
224
- - History Support: Utilizes `readline` for command-line history, allowing you to navigate through previous commands using the arrow keys.
225
-
226
- This function is particularly useful for debugging purposes when you want real-time interaction with your program's execution environment.
227
-
228
205
  --------------------------------------------------------------------------------
229
206
 
230
207
  ## Algebra Based Functions
@@ -690,28 +667,49 @@ Drop duplicate rows based on specified columns, retaining the last occurrence.
690
667
 
691
668
  ### 11. `load_data_from_query`
692
669
 
693
- Load data from a database query into a DataFrame based on a configuration preset.
670
+ Load data from a specified database using a SQL query and return the results in a Pandas DataFrame. The database connection configurations are determined by a preset name specified in a configuration file.
694
671
 
695
- - **Parameters:**
696
- - `db_preset_name` (str): Name of the database preset in the configuration file.
697
- - `query` (str): The SQL query to execute.
672
+ #### Features
698
673
 
699
- - **Returns:**
700
- - `pd.DataFrame`: A DataFrame containing the query result.
674
+ - Multi-Database Support: This function supports different database types, including MSSQL, MySQL, ClickHouse, Google BigQuery, and AWS Athena, based on the configuration preset selected.
675
+ - Configuration-Based: It utilizes a configuration file to store database connection details securely, avoiding hardcoding sensitive information directly into the script.
676
+ - Dynamic Query Execution: Capable of executing custom user-defined SQL queries against the specified database.
677
+ - Automatic Result Loading: Fetches query results and loads them directly into a Pandas DataFrame for further manipulation and analysis.
701
678
 
702
- - **Notes:**
703
- - The configuration file is assumed to be located at `~/.rgwfuncsrc`.
679
+ #### Parameters
704
680
 
705
- - **Example:**
681
+ - `db_preset_name` (str): The name of the database preset found in the configuration file. This preset determines which database connection details to use.
682
+ - `query` (str): The SQL query string to be executed on the database.
683
+
684
+ #### Returns
685
+
686
+ - `pd.DataFrame`: Returns a DataFrame that contains the results from the executed SQL query.
687
+
688
+ #### Configuration Details
706
689
 
707
- from rgwfuncs import load_data_from_query
690
+ - The configuration file is expected to be in JSON format and located at `~/.rgwfuncsrc`.
691
+ - Each preset within the configuration file must include:
692
+ - `name`: Name of the database preset.
693
+ - `db_type`: Type of the database (`mssql`, `mysql`, `clickhouse`, `google_big_query`, `athena`).
694
+ - `credentials`: Necessary credentials such as host, username, password, and potentially others depending on the database type.
695
+
696
+ #### Example
697
+
698
+ from rgwfuncs import load_data_from_query
699
+
700
+ # Load data using a preset configuration
701
+ df = load_data_from_query(
702
+ db_preset_name="MyDBPreset",
703
+ query="SELECT * FROM my_table"
704
+ )
705
+ print(df)
708
706
 
709
- df = load_data_from_query(
710
- db_preset_name="MyDBPreset",
711
- query="SELECT * FROM my_table"
712
- )
713
- print(df)
707
+ #### Notes
714
708
 
709
+ - Security: Ensure that the configuration file (`~/.rgwfuncsrc`) is secure and accessible only to authorized users, as it contains sensitive information.
710
+ - Prerequisites: Ensure the necessary Python packages are installed for each database type you wish to query. For example, `pymssql` for MSSQL, `mysql-connector-python` for MySQL, `boto3` for AWS Athena, and so on.
711
+ - Error Handling: The function raises a `ValueError` if the specified preset name does not exist or if the database type is unsupported. Additional exceptions may arise from network issues or database errors.
712
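+ - Environment: For AWS Athena, the preset must provide `region`, `database`, `output_bucket`, `aws_access_key`, and `aws_secret_key`; these values are passed to the boto3 Athena client, so ensure they are valid for boto3 to authenticate successfully. Consider using AWS IAM roles or AWS Secrets Manager for better security management.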
715
713
 
716
714
  --------------------------------------------------------------------------------
717
715
 
@@ -1,12 +1,12 @@
1
1
  rgwfuncs/__init__.py,sha256=CLPRpLtzXxyFHEjS-MrxnhXH0LdS6THjAC5sCHg0m3c,1520
2
2
  rgwfuncs/algebra_lib.py,sha256=g-sNkf9Hz4i17uRIgLUYLQlyUu8yROgsoJMujdj0U3Y,21577
3
- rgwfuncs/df_lib.py,sha256=G_H3PXNVeseX2YLjkkrmO9eXA_7r29swUZlbPBDZjXA,66612
3
+ rgwfuncs/df_lib.py,sha256=VYs_2avxVODnxBAuoZ9eQ8dDY1rcNMGmbgwcRkDv_VA,68966
4
4
  rgwfuncs/docs_lib.py,sha256=y3wSAOPO3qsA4HZ7xAtW8HimM8w-c8hjcEzMRLJ96ao,1960
5
5
  rgwfuncs/interactive_shell_lib.py,sha256=A7EWsYxAfDev_N0-2GjRvAtp0bAwBPHIczXb8Gu9fzI,1107
6
6
  rgwfuncs/str_lib.py,sha256=rtAdRlnSJIu3JhI-tA_A0wCiPK2m-zn5RoGpBxv_g-4,2228
7
- rgwfuncs-0.0.34.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
8
- rgwfuncs-0.0.34.dist-info/METADATA,sha256=JuG4wIxpjCaw7ST1u2lQHD9fBAUC9lC68X1E_4OztuM,48334
9
- rgwfuncs-0.0.34.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
10
- rgwfuncs-0.0.34.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
11
- rgwfuncs-0.0.34.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
12
- rgwfuncs-0.0.34.dist-info/RECORD,,
7
+ rgwfuncs-0.0.36.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
8
+ rgwfuncs-0.0.36.dist-info/METADATA,sha256=jU298sn67VvVuv85s4crL4B_833DaxkuHMA0c7Cyf88,49550
9
+ rgwfuncs-0.0.36.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
10
+ rgwfuncs-0.0.36.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
11
+ rgwfuncs-0.0.36.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
12
+ rgwfuncs-0.0.36.dist-info/RECORD,,