rgwfuncs 0.0.34__py3-none-any.whl → 0.0.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rgwfuncs/df_lib.py +53 -2
- {rgwfuncs-0.0.34.dist-info → rgwfuncs-0.0.36.dist-info}/METADATA +41 -43
- {rgwfuncs-0.0.34.dist-info → rgwfuncs-0.0.36.dist-info}/RECORD +7 -7
- {rgwfuncs-0.0.34.dist-info → rgwfuncs-0.0.36.dist-info}/LICENSE +0 -0
- {rgwfuncs-0.0.34.dist-info → rgwfuncs-0.0.36.dist-info}/WHEEL +0 -0
- {rgwfuncs-0.0.34.dist-info → rgwfuncs-0.0.36.dist-info}/entry_points.txt +0 -0
- {rgwfuncs-0.0.34.dist-info → rgwfuncs-0.0.36.dist-info}/top_level.txt +0 -0
rgwfuncs/df_lib.py
CHANGED
@@ -384,8 +384,7 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
384
384
|
raise ConnectionError(
|
385
385
|
"All attempts to connect to ClickHouse failed.")
|
386
386
|
|
387
|
-
def query_google_big_query(
|
388
|
-
db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
387
|
+
def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
389
388
|
json_file_path = db_preset['json_file_path']
|
390
389
|
project_id = db_preset['project_id']
|
391
390
|
|
@@ -400,6 +399,56 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
400
399
|
|
401
400
|
return pd.DataFrame(rows, columns=columns)
|
402
401
|
|
402
|
+
|
403
|
+
def query_athena(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
    """Run a SQL query on AWS Athena and return its result set as a DataFrame.

    Parameters:
        db_preset: Preset dictionary. The keys 'region', 'database',
            'output_bucket', 'aws_access_key' and 'aws_secret_key' are read.
        query: Query string, submitted to Athena unchanged.

    Returns:
        A DataFrame with one column per Athena result column. Cell values
        are whatever Athena reports under ``VarCharValue`` (``None`` when
        that key is absent); no type conversion is performed here.

    Raises:
        KeyError: If a required key is missing from ``db_preset``.
        RuntimeError: If the query ends in the FAILED or CANCELLED state.
            The message includes Athena's ``StateChangeReason`` when present.
    """

    def execute_athena_query(athena_client, query: str, database: str, output_bucket: str) -> str:
        """Submit the query and return the QueryExecutionId Athena assigns to it."""
        response = athena_client.start_query_execution(
            QueryString=query,
            QueryExecutionContext={"Database": database},
            ResultConfiguration={"OutputLocation": output_bucket},
        )
        return response["QueryExecutionId"]

    def wait_for_athena_query_to_complete(athena_client, query_execution_id: str) -> None:
        """Poll once per second until the query succeeds; raise if it fails."""
        while True:
            response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
            status = response["QueryExecution"]["Status"]
            state = status["State"]
            if state == "SUCCEEDED":
                return
            if state in ("FAILED", "CANCELLED"):
                # Surface Athena's own explanation; without it the caller
                # only learns that the query failed, not why.
                message = f"Query failed with state: {state}"
                reason = status.get("StateChangeReason")
                if reason:
                    message = f"{message} ({reason})"
                raise RuntimeError(message)
            # Any other state is treated as "still in progress".
            time.sleep(1)

    def download_athena_query_results(athena_client, query_execution_id: str) -> pd.DataFrame:
        """Collect every result page of the query into a single DataFrame."""
        paginator = athena_client.get_paginator("get_query_results")
        result_pages = paginator.paginate(QueryExecutionId=query_execution_id)
        rows = []
        columns = []
        for page in result_pages:
            result_set = page["ResultSet"]
            if not columns:
                # Column names are taken from the first page that provides them.
                columns = [col["Name"] for col in result_set["ResultSetMetadata"]["ColumnInfo"]]
            rows.extend(result_set["Rows"])

        # NOTE(review): the first row is dropped, presumably because Athena
        # echoes the column headers there - confirm for non-SELECT statements.
        data = [[cell.get("VarCharValue") for cell in row["Data"]] for row in rows[1:]]
        return pd.DataFrame(data, columns=columns)

    aws_region = db_preset['region']
    database = db_preset['database']
    output_bucket = db_preset['output_bucket']

    athena_client = boto3.client(
        'athena',
        region_name=aws_region,
        aws_access_key_id=db_preset['aws_access_key'],
        aws_secret_access_key=db_preset['aws_secret_key'],
    )

    query_execution_id = execute_athena_query(athena_client, query, database, output_bucket)
    wait_for_athena_query_to_complete(athena_client, query_execution_id)
    return download_athena_query_results(athena_client, query_execution_id)
|
451
|
+
|
403
452
|
# Assume the configuration file is located at ~/.rgwfuncsrc
|
404
453
|
config_path = os.path.expanduser('~/.rgwfuncsrc')
|
405
454
|
with open(config_path, 'r') as f:
|
@@ -422,6 +471,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
422
471
|
return query_clickhouse(db_preset, query)
|
423
472
|
elif db_type == 'google_big_query':
|
424
473
|
return query_google_big_query(db_preset, query)
|
474
|
+
elif db_type == 'athena':
|
475
|
+
return query_athena(db_preset, query)
|
425
476
|
else:
|
426
477
|
raise ValueError(f"Unsupported db_type: {db_type}")
|
427
478
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: rgwfuncs
|
3
|
-
Version: 0.0.34
|
3
|
+
Version: 0.0.36
|
4
4
|
Summary: A functional programming paradigm for mathematical modelling and data science
|
5
5
|
Home-page: https://github.com/ryangerardwilson/rgwfunc
|
6
6
|
Author: Ryan Gerard Wilson
|
@@ -135,7 +135,7 @@ To display all docstrings, use:
|
|
135
135
|
|
136
136
|
--------------------------------------------------------------------------------
|
137
137
|
|
138
|
-
## Documentation Access
|
138
|
+
## Documentation Access
|
139
139
|
|
140
140
|
### 1. docs
|
141
141
|
Print a list of available function names in alphabetical order. If a filter is provided, print the docstrings of functions containing the term.
|
@@ -150,15 +150,13 @@ Print a list of available function names in alphabetical order. If a filter is p
|
|
150
150
|
|
151
151
|
--------------------------------------------------------------------------------
|
152
152
|
|
153
|
-
## Interactive Shell
|
153
|
+
## Interactive Shell
|
154
154
|
|
155
155
|
This section includes functions that facilitate launching an interactive Python shell to inspect and modify local variables within the user's environment.
|
156
156
|
|
157
157
|
### 1. `interactive_shell`
|
158
158
|
|
159
|
-
|
160
|
-
|
161
|
-
Launches an interactive prompt for inspecting and modifying local variables, making all methods in the rgwfuncs library available by default. This REPL (Read-Eval-Print Loop) environment supports command history and autocompletion, making it easier to interact with your Python code.
|
159
|
+
Launches an interactive prompt for inspecting and modifying local variables, making all methods in the rgwfuncs library available by default. This REPL (Read-Eval-Print Loop) environment supports command history and autocompletion, making it easier to interact with your Python code. This function is particularly useful for debugging purposes when you want real-time interaction with your program's execution environment.
|
162
160
|
|
163
161
|
• Parameters:
|
164
162
|
- `local_vars` (dict, optional): A dictionary of local variables to be accessible within the interactive shell. If not provided, defaults to an empty dictionary.
|
@@ -180,18 +178,10 @@ Launches an interactive prompt for inspecting and modifying local variables, mak
|
|
180
178
|
'city': ['New York', 'Los Angeles', 'Chicago', 'San Francisco', 'Boston']
|
181
179
|
})
|
182
180
|
|
183
|
-
# Function to retrieve the first n rows of a DataFrame
|
184
|
-
def first_n_rows(df, n):
|
185
|
-
return df.head(n).to_dict('records')
|
186
|
-
|
187
181
|
# Launch the interactive shell with local variables
|
188
182
|
interactive_shell(locals())
|
189
183
|
|
190
|
-
|
191
|
-
|
192
|
-
#### Interactive Shell Example Sessions
|
193
|
-
|
194
|
-
1. DataFrame Inspection with Pandas:
|
184
|
+
Subsequently, in the interactive shell you can use any library imported in your Python file, as well as all rgwfuncs methods (even if they are not imported). Notice that while pandas and numpy are available in the shell because the above script imports them, the rgwfuncs method `first_n_rows` was not imported, yet it is still available for use.
|
195
185
|
|
196
186
|
Welcome to the rgwfuncs interactive shell.
|
197
187
|
>>> pirst_n_rows(df, 2)
|
@@ -208,23 +198,10 @@ Launches an interactive prompt for inspecting and modifying local variables, mak
|
|
208
198
|
2 3 Charlie 35 Chicago
|
209
199
|
3 4 David 28 San Francisco
|
210
200
|
4 5 Eva 22 Boston
|
211
|
-
|
212
|
-
2. Array Operations with NumPy:
|
213
|
-
|
214
|
-
Welcome to the rgwfuncs interactive shell.
|
215
201
|
>>> arr = np.array([1, 2, 3, 4, 5])
|
216
202
|
>>> arr
|
217
203
|
array([1, 2, 3, 4, 5])
|
218
204
|
|
219
|
-
These examples illustrate how you can use functions and variables within the interactive shell, handle errors with meaningful suggestions, and perform operations using external libraries like pandas and numpy.
|
220
|
-
|
221
|
-
#### Key Features
|
222
|
-
|
223
|
-
- Autocompletion: Uses the `rlcompleter` library to provide tab-completion of variables and functions names, enhancing ease of use.
|
224
|
-
- History Support: Utilizes `readline` for command-line history, allowing you to navigate through previous commands using the arrow keys.
|
225
|
-
|
226
|
-
This function is particularly useful for debugging purposes when you want real-time interaction with your program's execution environment.
|
227
|
-
|
228
205
|
--------------------------------------------------------------------------------
|
229
206
|
|
230
207
|
## Algebra Based Functions
|
@@ -690,28 +667,49 @@ Drop duplicate rows based on specified columns, retaining the last occurrence.
|
|
690
667
|
|
691
668
|
### 11. `load_data_from_query`
|
692
669
|
|
693
|
-
Load data from a database query
|
670
|
+
Load data from a specified database using a SQL query and return the results in a Pandas DataFrame. The database connection configurations are determined by a preset name specified in a configuration file.
|
694
671
|
|
695
|
-
|
696
|
-
- `db_preset_name` (str): Name of the database preset in the configuration file.
|
697
|
-
- `query` (str): The SQL query to execute.
|
672
|
+
#### Features
|
698
673
|
|
699
|
-
-
|
700
|
-
|
674
|
+
- Multi-Database Support: This function supports different database types, including MSSQL, MySQL, ClickHouse, Google BigQuery, and AWS Athena, based on the configuration preset selected.
|
675
|
+
- Configuration-Based: It utilizes a configuration file to store database connection details securely, avoiding hardcoding sensitive information directly into the script.
|
676
|
+
- Dynamic Query Execution: Capable of executing custom user-defined SQL queries against the specified database.
|
677
|
+
- Automatic Result Loading: Fetches query results and loads them directly into a Pandas DataFrame for further manipulation and analysis.
|
701
678
|
|
702
|
-
|
703
|
-
- The configuration file is assumed to be located at `~/.rgwfuncsrc`.
|
679
|
+
#### Parameters
|
704
680
|
|
705
|
-
-
|
681
|
+
- `db_preset_name` (str): The name of the database preset found in the configuration file. This preset determines which database connection details to use.
|
682
|
+
- `query` (str): The SQL query string to be executed on the database.
|
683
|
+
|
684
|
+
#### Returns
|
685
|
+
|
686
|
+
- `pd.DataFrame`: Returns a DataFrame that contains the results from the executed SQL query.
|
687
|
+
|
688
|
+
#### Configuration Details
|
706
689
|
|
707
|
-
|
690
|
+
- The configuration file is expected to be in JSON format and located at `~/.rgwfuncsrc`.
|
691
|
+
- Each preset within the configuration file must include:
|
692
|
+
- `name`: Name of the database preset.
|
693
|
+
- `db_type`: Type of the database (`mssql`, `mysql`, `clickhouse`, `google_big_query`, `athena`).
|
694
|
+
- `credentials`: Necessary credentials such as host, username, password, and potentially others depending on the database type.
|
695
|
+
|
696
|
+
#### Example
|
697
|
+
|
698
|
+
from rgwfuncs import load_data_from_query
|
699
|
+
|
700
|
+
# Load data using a preset configuration
|
701
|
+
df = load_data_from_query(
|
702
|
+
db_preset_name="MyDBPreset",
|
703
|
+
query="SELECT * FROM my_table"
|
704
|
+
)
|
705
|
+
print(df)
|
708
706
|
|
709
|
-
|
710
|
-
db_preset_name="MyDBPreset",
|
711
|
-
query="SELECT * FROM my_table"
|
712
|
-
)
|
713
|
-
print(df)
|
707
|
+
#### Notes
|
714
708
|
|
709
|
+
- Security: Ensure that the configuration file (`~/.rgwfuncsrc`) is secure and accessible only to authorized users, as it contains sensitive information.
|
710
|
+
- Prerequisites: Ensure the necessary Python packages are installed for each database type you wish to query. For example, `pymssql` for MSSQL, `mysql-connector-python` for MySQL, `boto3` for AWS Athena, and so on.
|
711
|
+
- Error Handling: The function raises a `ValueError` if the specified preset name does not exist or if the database type is unsupported. Additional exceptions may arise from network issues or database errors.
|
712
|
+
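- Environment: For AWS Athena, the preset must provide `region`, `database`, `output_bucket`, `aws_access_key`, and `aws_secret_key`; these are passed to the boto3 Athena client, so ensure they are valid for boto3 to authenticate successfully. Consider using AWS IAM roles or AWS Secrets Manager for better security management.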
|
715
713
|
|
716
714
|
--------------------------------------------------------------------------------
|
717
715
|
|
@@ -1,12 +1,12 @@
|
|
1
1
|
rgwfuncs/__init__.py,sha256=CLPRpLtzXxyFHEjS-MrxnhXH0LdS6THjAC5sCHg0m3c,1520
|
2
2
|
rgwfuncs/algebra_lib.py,sha256=g-sNkf9Hz4i17uRIgLUYLQlyUu8yROgsoJMujdj0U3Y,21577
|
3
|
-
rgwfuncs/df_lib.py,sha256=
|
3
|
+
rgwfuncs/df_lib.py,sha256=VYs_2avxVODnxBAuoZ9eQ8dDY1rcNMGmbgwcRkDv_VA,68966
|
4
4
|
rgwfuncs/docs_lib.py,sha256=y3wSAOPO3qsA4HZ7xAtW8HimM8w-c8hjcEzMRLJ96ao,1960
|
5
5
|
rgwfuncs/interactive_shell_lib.py,sha256=A7EWsYxAfDev_N0-2GjRvAtp0bAwBPHIczXb8Gu9fzI,1107
|
6
6
|
rgwfuncs/str_lib.py,sha256=rtAdRlnSJIu3JhI-tA_A0wCiPK2m-zn5RoGpBxv_g-4,2228
|
7
|
-
rgwfuncs-0.0.
|
8
|
-
rgwfuncs-0.0.
|
9
|
-
rgwfuncs-0.0.
|
10
|
-
rgwfuncs-0.0.
|
11
|
-
rgwfuncs-0.0.
|
12
|
-
rgwfuncs-0.0.
|
7
|
+
rgwfuncs-0.0.36.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
|
8
|
+
rgwfuncs-0.0.36.dist-info/METADATA,sha256=jU298sn67VvVuv85s4crL4B_833DaxkuHMA0c7Cyf88,49550
|
9
|
+
rgwfuncs-0.0.36.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
10
|
+
rgwfuncs-0.0.36.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
|
11
|
+
rgwfuncs-0.0.36.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
|
12
|
+
rgwfuncs-0.0.36.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|