rgwfuncs 0.0.35__py3-none-any.whl → 0.0.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rgwfuncs/df_lib.py +53 -2
- {rgwfuncs-0.0.35.dist-info → rgwfuncs-0.0.38.dist-info}/METADATA +46 -16
- {rgwfuncs-0.0.35.dist-info → rgwfuncs-0.0.38.dist-info}/RECORD +7 -7
- {rgwfuncs-0.0.35.dist-info → rgwfuncs-0.0.38.dist-info}/LICENSE +0 -0
- {rgwfuncs-0.0.35.dist-info → rgwfuncs-0.0.38.dist-info}/WHEEL +0 -0
- {rgwfuncs-0.0.35.dist-info → rgwfuncs-0.0.38.dist-info}/entry_points.txt +0 -0
- {rgwfuncs-0.0.35.dist-info → rgwfuncs-0.0.38.dist-info}/top_level.txt +0 -0
rgwfuncs/df_lib.py
CHANGED
@@ -384,8 +384,7 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
384
384
|
raise ConnectionError(
|
385
385
|
"All attempts to connect to ClickHouse failed.")
|
386
386
|
|
387
|
-
def query_google_big_query(
|
388
|
-
db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
387
|
+
def query_google_big_query(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
389
388
|
json_file_path = db_preset['json_file_path']
|
390
389
|
project_id = db_preset['project_id']
|
391
390
|
|
@@ -400,6 +399,56 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
400
399
|
|
401
400
|
return pd.DataFrame(rows, columns=columns)
|
402
401
|
|
402
|
+
|
403
|
+
def query_athena(db_preset: Dict[str, Any], query: str) -> pd.DataFrame:
|
404
|
+
|
405
|
+
def execute_athena_query(athena_client, query: str, database: str, output_bucket: str) -> str:
|
406
|
+
response = athena_client.start_query_execution(
|
407
|
+
QueryString=query,
|
408
|
+
QueryExecutionContext={"Database": database},
|
409
|
+
ResultConfiguration={"OutputLocation": output_bucket}
|
410
|
+
)
|
411
|
+
return response["QueryExecutionId"]
|
412
|
+
|
413
|
+
def wait_for_athena_query_to_complete(athena_client, query_execution_id: str):
|
414
|
+
while True:
|
415
|
+
response = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
|
416
|
+
state = response["QueryExecution"]["Status"]["State"]
|
417
|
+
if state == "SUCCEEDED":
|
418
|
+
break
|
419
|
+
elif state in ("FAILED", "CANCELLED"):
|
420
|
+
raise Exception(f"Query failed with state: {state}")
|
421
|
+
time.sleep(1)
|
422
|
+
|
423
|
+
def download_athena_query_results(athena_client, query_execution_id: str) -> pd.DataFrame:
|
424
|
+
paginator = athena_client.get_paginator("get_query_results")
|
425
|
+
result_pages = paginator.paginate(QueryExecutionId=query_execution_id)
|
426
|
+
rows = []
|
427
|
+
columns = []
|
428
|
+
for page in result_pages:
|
429
|
+
if not columns:
|
430
|
+
columns = [col["Name"] for col in page["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]
|
431
|
+
rows.extend(page["ResultSet"]["Rows"])
|
432
|
+
|
433
|
+
data = [[col.get("VarCharValue", None) for col in row["Data"]] for row in rows[1:]]
|
434
|
+
return pd.DataFrame(data, columns=columns)
|
435
|
+
|
436
|
+
|
437
|
+
aws_region = db_preset['aws_region']
|
438
|
+
database = db_preset['database']
|
439
|
+
output_bucket = db_preset['output_bucket']
|
440
|
+
|
441
|
+
athena_client = boto3.client(
|
442
|
+
'athena',
|
443
|
+
region_name=aws_region,
|
444
|
+
aws_access_key_id=db_preset['aws_access_key'],
|
445
|
+
aws_secret_access_key=db_preset['aws_secret_key']
|
446
|
+
)
|
447
|
+
|
448
|
+
query_execution_id = execute_athena_query(athena_client, query, database, output_bucket)
|
449
|
+
wait_for_athena_query_to_complete(athena_client, query_execution_id)
|
450
|
+
return download_athena_query_results(athena_client, query_execution_id)
|
451
|
+
|
403
452
|
# Assume the configuration file is located at ~/.rgwfuncsrc
|
404
453
|
config_path = os.path.expanduser('~/.rgwfuncsrc')
|
405
454
|
with open(config_path, 'r') as f:
|
@@ -422,6 +471,8 @@ def load_data_from_query(db_preset_name: str, query: str) -> pd.DataFrame:
|
|
422
471
|
return query_clickhouse(db_preset, query)
|
423
472
|
elif db_type == 'google_big_query':
|
424
473
|
return query_google_big_query(db_preset, query)
|
474
|
+
elif db_type == 'aws_athena':
|
475
|
+
return query_athena(db_preset, query)
|
425
476
|
else:
|
426
477
|
raise ValueError(f"Unsupported db_type: {db_type}")
|
427
478
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: rgwfuncs
|
3
|
-
Version: 0.0.35
|
3
|
+
Version: 0.0.38
|
4
4
|
Summary: A functional programming paradigm for mathematical modelling and data science
|
5
5
|
Home-page: https://github.com/ryangerardwilson/rgwfunc
|
6
6
|
Author: Ryan Gerard Wilson
|
@@ -75,6 +75,15 @@ A `.rgwfuncsrc` file (located at `vi ~/.rgwfuncsrc) is required for MSSQL, CLICK
|
|
75
75
|
"db_type": "google_big_query",
|
76
76
|
"json_file_path": "",
|
77
77
|
"project_id": ""
|
78
|
+
},
|
79
|
+
{
|
80
|
+
"name": "athena_db1",
|
81
|
+
"db_type": "aws_athena",
|
82
|
+
"aws_access_key": "",
|
83
|
+
"aws_secret_key": "",
|
84
|
+
"aws_region": "",
|
85
|
+
"database": "logs",
|
86
|
+
"output_bucket": "s3://bucket-name"
|
78
87
|
}
|
79
88
|
],
|
80
89
|
"vm_presets": [
|
@@ -667,28 +676,49 @@ Drop duplicate rows based on specified columns, retaining the last occurrence.
|
|
667
676
|
|
668
677
|
### 11. `load_data_from_query`
|
669
678
|
|
670
|
-
Load data from a database query
|
679
|
+
Load data from a specified database using a SQL query and return the results in a Pandas DataFrame. The database connection configurations are determined by a preset name specified in a configuration file.
|
671
680
|
|
672
|
-
|
673
|
-
- `db_preset_name` (str): Name of the database preset in the configuration file.
|
674
|
-
- `query` (str): The SQL query to execute.
|
681
|
+
#### Features
|
675
682
|
|
676
|
-
-
|
677
|
-
|
683
|
+
- Multi-Database Support: This function supports different database types, including MSSQL, MySQL, ClickHouse, Google BigQuery, and AWS Athena, based on the configuration preset selected.
|
684
|
+
- Configuration-Based: It utilizes a configuration file to store database connection details securely, avoiding hardcoding sensitive information directly into the script.
|
685
|
+
- Dynamic Query Execution: Capable of executing custom user-defined SQL queries against the specified database.
|
686
|
+
- Automatic Result Loading: Fetches query results and loads them directly into a Pandas DataFrame for further manipulation and analysis.
|
678
687
|
|
679
|
-
|
680
|
-
- The configuration file is assumed to be located at `~/.rgwfuncsrc`.
|
688
|
+
#### Parameters
|
681
689
|
|
682
|
-
-
|
690
|
+
- `db_preset_name` (str): The name of the database preset found in the configuration file. This preset determines which database connection details to use.
|
691
|
+
- `query` (str): The SQL query string to be executed on the database.
|
692
|
+
|
693
|
+
#### Returns
|
683
694
|
|
684
|
-
|
695
|
+
- `pd.DataFrame`: Returns a DataFrame that contains the results from the executed SQL query.
|
696
|
+
|
697
|
+
#### Configuration Details
|
698
|
+
|
699
|
+
- The configuration file is expected to be in JSON format and located at `~/.rgwfuncsrc`.
|
700
|
+
- Each preset within the configuration file must include:
|
701
|
+
- `name`: Name of the database preset.
|
702
|
+
- `db_type`: Type of the database (`mssql`, `mysql`, `clickhouse`, `google_big_query`, `aws_athena`).
|
703
|
+
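- Connection fields: the credentials and settings required by the chosen database type, stored directly on the preset — for example `json_file_path` and `project_id` for `google_big_query`, or `aws_access_key`, `aws_secret_key`, `aws_region`, `database`, and `output_bucket` for `aws_athena`.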
|
704
|
+
|
705
|
+
#### Example
|
706
|
+
|
707
|
+
from rgwfuncs import load_data_from_query
|
708
|
+
|
709
|
+
# Load data using a preset configuration
|
710
|
+
df = load_data_from_query(
|
711
|
+
db_preset_name="MyDBPreset",
|
712
|
+
query="SELECT * FROM my_table"
|
713
|
+
)
|
714
|
+
print(df)
|
685
715
|
|
686
|
-
|
687
|
-
db_preset_name="MyDBPreset",
|
688
|
-
query="SELECT * FROM my_table"
|
689
|
-
)
|
690
|
-
print(df)
|
716
|
+
#### Notes
|
691
717
|
|
718
|
+
- Security: Ensure that the configuration file (`~/.rgwfuncsrc`) is secure and accessible only to authorized users, as it contains sensitive information.
|
719
|
+
- Pre-requisites: Ensure the necessary Python packages are installed for each database type you wish to query. For example, `pymssql` for MSSQL, `mysql-connector-python` for MySQL, and so on.
|
720
|
+
- Error Handling: The function raises a `ValueError` if the specified preset name does not exist or if the database type is unsupported. Additional exceptions may arise from network issues or database errors.
|
721
|
+
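- Environment: For AWS Athena, the `boto3` library authenticates with the `aws_access_key`, `aws_secret_key`, and `aws_region` values taken from the preset, and query results are written to the preset's `output_bucket` (an S3 location such as `s3://bucket-name`), so ensure these values are valid. Consider using AWS IAM roles or AWS Secrets Manager for better security management.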
|
692
722
|
|
693
723
|
--------------------------------------------------------------------------------
|
694
724
|
|
@@ -1,12 +1,12 @@
|
|
1
1
|
rgwfuncs/__init__.py,sha256=CLPRpLtzXxyFHEjS-MrxnhXH0LdS6THjAC5sCHg0m3c,1520
|
2
2
|
rgwfuncs/algebra_lib.py,sha256=g-sNkf9Hz4i17uRIgLUYLQlyUu8yROgsoJMujdj0U3Y,21577
|
3
|
-
rgwfuncs/df_lib.py,sha256=
|
3
|
+
rgwfuncs/df_lib.py,sha256=0bjredJvn9Al8y2S_qXPKkh3z8adi9v6IBvOUl5Jidw,68974
|
4
4
|
rgwfuncs/docs_lib.py,sha256=y3wSAOPO3qsA4HZ7xAtW8HimM8w-c8hjcEzMRLJ96ao,1960
|
5
5
|
rgwfuncs/interactive_shell_lib.py,sha256=A7EWsYxAfDev_N0-2GjRvAtp0bAwBPHIczXb8Gu9fzI,1107
|
6
6
|
rgwfuncs/str_lib.py,sha256=rtAdRlnSJIu3JhI-tA_A0wCiPK2m-zn5RoGpBxv_g-4,2228
|
7
|
-
rgwfuncs-0.0.35.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
|
8
|
-
rgwfuncs-0.0.
|
9
|
-
rgwfuncs-0.0.35.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
10
|
-
rgwfuncs-0.0.35.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
|
11
|
-
rgwfuncs-0.0.35.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
|
12
|
-
rgwfuncs-0.0.35.dist-info/RECORD,,
|
7
|
+
rgwfuncs-0.0.38.dist-info/LICENSE,sha256=7EI8xVBu6h_7_JlVw-yPhhOZlpY9hP8wal7kHtqKT_E,1074
|
8
|
+
rgwfuncs-0.0.38.dist-info/METADATA,sha256=A5dRwHKtHe9mv3rptslwDNBZxuE8JF7mGLpcWrF21bE,49805
|
9
|
+
rgwfuncs-0.0.38.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
10
|
+
rgwfuncs-0.0.38.dist-info/entry_points.txt,sha256=j-c5IOPIQ0252EaOV6j6STio56sbXl2C4ym_fQ0lXx0,43
|
11
|
+
rgwfuncs-0.0.38.dist-info/top_level.txt,sha256=aGuVIzWsKiV1f2gCb6mynx0zx5ma0B1EwPGFKVEMTi4,9
|
12
|
+
rgwfuncs-0.0.38.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|