sagemaker-studio-dataengineering-sessions 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sagemaker_studio_dataengineering_sessions-1.0.2/.gitignore +18 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/PKG-INFO +114 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/README.md +98 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/pyproject.toml +136 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/__init__.py +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/py.typed +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/__init__.py +7 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/base_session_manager.py +162 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/boto3_models/datazone/2018-05-10/service-2.json +31733 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/boto3_models/glue/2017-03-31/service-2.json +18867 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/__init__.py +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/constants.py +137 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/datazone_gateway.py +95 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/debugging_utils.py +24 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/exceptions.py +53 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/glue_gateway.py +52 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/ipython_display.py +31 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/logger_utils.py +28 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/metadata_utils.py +13 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/sagemaker_connection_display.py +44 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_base_session_manager/common/sagemaker_toolkit_utils.py +150 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/__init__.py +12 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/dataframe_wrapper.py +30 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/sagemaker_connection_magic.py +1208 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/sagemaker_display_magic/__init__.py +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/sagemaker_display_magic/display_compute.py +476 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/sagemaker_display_magic/display_function.py +92 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/sagemaker_display_magic/display_render.py +912 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/sagemaker_display_magic/run_statement.py +45 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/sagemaker_display_magic/style_constants.py +371 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/sagemaker_display_magic/utils.py +15 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/utils/aws_profile_helper.py +77 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/utils/cell_transformer.py +155 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/utils/constants.py +39 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_connection_magic/utils/credential_process_script.py +35 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/__init__.py +7 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/athena/__init__.py +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/athena/athena_config.py +6 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/athena/athena_connection.py +6 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/athena/athena_session.py +30 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/athena/connection_transformer.py +27 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/db_connection_pool.py +12 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/redshift/__init__.py +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/redshift/connection_transformer.py +86 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/redshift/redshift_config.py +5 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/redshift/redshift_connection.py +16 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/redshift/redshift_session.py +72 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/sagemaker_database_session_manager.py +133 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_database_session_manager/utils/common_utils.py +13 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/__init__.py +7 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/__init__.py +1 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_ec2/__init__.py +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_ec2/connection_transformer.py +67 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_ec2/custom_authenticator.py +40 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_ec2/emr_on_ec2_connection.py +19 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_ec2/emr_on_ec2_gateway.py +25 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_ec2/emr_on_ec2_session.py +190 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_ec2/governance_type.py +7 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_serverless/__init__.py +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_serverless/connection_tranformer.py +37 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_serverless/custom_authenticator.py +143 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_serverless/emr_on_serverless_connection.py +8 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_serverless/emr_on_serverless_session.py +158 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/emr_on_serverless/emr_serverless_gateway.py +36 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/livy_session.py +298 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/emr_session_manager/spark_magic.py +11 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/glue_session_manager/__init__.py +1 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/glue_session_manager/connection_transformer.py +114 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/glue_session_manager/glue_connection.py +19 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/glue_session_manager/glue_kernel_utils/GlueSessionsConstants.py +56 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/glue_session_manager/glue_kernel_utils/KernelGateway.py +342 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/glue_session_manager/glue_session.py +773 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/glue_session_manager/glue_session_configs/config.py +21 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/spark_commands/send_to_spark_command.py +76 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/spark_session_manager/__init__.py +0 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/spark_session_manager/spark_monitor_widget_utils.py +65 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/spark_session_manager/spark_session.py +124 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/utils/common_utils.py +18 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/utils/lib_utils.py +100 -0
- sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/sagemaker_spark_session_manager/utils/metadata_utils.py +19 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sagemaker-studio-dataengineering-sessions
|
|
3
|
+
Version: 1.0.2
|
|
4
|
+
Requires-Python: >=3.11
|
|
5
|
+
Requires-Dist: amazon-sagemaker-sql-execution<1,>=0.0.7
|
|
6
|
+
Requires-Dist: boto3
|
|
7
|
+
Requires-Dist: botocore
|
|
8
|
+
Requires-Dist: ipython
|
|
9
|
+
Requires-Dist: pandas
|
|
10
|
+
Requires-Dist: panel
|
|
11
|
+
Requires-Dist: pytest
|
|
12
|
+
Requires-Dist: pytest-cov
|
|
13
|
+
Requires-Dist: sparkmagic
|
|
14
|
+
Requires-Dist: sqlparse>=0.4.0
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# SageMakerStudioDataEngineeringSessions
|
|
18
|
+
|
|
19
|
+
SageMaker Unified Studio Data Engineering Sessions
|
|
20
|
+
|
|
21
|
+
This package depends on the SageMaker Unified Studio environment. If you are using SageMaker Unified Studio, see the [AWS Doc](https://docs.aws.amazon.com/sagemaker-unified-studio/latest/userguide/what-is-sagemaker-unified-studio.html) for guidance.
|
|
22
|
+
|
|
23
|
+
This package contains functionality to support SageMaker Unified Studio connecting to various AWS Compute including EMR/EMR Serverless/Glue/Redshift etc.
|
|
24
|
+
|
|
25
|
+
It is utilizing [ipython magics](https://ipython.readthedocs.io/en/stable/interactive/magics.html) and [AWS DataZone Connections](https://docs.aws.amazon.com/datazone/latest/APIReference/API_ListConnections.html) to achieve the following features.
|
|
26
|
+
|
|
27
|
+
## Features
|
|
28
|
+
|
|
29
|
+
- Connect to remote compute
|
|
30
|
+
- Execute Spark code in remote compute in Python/Scala
|
|
31
|
+
- Execute SQL queries in remote compute
|
|
32
|
+
- Send local variables to remote compute
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
## How to setup
|
|
36
|
+
|
|
37
|
+
If you are using SageMaker Unified Studio, you can skip this part; SageMaker Unified Studio has already set up the package.
|
|
38
|
+
|
|
39
|
+
This package contains various Jupyter Magics to achieve its functionality.
|
|
40
|
+
|
|
41
|
+
To load these magics, make sure you have iPython config file generated. If not, you could run `ipython profile create`, then a file with path `~/.ipython/profile_default/ipython_config.py` should be generated
|
|
42
|
+
|
|
43
|
+
Then you will need to add the following line in the end of that config file
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
c.InteractiveShellApp.extensions.extend(['sagemaker_studio_dataengineering_sessions.sagemaker_connection_magic'])
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Once that is finished, you could restart the ipython kernel and run `%help` to see a list of supported magics
|
|
50
|
+
|
|
51
|
+
## Examples
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
To connect to remote compute, a DataZone Connection is required, you could create it via [CreateConnection API](https://docs.aws.amazon.com/datazone/latest/APIReference/API_CreateConnection.html), Let's say there's an existing connection called project.spark.
|
|
55
|
+
|
|
56
|
+
### Supported Connection Type:
|
|
57
|
+
|
|
58
|
+
- IAM
|
|
59
|
+
- SPARK
|
|
60
|
+
- REDSHIFT
|
|
61
|
+
- ATHENA
|
|
62
|
+
|
|
63
|
+
### Connect to remote compute and Execute Spark Code in Python
|
|
64
|
+
The following example will connect to AWS Glue Interactive session and run the spark code in Glue.
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
%%pyspark project.spark
|
|
68
|
+
|
|
69
|
+
import sys
|
|
70
|
+
import boto3
|
|
71
|
+
from awsglue.utils import getResolvedOptions
|
|
72
|
+
from pyspark.context import SparkContext
|
|
73
|
+
from pyspark.sql import SparkSession
|
|
74
|
+
from pyspark.sql.functions import col
|
|
75
|
+
|
|
76
|
+
args = getResolvedOptions(sys.argv, ["redshift_url", "redshift_iam_role", "redshift_tempdir","redshift_jdbc_iam_url"])
|
|
77
|
+
print(f"{args}")
|
|
78
|
+
|
|
79
|
+
sc = SparkContext.getOrCreate()
|
|
80
|
+
spark = SparkSession(sc)
|
|
81
|
+
|
|
82
|
+
df = spark.read.csv(f"s3://sagemaker-example-files-prod-{boto3.session.Session().region_name}/datasets/tabular/dirty-titanic/", header=True)
|
|
83
|
+
df.show(5, truncate=False)
|
|
84
|
+
df.printSchema()
|
|
85
|
+
|
|
86
|
+
df.createOrReplaceTempView("df_sql_tempview")
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
### Execute Spark Code in Scala
|
|
90
|
+
The following example will connect to AWS Glue Interactive session and run the spark code in Scala.
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
%%scalaspark project.spark
|
|
94
|
+
val dfScala = spark.sql("SELECT count(0) FROM df_sql_tempview")
|
|
95
|
+
dfScala.show()
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Execute SQL query in remote compute
|
|
99
|
+
The following example will connect to Amazon Redshift and run a SQL query there.
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
%%sql project.redshift
|
|
103
|
+
select current_user()
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Some other helpful magics
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
%help - list available magics and related information
|
|
110
|
+
|
|
111
|
+
%send_to_remote - send local variable to remote compute
|
|
112
|
+
|
|
113
|
+
%%configure - configure spark application config in remote compute
|
|
114
|
+
```
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# SageMakerStudioDataEngineeringSessions
|
|
2
|
+
|
|
3
|
+
SageMaker Unified Studio Data Engineering Sessions
|
|
4
|
+
|
|
5
|
+
This package depends on the SageMaker Unified Studio environment. If you are using SageMaker Unified Studio, see the [AWS Doc](https://docs.aws.amazon.com/sagemaker-unified-studio/latest/userguide/what-is-sagemaker-unified-studio.html) for guidance.
|
|
6
|
+
|
|
7
|
+
This package contains functionality to support SageMaker Unified Studio connecting to various AWS Compute including EMR/EMR Serverless/Glue/Redshift etc.
|
|
8
|
+
|
|
9
|
+
It is utilizing [ipython magics](https://ipython.readthedocs.io/en/stable/interactive/magics.html) and [AWS DataZone Connections](https://docs.aws.amazon.com/datazone/latest/APIReference/API_ListConnections.html) to achieve the following features.
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
- Connect to remote compute
|
|
14
|
+
- Execute Spark code in remote compute in Python/Scala
|
|
15
|
+
- Execute SQL queries in remote compute
|
|
16
|
+
- Send local variables to remote compute
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
## How to setup
|
|
20
|
+
|
|
21
|
+
If you are using SageMaker Unified Studio, you can skip this part; SageMaker Unified Studio has already set up the package.
|
|
22
|
+
|
|
23
|
+
This package contains various Jupyter Magics to achieve its functionality.
|
|
24
|
+
|
|
25
|
+
To load these magics, make sure you have iPython config file generated. If not, you could run `ipython profile create`, then a file with path `~/.ipython/profile_default/ipython_config.py` should be generated
|
|
26
|
+
|
|
27
|
+
Then you will need to add the following line in the end of that config file
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
c.InteractiveShellApp.extensions.extend(['sagemaker_studio_dataengineering_sessions.sagemaker_connection_magic'])
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Once that is finished, you could restart the ipython kernel and run `%help` to see a list of supported magics
|
|
34
|
+
|
|
35
|
+
## Examples
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
To connect to remote compute, a DataZone Connection is required, you could create it via [CreateConnection API](https://docs.aws.amazon.com/datazone/latest/APIReference/API_CreateConnection.html), Let's say there's an existing connection called project.spark.
|
|
39
|
+
|
|
40
|
+
### Supported Connection Type:
|
|
41
|
+
|
|
42
|
+
- IAM
|
|
43
|
+
- SPARK
|
|
44
|
+
- REDSHIFT
|
|
45
|
+
- ATHENA
|
|
46
|
+
|
|
47
|
+
### Connect to remote compute and Execute Spark Code in Python
|
|
48
|
+
The following example will connect to AWS Glue Interactive session and run the spark code in Glue.
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
%%pyspark project.spark
|
|
52
|
+
|
|
53
|
+
import sys
|
|
54
|
+
import boto3
|
|
55
|
+
from awsglue.utils import getResolvedOptions
|
|
56
|
+
from pyspark.context import SparkContext
|
|
57
|
+
from pyspark.sql import SparkSession
|
|
58
|
+
from pyspark.sql.functions import col
|
|
59
|
+
|
|
60
|
+
args = getResolvedOptions(sys.argv, ["redshift_url", "redshift_iam_role", "redshift_tempdir","redshift_jdbc_iam_url"])
|
|
61
|
+
print(f"{args}")
|
|
62
|
+
|
|
63
|
+
sc = SparkContext.getOrCreate()
|
|
64
|
+
spark = SparkSession(sc)
|
|
65
|
+
|
|
66
|
+
df = spark.read.csv(f"s3://sagemaker-example-files-prod-{boto3.session.Session().region_name}/datasets/tabular/dirty-titanic/", header=True)
|
|
67
|
+
df.show(5, truncate=False)
|
|
68
|
+
df.printSchema()
|
|
69
|
+
|
|
70
|
+
df.createOrReplaceTempView("df_sql_tempview")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Execute Spark Code in Scala
|
|
74
|
+
The following example will connect to AWS Glue Interactive session and run the spark code in Scala.
|
|
75
|
+
|
|
76
|
+
```
|
|
77
|
+
%%scalaspark project.spark
|
|
78
|
+
val dfScala = spark.sql("SELECT count(0) FROM df_sql_tempview")
|
|
79
|
+
dfScala.show()
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Execute SQL query in remote compute
|
|
83
|
+
The following example will connect to Amazon Redshift and run a SQL query there.
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
%%sql project.redshift
|
|
87
|
+
select current_user()
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Some other helpful magics
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
%help - list available magics and related information
|
|
94
|
+
|
|
95
|
+
%send_to_remote - send local variable to remote compute
|
|
96
|
+
|
|
97
|
+
%%configure - configure spark application config in remote compute
|
|
98
|
+
```
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sagemaker-studio-dataengineering-sessions"
|
|
7
|
+
version = "1.0.2"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
dependencies = [
|
|
10
|
+
# Put your dependencies here!
|
|
11
|
+
"boto3",
|
|
12
|
+
"pytest",
|
|
13
|
+
"ipython",
|
|
14
|
+
# database session manager
|
|
15
|
+
"amazon_sagemaker_sql_execution>=0.0.7,<1",
|
|
16
|
+
"pandas",
|
|
17
|
+
"sqlparse>=0.4.0",
|
|
18
|
+
# spark session manager
|
|
19
|
+
"sparkmagic",
|
|
20
|
+
"botocore",
|
|
21
|
+
# connection magic
|
|
22
|
+
"panel",
|
|
23
|
+
"pytest",
|
|
24
|
+
"pytest-cov"
|
|
25
|
+
]
|
|
26
|
+
requires-python = ">=3.11"
|
|
27
|
+
|
|
28
|
+
[tool.hatch.envs.default]
|
|
29
|
+
# This controls what version of Python you want to be the default
|
|
30
|
+
# when running any scripts or tools to do things like debug test failures
|
|
31
|
+
# or do general development. Its lock file is ./requirements.txt
|
|
32
|
+
python = "3.11"
|
|
33
|
+
dependencies = [
|
|
34
|
+
"pytest",
|
|
35
|
+
# "mypy",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[tool.pytest.ini_options]
|
|
39
|
+
addopts = [
|
|
40
|
+
"--durations=5",
|
|
41
|
+
"--color=yes",
|
|
42
|
+
]
|
|
43
|
+
testpaths = [ "tests" ]
|
|
44
|
+
|
|
45
|
+
[tool.coverage.run]
|
|
46
|
+
source_pkgs = ["sagemaker_studio_dataengineering_sessions"]
|
|
47
|
+
branch = true
|
|
48
|
+
parallel = true
|
|
49
|
+
|
|
50
|
+
[tool.coverage.paths]
|
|
51
|
+
"sagemaker_studio_dataengineering_sessions" = ["src/sagemaker_studio_dataengineering_sessions", "**/site-packages/sagemaker_studio_dataengineering_sessions"]
|
|
52
|
+
|
|
53
|
+
[tool.coverage.report]
|
|
54
|
+
fail_under = 60
|
|
55
|
+
exclude_lines = [
|
|
56
|
+
"no cov",
|
|
57
|
+
"if __name__ == .__main__.:",
|
|
58
|
+
"if TYPE_CHECKING:",
|
|
59
|
+
]
|
|
60
|
+
show_missing = true
|
|
61
|
+
|
|
62
|
+
[tool.coverage.xml]
|
|
63
|
+
output = "private/brazil-documentation/coverage/coverage.xml"
|
|
64
|
+
|
|
65
|
+
[tool.coverage.html]
|
|
66
|
+
directory = "private/brazil-documentation/coverage/"
|
|
67
|
+
|
|
68
|
+
[tool.ruff.lint]
|
|
69
|
+
isort.known-first-party = ["sagemaker_studio_dataengineering_sessions"]
|
|
70
|
+
exclude = [ "./build", ".hatch", "private" ]
|
|
71
|
+
|
|
72
|
+
[tool.ruff.lint.per-file-ignores]
|
|
73
|
+
# Tests can use magic values, assertions, and relative imports
|
|
74
|
+
"tests/**/*" = ["PLR2004", "S101", "TID252"]
|
|
75
|
+
|
|
76
|
+
[tool.hatch.build]
|
|
77
|
+
directory = "./external-distribution"
|
|
78
|
+
packages = ["src/sagemaker_studio_dataengineering_sessions"]
|
|
79
|
+
|
|
80
|
+
[tool.hatch.build.targets.sdist]
|
|
81
|
+
ignore-vcs = true
|
|
82
|
+
|
|
83
|
+
[tool.hatch.build.targets.wheel]
|
|
84
|
+
ignore-vcs = true
|
|
85
|
+
|
|
86
|
+
[tool.hatch.build.hooks.custom]
|
|
87
|
+
path = "scripts/build_hooks.py"
|
|
88
|
+
|
|
89
|
+
[tool.hatch.env]
|
|
90
|
+
requires = [ "hatch-pip-compile" ]
|
|
91
|
+
|
|
92
|
+
[tool.hatch.envs.default.scripts]
|
|
93
|
+
# These are scripts you can run using `brazil-build run <script-name>`
|
|
94
|
+
#typing = [
|
|
95
|
+
# "mkdir -p .mypy_cache",
|
|
96
|
+
# "mypy --install-types --non-interactive src/sagemaker_studio_dataengineering_sessions tests"
|
|
97
|
+
#]
|
|
98
|
+
|
|
99
|
+
# This command is for updating all your lock files across all environments
|
|
100
|
+
update = [ "hatch-pip-compile --upgrade --all" ]
|
|
101
|
+
|
|
102
|
+
release = [
|
|
103
|
+
# "typing",
|
|
104
|
+
# "hatch test --all --cover",
|
|
105
|
+
"pytest tests --cov"
|
|
106
|
+
]
|
|
107
|
+
|
|
108
|
+
[[tool.hatch.envs.hatch-test.matrix]]
|
|
109
|
+
# This defines multiple variables you can generate combinations
|
|
110
|
+
# to test underneath different environments. A separate environment and
|
|
111
|
+
# lock file will be created for every combination located in `./requirements/`
|
|
112
|
+
python = ["3.11", "3.12"]
|
|
113
|
+
|
|
114
|
+
## This environment is used solely to generate a lock file on hatch,
|
|
115
|
+
# and hatch-pip-compile that can be automatically updated
|
|
116
|
+
[tool.hatch.envs.build-tools]
|
|
117
|
+
# This version states what version your build tools build with. To change it,
|
|
118
|
+
# you will need to:
|
|
119
|
+
# * Remove the `requirements/requirements-build-tools.txt` file
|
|
120
|
+
# * Run `brazil-build run update` to generate a new lock file for the environment
|
|
121
|
+
python = "3.11"
|
|
122
|
+
detached = true
|
|
123
|
+
skip-install = true
|
|
124
|
+
dependencies = [
|
|
125
|
+
"hatch",
|
|
126
|
+
"hatch-pip-compile",
|
|
127
|
+
]
|
|
128
|
+
|
|
129
|
+
# PeruHatch repository and package locking plugin
|
|
130
|
+
[tool.hatch.env.collectors.custom]
|
|
131
|
+
path = ".hatch/hatch_plugin.py"
|
|
132
|
+
|
|
133
|
+
# This is necessary to use 'uv' as the resolver if this is the top-level package
|
|
134
|
+
# in a monorepo (which is usually the case). Remove this if copying the
|
|
135
|
+
# package into a monorepo
|
|
136
|
+
[tool.uv.workspace]
|
|
File without changes
|
sagemaker_studio_dataengineering_sessions-1.0.2/sagemaker_studio_dataengineering_sessions/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# Package initializer: sets up the module-level logger for the base session
# manager so its log records are routed through the shared setup_logger helper.
import logging

from sagemaker_studio_dataengineering_sessions.sagemaker_base_session_manager.common.logger_utils import setup_logger

# NOTE(review): logging.basicConfig at import time configures the ROOT logger
# as a side effect of importing this library package — confirm this is
# intentional, as it can override the host application's logging setup.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Tag this package's logger; exact semantics of the two labels are defined by
# setup_logger in common.logger_utils (not visible here).
setup_logger(logger, "SageMakerBaseSessionManager", "connection_magic")
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from sagemaker_studio_dataengineering_sessions.sagemaker_base_session_manager.common.constants import Language
|
|
5
|
+
from sagemaker_studio_dataengineering_sessions.sagemaker_base_session_manager.common.sagemaker_connection_display import SageMakerConnectionDisplay
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BaseSessionManager(metaclass=abc.ABCMeta):
    """Abstract base class for remote-compute session managers.

    Concrete subclasses (Spark, Glue, EMR, database, ...) must implement the
    session lifecycle (``create_session``/``stop_session``), statement
    execution (``run_statement``), the connectivity check
    (``is_session_connectable``), and ``_configure_core``. All other methods
    are optional capabilities that default to raising ``NotImplementedError``
    so each subclass opts in explicitly.
    """

    def __init__(self) -> None:
        # The limit for the number of rows that can be returned in a SQL query. Defaults to 10000.
        self.sql_result_row_limit = 10_000

    @abc.abstractmethod
    def create_session(self):
        """
        Create a new session
        """
        raise NotImplementedError('Must define create_session to use this BaseSessionManager')

    @abc.abstractmethod
    def run_statement(self, cell="", language: Language | None = None):
        """
        Run a statement against the session
        :param cell: the code to run
        :param language: language of the code; None lets the subclass pick its default
        """
        raise NotImplementedError('Must define run_statement to use this BaseSessionManager')

    @abc.abstractmethod
    def stop_session(self):
        """
        Stop the session
        """
        raise NotImplementedError('Must define stop_session to use this BaseSessionManager')

    @abc.abstractmethod
    def is_session_connectable(self) -> bool:
        """
        Check if the session is connectable and in a valid status for running statement
        :return: true if the session is in a valid status, false otherwise
        """
        raise NotImplementedError('Must define is_session_connectable to use this BaseSessionManager')

    @abc.abstractmethod
    def _configure_core(self, cell: str):
        """
        To configure current compute to be connected and start a new session with applied configuration
        :param cell: the content to be applied for the session
        """

    def configure(self, cell: str, force: bool = False):
        """
        To configure current compute to be connected and start a new session with applied configuration
        :param cell: the content to be applied for the session
        :param force: a boolean to check if a user wants to force start a new session to apply configuration
        """
        if self.is_session_connectable():
            if not force:
                # Refuse to silently drop a live session; require an explicit --force.
                SageMakerConnectionDisplay.send_error(
                    "A session has already been started. If you intend to recreate the "
                    "session with new configurations, please include the -f or --force argument.")
            else:
                # Recreate the session so the new configuration takes effect.
                self.stop_session()
                self._configure_core(cell)
                self.create_session()
        else:
            # No live session: just stage the configuration for the next start.
            self._configure_core(cell)

    def send_to_remote(self, local_var: str, remote_var: str, language=Language.python):
        """
        Send a local variable in kernel's userspace to remote compute.
        e.g: for an EMR cluster, send a local variable to spark
        :param local_var: local variable name
        :param remote_var: remote variable name
        """
        # Not an abstract method because by default a session manager does not support this;
        # only Spark session manager supports this for now.
        raise NotImplementedError('Send_to_remote is not supported')

    def get_info(self):
        """
        Get information about the connected compute session
        """
        raise NotImplementedError('get_info is not supported for current session')

    def get_session_id(self):
        """
        Get the session ID of the connected compute session
        """
        raise NotImplementedError('get_session_id is not supported for current session')

    def get_status(self):
        """
        Get the status of the connected compute session
        """
        raise NotImplementedError('get_status is not supported for current session')

    def add_tags(self, tags: str):
        """
        Add tags to the connected compute session resources
        """
        raise NotImplementedError('add_tags is not supported for current session')

    def get_logs(self):
        """
        Gets the current session's Livy logs
        """
        raise NotImplementedError('get_logs is not supported for current session')

    def matplot(self, line: str):
        """
        Use matplotlib to render the current session's plot
        """
        raise NotImplementedError('matplot is not supported for current session')

    def set_session_id_prefix(self, prefix: str, force: bool = False):
        """
        Sets the session ID prefix of the session to be created
        """
        raise NotImplementedError('set_session_id_prefix is not supported for current session')

    def set_number_of_workers(self, number: str, force: bool = False):
        """
        Sets the number of workers of the session to be created
        """
        raise NotImplementedError('set_number_of_workers is not supported for current session')

    def set_worker_type(self, type: str, force: bool = False):
        """
        Sets the worker type of the session to be created
        """
        # NOTE: parameter name `type` shadows the builtin but is kept for
        # backward compatibility with keyword callers.
        raise NotImplementedError('set_worker_type is not supported for current session')

    def set_session_type(self, session_type: str, force: bool = False):
        """
        Sets the session type of the session to be created. Acceptable session_type values are: streaming and etl.
        """
        raise NotImplementedError('set_session_type is not supported for current session')

    def set_glue_version(self, glue_version: str, force: bool = False):
        """
        Sets the glue version of the session to be created
        """
        raise NotImplementedError('set_glue_version is not supported for current session')

    def set_idle_timeout(self, idle_timeout: str, force: bool = False):
        """
        Sets the idle timeout value of the session to be created
        """
        raise NotImplementedError('set_idle_timeout is not supported for current session')

    def spark_conf(self, spark_conf: str, force: bool = False):
        """
        Sets the spark configuration value of the session to be created
        """
        raise NotImplementedError('spark_conf is not supported for current session')

    def get_logger(self):
        """
        Return this manager's logger, creating a default one lazily on first use.
        """
        if not hasattr(self, '_logger'):
            self._logger = logging.getLogger(__name__)
        return self._logger