dagster-snowflake-pyspark 0.17.17__py3-none-any.whl → 0.28.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their public registries, and is provided for informational purposes only.
--- a/dagster_snowflake_pyspark/__init__.py
+++ b/dagster_snowflake_pyspark/__init__.py
@@ -1,9 +1,10 @@
- from dagster._core.utils import check_dagster_package_version
+ from dagster_shared.libraries import DagsterLibraryRegistry

- from .snowflake_pyspark_type_handler import (
+ from dagster_snowflake_pyspark.snowflake_pyspark_type_handler import (
+     SnowflakePySparkIOManager as SnowflakePySparkIOManager,
      SnowflakePySparkTypeHandler as SnowflakePySparkTypeHandler,
      snowflake_pyspark_io_manager as snowflake_pyspark_io_manager,
  )
- from .version import __version__ as __version__
+ from dagster_snowflake_pyspark.version import __version__ as __version__

- check_dagster_package_version("dagster-snowflake-pyspark", __version__)
+ DagsterLibraryRegistry.register("dagster-snowflake-pyspark", __version__)
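
The rewritten __init__ also re-exports the new Pythonic resource at the package root. A minimal consumer-side sketch of the import surface after this hunk (names taken directly from the re-exports above):

    from dagster_snowflake_pyspark import (
        SnowflakePySparkIOManager,    # added in this release
        SnowflakePySparkTypeHandler,
        snowflake_pyspark_io_manager,
    )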
--- /dev/null
+++ b/dagster_snowflake_pyspark/constants.py
@@ -0,0 +1,5 @@
+ # Description: This file contains the Snowflake connection identifiers for the Snowflake partner account.
+ # The connection identifiers are used to identify the partner account when connecting to Snowflake.
+ # We use different connection identifiers for different connection code paths to ensure that each is
+ # working as expected.
+ SNOWFLAKE_PARTNER_CONNECTION_IDENTIFIER_PYSPARK = "DagsterLabs_Dagster_Pyspark"
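
The new constant surfaces later in this diff, where _get_snowflake_options passes it as Snowflake's APPLICATION connection parameter so that sessions opened by this integration are attributed to the Dagster partner account. A sketch of the options mapping it ends up in (every value except APPLICATION is a placeholder, not taken from this hunk):

    # Illustrative shape of the Spark connector options built in
    # snowflake_pyspark_type_handler.py; only APPLICATION comes from this file.
    options = {
        "sfDatabase": "my_database",
        "sfSchema": "my_schema",
        "sfWarehouse": "my_warehouse",
        "APPLICATION": SNOWFLAKE_PARTNER_CONNECTION_IDENTIFIER_PYSPARK,
    }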
--- /dev/null
+++ b/dagster_snowflake_pyspark/py.typed
@@ -0,0 +1 @@
+ partial
--- a/dagster_snowflake_pyspark/snowflake_pyspark_type_handler.py
+++ b/dagster_snowflake_pyspark/snowflake_pyspark_type_handler.py
@@ -1,11 +1,16 @@
- from typing import Mapping
+ from collections.abc import Mapping, Sequence
+ from typing import Optional

  import dagster._check as check
  from dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema
  from dagster._core.definitions.metadata import RawMetadataValue
  from dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice
- from dagster_snowflake import build_snowflake_io_manager
+ from dagster_snowflake import SnowflakeIOManager, build_snowflake_io_manager
+ from dagster_snowflake.snowflake_io_manager import SnowflakeDbClient
  from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import StructType
+
+ from dagster_snowflake_pyspark.constants import SNOWFLAKE_PARTNER_CONNECTION_IDENTIFIER_PYSPARK

  SNOWFLAKE_CONNECTOR = "net.snowflake.spark.snowflake"

@@ -23,55 +28,53 @@ def _get_snowflake_options(config, table_slice: TableSlice) -> Mapping[str, str]
          "sfDatabase": config["database"],
          "sfSchema": table_slice.schema,
          "sfWarehouse": config["warehouse"],
-         "dbtable": table_slice.table,
+         "APPLICATION": SNOWFLAKE_PARTNER_CONNECTION_IDENTIFIER_PYSPARK,
      }

      return conf


  class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):
-     """
-     Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.
+     """Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.

      Examples:
          .. code-block:: python

-             from dagster_snowflake import build_snowflake_io_manager
+             from dagster_snowflake import SnowflakeIOManager
+             from dagster_snowflake_pandas import SnowflakePandasTypeHandler
              from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler
-             from pyspark.sql import DataFrame
-             from dagster import Definitions
+             from dagster import Definitions, EnvVar

-             snowflake_io_manager = build_snowflake_io_manager([SnowflakePySparkTypeHandler()])
+             class MySnowflakeIOManager(SnowflakeIOManager):
+                 @staticmethod
+                 def type_handlers() -> Sequence[DbTypeHandler]:
+                     return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]

-             @asset
-             def my_asset() -> DataFrame:
+             @asset(
+                 key_prefix=["my_schema"] # will be used as the schema in snowflake
+             )
+             def my_table() -> pd.DataFrame: # the name of the asset will be the table name
                  ...

-             defs = Definitions(
-                 assets=[my_asset],
+             Definitions(
+                 assets=[my_table],
                  resources={
-                     "io_manager": snowflake_io_manager.configured(...)
+                     "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), warehouse="my_warehouse", ...)
                  }
              )

-             # OR
-
-             @job(resource_defs={'io_manager': snowflake_io_manager})
-             def my_job():
-                 ...
-
      """

-     def handle_output(
-         self, context: OutputContext, table_slice: TableSlice, obj: DataFrame
+     def handle_output( # pyright: ignore[reportIncompatibleMethodOverride]
+         self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _
      ) -> Mapping[str, RawMetadataValue]:
          options = _get_snowflake_options(context.resource_config, table_slice)

          with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])

-         with_uppercase_cols.write.format(SNOWFLAKE_CONNECTOR).options(**options).mode(
-             "append"
-         ).save()
+         with_uppercase_cols.write.format(SNOWFLAKE_CONNECTOR).options(**options).option(
+             "dbtable", table_slice.table
+         ).mode("append").save()

          return {
              "dataframe_columns": MetadataValue.table_schema(
@@ -84,12 +87,19 @@ class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):
              ),
          }

-     def load_input(self, context: InputContext, table_slice: TableSlice) -> DataFrame:
+     def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame: # pyright: ignore[reportIncompatibleMethodOverride]
          options = _get_snowflake_options(context.resource_config, table_slice)

-         spark = SparkSession.builder.getOrCreate()
-         df = spark.read.format(SNOWFLAKE_CONNECTOR).options(**options).load()
+         spark = SparkSession.builder.getOrCreate() # type: ignore
+         if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:
+             return spark.createDataFrame([], StructType([]))

+         df = (
+             spark.read.format(SNOWFLAKE_CONNECTOR)
+             .options(**options)
+             .option("query", SnowflakeDbClient.get_select_statement(table_slice))
+             .load()
+         )
          return df.toDF(*[c.lower() for c in df.columns])

      @property
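
Two behavioral changes land in load_input: a partitioned upstream with no partition keys now short-circuits to an empty DataFrame, and the read goes through the connector's "query" option with a SELECT built by SnowflakeDbClient.get_select_statement instead of a whole-table "dbtable" read. A hedged sketch of what that pushed-down query might resolve to for a column-subset, partitioned input (illustrative placeholders, not the library's exact output):

    # Roughly the kind of statement .option("query", ...) could carry when the
    # input selects columns ["a", "b"] from a partitioned table:
    query = (
        "SELECT a, b FROM my_database.my_schema.my_table"
        " WHERE partition_col >= '2023-01-01' AND partition_col < '2023-01-02'"
    )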
@@ -97,9 +107,13 @@ class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):
          return [DataFrame]


- snowflake_pyspark_io_manager = build_snowflake_io_manager([SnowflakePySparkTypeHandler()])
+ snowflake_pyspark_io_manager = build_snowflake_io_manager(
+     [SnowflakePySparkTypeHandler()], default_load_type=DataFrame
+ )
  snowflake_pyspark_io_manager.__doc__ = """
- An IO manager definition that reads inputs from and writes PySpark DataFrames to Snowflake.
+ An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When
+ using the snowflake_pyspark_io_manager, any inputs and outputs without type annotations will be loaded
+ as PySpark DataFrames.

  Returns:
      IOManagerDefinition
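
Because the manager is now built with default_load_type=DataFrame, inputs and outputs without type annotations fall back to PySpark DataFrames, exactly as the updated docstring says. A short sketch of what that permits (assumes an io_manager configured as in the examples below):

    from dagster import asset

    @asset
    def upstream():  # no return annotation: still stored as a Snowflake table
        ...

    @asset
    def downstream(upstream):  # no input annotation: loaded as a PySpark DataFrame
        ...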
@@ -118,7 +132,7 @@ Examples:
          def my_table() -> DataFrame: # the name of the asset will be the table name
              ...

-         defs = Definitions(
+         Definitions(
              assets=[my_table],
              resources={
                  "io_manager": snowflake_pyspark_io_manager.configured({
@@ -133,10 +147,38 @@ Examples:

  Note that the warehouse configuration value is required when using the snowflake_pyspark_io_manager

- If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
- the IO Manager. For assets, the schema will be determined from the asset key.
- For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
- via config or on the asset/op, "public" will be used for the schema.
+ You can set a default schema to store the assets using the ``schema`` configuration value of the Snowflake I/O
+ Manager. This schema will be used if no other schema is specified directly on an asset or op.
+
+ .. code-block:: python
+
+     Definitions(
+         assets=[my_table],
+         resources={"io_manager": snowflake_pyspark_io_manager.configured(
+             {"database": "my_database", "schema": "my_schema", ...} # will be used as the schema
+         )}
+     )
+
+
+ On individual assets, you can also specify the schema where they should be stored using metadata or
+ by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
+ take precedence.
+
+ .. code-block:: python
+
+     @asset(
+         key_prefix=["my_schema"] # will be used as the schema in snowflake
+     )
+     def my_table() -> DataFrame:
+         ...
+
+     @asset(
+         metadata={"schema": "my_schema"} # will be used as the schema in snowflake
+     )
+     def my_other_table() -> DataFrame:
+         ...
+
+ For ops, the schema can be specified by including a "schema" entry in output metadata.

  .. code-block:: python

@@ -144,9 +186,10 @@ Examples:
          out={"my_table": Out(metadata={"schema": "my_schema"})}
      )
      def make_my_table() -> DataFrame:
-         # the returned value will be stored at my_schema.my_table
          ...

+ If none of these is provided, the schema will default to "public".
+
  To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
  In or AssetIn.

@@ -160,3 +203,108 @@ Examples:
          ...

  """
+
+
+ class SnowflakePySparkIOManager(SnowflakeIOManager):
+     """An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When
+     using the SnowflakePySparkIOManager, any inputs and outputs without type annotations will be loaded
+     as PySpark DataFrames.
+
+     Returns:
+         IOManagerDefinition
+
+     Examples:
+         .. code-block:: python
+
+             from dagster_snowflake_pyspark import SnowflakePySparkIOManager
+             from pyspark.sql import DataFrame
+             from dagster import Definitions, EnvVar
+
+             @asset(
+                 key_prefix=["my_schema"] # will be used as the schema in snowflake
+             )
+             def my_table() -> DataFrame: # the name of the asset will be the table name
+                 ...
+
+             Definitions(
+                 assets=[my_table],
+                 resources={
+                     "io_manager": SnowflakePySparkIOManager(
+                         database="my_database",
+                         warehouse="my_warehouse", # required for SnowflakePySparkIOManager
+                         account=EnvVar("SNOWFLAKE_ACCOUNT"),
+                         password=EnvVar("SNOWFLAKE_PASSWORD"),
+                         ...
+                     )
+                 }
+             )
+
+     Note that the warehouse configuration value is required when using the SnowflakePySparkIOManager
+
+     You can set a default schema to store the assets using the ``schema`` configuration value of the Snowflake I/O
+     Manager. This schema will be used if no other schema is specified directly on an asset or op.
+
+     .. code-block:: python
+
+         Definitions(
+             assets=[my_table],
+             resources={
+                 "io_manager": SnowflakePySparkIOManager(database="my_database", schema="my_schema", ...)
+             }
+         )
+
+
+     On individual assets, you can also specify the schema where they should be stored using metadata or
+     by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
+     take precedence.
+
+     .. code-block:: python
+
+         @asset(
+             key_prefix=["my_schema"] # will be used as the schema in snowflake
+         )
+         def my_table() -> DataFrame:
+             ...
+
+         @asset(
+             metadata={"schema": "my_schema"} # will be used as the schema in snowflake
+         )
+         def my_other_table() -> DataFrame:
+             ...
+
+     For ops, the schema can be specified by including a "schema" entry in output metadata.
+
+     .. code-block:: python
+
+         @op(
+             out={"my_table": Out(metadata={"schema": "my_schema"})}
+         )
+         def make_my_table() -> DataFrame:
+             ...
+
+     If none of these is provided, the schema will default to "public".
+     To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
+     In or AssetIn.
+
+     .. code-block:: python
+
+         @asset(
+             ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}
+         )
+         def my_table_a(my_table: DataFrame) -> DataFrame:
+             # my_table will just contain the data from column "a"
+             ...
+
+     """
+
+     @classmethod
+     def _is_dagster_maintained(cls) -> bool:
+         return True
+
+     @staticmethod
+     def type_handlers() -> Sequence[DbTypeHandler]:
+         return [SnowflakePySparkTypeHandler()]
+
+     @staticmethod
+     def default_load_type() -> Optional[type]:
+         return DataFrame
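
Both configuration styles now ship side by side: the configured function-style definition and the new Pythonic SnowflakePySparkIOManager. A migration sketch based on the two docstrings above (argument values are placeholders):

    from dagster import EnvVar
    from dagster_snowflake_pyspark import (
        SnowflakePySparkIOManager,
        snowflake_pyspark_io_manager,
    )

    # 0.17-era style, still exported:
    old_io_manager = snowflake_pyspark_io_manager.configured({
        "database": "my_database",
        "warehouse": "my_warehouse",
        "account": {"env": "SNOWFLAKE_ACCOUNT"},
    })

    # 0.28-era Pythonic style, equivalent wiring:
    new_io_manager = SnowflakePySparkIOManager(
        database="my_database",
        warehouse="my_warehouse",  # required, per the docstring above
        account=EnvVar("SNOWFLAKE_ACCOUNT"),
        password=EnvVar("SNOWFLAKE_PASSWORD"),
    )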
--- a/dagster_snowflake_pyspark/version.py
+++ b/dagster_snowflake_pyspark/version.py
@@ -1 +1 @@
- __version__ = "0.17.17"
+ __version__ = "0.28.2"
--- /dev/null
+++ b/dagster_snowflake_pyspark-0.28.2.dist-info/METADATA
@@ -0,0 +1,28 @@
+ Metadata-Version: 2.4
+ Name: dagster-snowflake-pyspark
+ Version: 0.28.2
+ Summary: Package for integrating Snowflake and PySpark with Dagster.
+ Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-snowflake-pyspark
+ Author: Dagster Labs
+ Author-email: hello@dagsterlabs.com
+ License: Apache-2.0
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10,<3.14
+ License-File: LICENSE
+ Requires-Dist: dagster==1.12.2
+ Requires-Dist: dagster-snowflake==0.28.2
+ Requires-Dist: pyspark<4
+ Requires-Dist: requests
+ Requires-Dist: sqlalchemy!=1.4.42
+ Requires-Dist: snowflake-sqlalchemy>=1.2
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
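
The rebuilt metadata pins its Dagster dependencies exactly and narrows the supported interpreter range to Python 3.10-3.13. A requirements sketch satisfying the Requires-Dist lines above (pins copied verbatim):

    dagster==1.12.2
    dagster-snowflake==0.28.2
    dagster-snowflake-pyspark==0.28.2
    pyspark<4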
--- /dev/null
+++ b/dagster_snowflake_pyspark-0.28.2.dist-info/RECORD
@@ -0,0 +1,10 @@
+ dagster_snowflake_pyspark/__init__.py,sha256=6uFEmuB7ctAVeYqjIvlpUkS3H6NsfTkCTGxDCnFdDOk,472
+ dagster_snowflake_pyspark/constants.py,sha256=0GwhKlR3tzwIv2FbgK9e2D78iAPWXwhni_bSdfoFyNM,410
+ dagster_snowflake_pyspark/py.typed,sha256=la67KBlbjXN-_-DfGNcdOcjYumVpKG_Tkw-8n5dnGB4,8
+ dagster_snowflake_pyspark/snowflake_pyspark_type_handler.py,sha256=Hn3izqO4ctRBkFOYmocRsDtgWzyyqzEy0ZjvM1eSCcg,11157
+ dagster_snowflake_pyspark/version.py,sha256=K-TM2fq9AmH_Dk8Cadam72wILDZ_6qftLHvY9P1Fc3I,23
+ dagster_snowflake_pyspark-0.28.2.dist-info/licenses/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+ dagster_snowflake_pyspark-0.28.2.dist-info/METADATA,sha256=DY1OKr4Dwnfn1Up_WY_4R_aJb3duDFOzzgJgnWi0VNs,918
+ dagster_snowflake_pyspark-0.28.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dagster_snowflake_pyspark-0.28.2.dist-info/top_level.txt,sha256=NH48Qcesg34H5Ih-KKuOhwmWzvcaqVkN9lvADwCJv8U,26
+ dagster_snowflake_pyspark-0.28.2.dist-info/RECORD,,
--- a/dagster_snowflake_pyspark-0.17.17.dist-info/WHEEL
+++ b/dagster_snowflake_pyspark-0.28.2.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.33.6)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

--- a/dagster_snowflake_pyspark-0.17.17.dist-info/METADATA
+++ /dev/null
@@ -1,23 +0,0 @@
- Metadata-Version: 2.1
- Name: dagster-snowflake-pyspark
- Version: 0.17.17
- Summary: Package for integrating Snowflake and PySpark with Dagster.
- Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-snowflake-pyspark
- Author: Elementl
- Author-email: hello@elementl.com
- License: Apache-2.0
- Platform: UNKNOWN
- Classifier: Programming Language :: Python :: 3.7
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: License :: OSI Approved :: Apache Software License
- Classifier: Operating System :: OS Independent
- License-File: LICENSE
- Requires-Dist: dagster (==1.1.17)
- Requires-Dist: dagster-snowflake (==0.17.17)
- Requires-Dist: pyspark
- Requires-Dist: requests
-
- UNKNOWN
-
--- a/dagster_snowflake_pyspark-0.17.17.dist-info/RECORD
+++ /dev/null
@@ -1,8 +0,0 @@
- dagster_snowflake_pyspark/__init__.py,sha256=y3sz5BMvWB5g3Ltrq1iIvCrfCwWCG-fRrPunjFB7zRk,362
- dagster_snowflake_pyspark/snowflake_pyspark_type_handler.py,sha256=2niwnj2mksSdoEObGrDW-Gk2hKhPpi5Ab40MCllUSSM,5522
- dagster_snowflake_pyspark/version.py,sha256=woU9IWXWHwuGWS0yp3QUg6UygmaKQdNq2tgod9BYEpM,24
- dagster_snowflake_pyspark-0.17.17.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
- dagster_snowflake_pyspark-0.17.17.dist-info/METADATA,sha256=SYjGmXB0oPkMhJ8hOizMrXBYBc-TZgOYYYnfN3d1hOo,809
- dagster_snowflake_pyspark-0.17.17.dist-info/WHEEL,sha256=p46_5Uhzqz6AzeSosiOnxK-zmFja1i22CrQCjmYe8ec,92
- dagster_snowflake_pyspark-0.17.17.dist-info/top_level.txt,sha256=NH48Qcesg34H5Ih-KKuOhwmWzvcaqVkN9lvADwCJv8U,26
- dagster_snowflake_pyspark-0.17.17.dist-info/RECORD,,