dagster-snowflake-pyspark 0.17.17__py3-none-any.whl → 0.28.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their public registries, and is provided for informational purposes only.
--- a/dagster_snowflake_pyspark/__init__.py
+++ b/dagster_snowflake_pyspark/__init__.py
@@ -1,9 +1,10 @@
- from dagster._core.utils import check_dagster_package_version
+ from dagster_shared.libraries import DagsterLibraryRegistry

- from .snowflake_pyspark_type_handler import (
+ from dagster_snowflake_pyspark.snowflake_pyspark_type_handler import (
+     SnowflakePySparkIOManager as SnowflakePySparkIOManager,
      SnowflakePySparkTypeHandler as SnowflakePySparkTypeHandler,
      snowflake_pyspark_io_manager as snowflake_pyspark_io_manager,
  )
- from .version import __version__ as __version__
+ from dagster_snowflake_pyspark.version import __version__ as __version__

- check_dagster_package_version("dagster-snowflake-pyspark", __version__)
+ DagsterLibraryRegistry.register("dagster-snowflake-pyspark", __version__)
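
The rewritten __init__ also re-exports the new Pythonic resource at the package root. A minimal consumer-side sketch of the import surface after this hunk (names taken directly from the re-exports above):

    from dagster_snowflake_pyspark import (
        SnowflakePySparkIOManager,    # added in this release
        SnowflakePySparkTypeHandler,
        snowflake_pyspark_io_manager,
    )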
--- /dev/null
+++ b/dagster_snowflake_pyspark/constants.py
@@ -0,0 +1,5 @@
+ # Description: This file contains the Snowflake connection identifiers for the Snowflake partner account.
+ # The connection identifiers are used to identify the partner account when connecting to Snowflake.
+ # We use different connection identifiers for different connection code paths to ensure that each is
+ # working as expected.
+ SNOWFLAKE_PARTNER_CONNECTION_IDENTIFIER_PYSPARK = "DagsterLabs_Dagster_Pyspark"
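
The new constant surfaces later in this diff, where _get_snowflake_options passes it as Snowflake's APPLICATION connection parameter so that sessions opened by this integration are attributed to the Dagster partner account. A sketch of the options mapping it ends up in (every value except APPLICATION is a placeholder, not taken from this hunk):

    # Illustrative shape of the Spark connector options built in
    # snowflake_pyspark_type_handler.py; only APPLICATION comes from this file.
    options = {
        "sfDatabase": "my_database",
        "sfSchema": "my_schema",
        "sfWarehouse": "my_warehouse",
        "APPLICATION": SNOWFLAKE_PARTNER_CONNECTION_IDENTIFIER_PYSPARK,
    }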
--- /dev/null
+++ b/dagster_snowflake_pyspark/py.typed
@@ -0,0 +1 @@
+ partial
--- a/dagster_snowflake_pyspark/snowflake_pyspark_type_handler.py
+++ b/dagster_snowflake_pyspark/snowflake_pyspark_type_handler.py
@@ -1,11 +1,16 @@
- from typing import Mapping
+ from collections.abc import Mapping, Sequence
+ from typing import Optional

  import dagster._check as check
  from dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema
  from dagster._core.definitions.metadata import RawMetadataValue
  from dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice
- from dagster_snowflake import build_snowflake_io_manager
+ from dagster_snowflake import SnowflakeIOManager, build_snowflake_io_manager
+ from dagster_snowflake.snowflake_io_manager import SnowflakeDbClient
  from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.types import StructType
+
+ from dagster_snowflake_pyspark.constants import SNOWFLAKE_PARTNER_CONNECTION_IDENTIFIER_PYSPARK

  SNOWFLAKE_CONNECTOR = "net.snowflake.spark.snowflake"

@@ -23,55 +28,53 @@ def _get_snowflake_options(config, table_slice: TableSlice) -> Mapping[str, str]
          "sfDatabase": config["database"],
          "sfSchema": table_slice.schema,
          "sfWarehouse": config["warehouse"],
-         "dbtable": table_slice.table,
+         "APPLICATION": SNOWFLAKE_PARTNER_CONNECTION_IDENTIFIER_PYSPARK,
      }

      return conf


  class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):
-     """
-     Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.
+     """Plugin for the Snowflake I/O Manager that can store and load PySpark DataFrames as Snowflake tables.

      Examples:
          .. code-block:: python

-             from dagster_snowflake import build_snowflake_io_manager
+             from dagster_snowflake import SnowflakeIOManager
+             from dagster_snowflake_pandas import SnowflakePandasTypeHandler
              from dagster_snowflake_pyspark import SnowflakePySparkTypeHandler
-             from pyspark.sql import DataFrame
-             from dagster import Definitions
+             from dagster import Definitions, EnvVar

-             snowflake_io_manager = build_snowflake_io_manager([SnowflakePySparkTypeHandler()])
+             class MySnowflakeIOManager(SnowflakeIOManager):
+                 @staticmethod
+                 def type_handlers() -> Sequence[DbTypeHandler]:
+                     return [SnowflakePandasTypeHandler(), SnowflakePySparkTypeHandler()]

-             @asset
-             def my_asset() -> DataFrame:
+             @asset(
+                 key_prefix=["my_schema"] # will be used as the schema in snowflake
+             )
+             def my_table() -> pd.DataFrame: # the name of the asset will be the table name
                  ...

-             defs = Definitions(
-                 assets=[my_asset],
+             Definitions(
+                 assets=[my_table],
                  resources={
-                     "io_manager": snowflake_io_manager.configured(...)
+                     "io_manager": MySnowflakeIOManager(database="MY_DATABASE", account=EnvVar("SNOWFLAKE_ACCOUNT"), warehouse="my_warehouse", ...)
                  }
              )

-             # OR
-
-             @job(resource_defs={'io_manager': snowflake_io_manager})
-             def my_job():
-                 ...
-
      """

-     def handle_output(
-         self, context: OutputContext, table_slice: TableSlice, obj: DataFrame
+     def handle_output( # pyright: ignore[reportIncompatibleMethodOverride]
+         self, context: OutputContext, table_slice: TableSlice, obj: DataFrame, _
      ) -> Mapping[str, RawMetadataValue]:
          options = _get_snowflake_options(context.resource_config, table_slice)

          with_uppercase_cols = obj.toDF(*[c.upper() for c in obj.columns])

-         with_uppercase_cols.write.format(SNOWFLAKE_CONNECTOR).options(**options).mode(
-             "append"
-         ).save()
+         with_uppercase_cols.write.format(SNOWFLAKE_CONNECTOR).options(**options).option(
+             "dbtable", table_slice.table
+         ).mode("append").save()

          return {
              "dataframe_columns": MetadataValue.table_schema(
@@ -84,12 +87,19 @@ class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):
              ),
          }

-     def load_input(self, context: InputContext, table_slice: TableSlice) -> DataFrame:
+     def load_input(self, context: InputContext, table_slice: TableSlice, _) -> DataFrame: # pyright: ignore[reportIncompatibleMethodOverride]
          options = _get_snowflake_options(context.resource_config, table_slice)

-         spark = SparkSession.builder.getOrCreate()
-         df = spark.read.format(SNOWFLAKE_CONNECTOR).options(**options).load()
+         spark = SparkSession.builder.getOrCreate() # type: ignore
+         if table_slice.partition_dimensions and len(context.asset_partition_keys) == 0:
+             return spark.createDataFrame([], StructType([]))

+         df = (
+             spark.read.format(SNOWFLAKE_CONNECTOR)
+             .options(**options)
+             .option("query", SnowflakeDbClient.get_select_statement(table_slice))
+             .load()
+         )
          return df.toDF(*[c.lower() for c in df.columns])

      @property
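
Two behavioral changes land in load_input: a partitioned upstream with no partition keys now short-circuits to an empty DataFrame, and the read goes through the connector's "query" option with a SELECT built by SnowflakeDbClient.get_select_statement instead of a whole-table "dbtable" read. A hedged sketch of what that pushed-down query might resolve to for a column-subset, partitioned input (illustrative placeholders, not the library's exact output):

    # Roughly the kind of statement .option("query", ...) could carry when the
    # input selects columns ["a", "b"] from a partitioned table:
    query = (
        "SELECT a, b FROM my_database.my_schema.my_table"
        " WHERE partition_col >= '2023-01-01' AND partition_col < '2023-01-02'"
    )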
@@ -97,9 +107,13 @@ class SnowflakePySparkTypeHandler(DbTypeHandler[DataFrame]):
          return [DataFrame]


- snowflake_pyspark_io_manager = build_snowflake_io_manager([SnowflakePySparkTypeHandler()])
+ snowflake_pyspark_io_manager = build_snowflake_io_manager(
+     [SnowflakePySparkTypeHandler()], default_load_type=DataFrame
+ )
  snowflake_pyspark_io_manager.__doc__ = """
- An IO manager definition that reads inputs from and writes PySpark DataFrames to Snowflake.
+ An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When
+ using the snowflake_pyspark_io_manager, any inputs and outputs without type annotations will be loaded
+ as PySpark DataFrames.

  Returns:
      IOManagerDefinition
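
Because the manager is now built with default_load_type=DataFrame, inputs and outputs without type annotations fall back to PySpark DataFrames, exactly as the updated docstring says. A short sketch of what that permits (assumes an io_manager configured as in the examples below):

    from dagster import asset

    @asset
    def upstream():  # no return annotation: still stored as a Snowflake table
        ...

    @asset
    def downstream(upstream):  # no input annotation: loaded as a PySpark DataFrame
        ...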
@@ -118,7 +132,7 @@ Examples:
          def my_table() -> DataFrame: # the name of the asset will be the table name
              ...

-         defs = Definitions(
+         Definitions(
              assets=[my_table],
              resources={
                  "io_manager": snowflake_pyspark_io_manager.configured({
@@ -133,10 +147,38 @@ Examples:

  Note that the warehouse configuration value is required when using the snowflake_pyspark_io_manager

- If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
- the IO Manager. For assets, the schema will be determined from the asset key.
- For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
- via config or on the asset/op, "public" will be used for the schema.
+ You can set a default schema to store the assets using the ``schema`` configuration value of the Snowflake I/O
+ Manager. This schema will be used if no other schema is specified directly on an asset or op.
+
+ .. code-block:: python
+
+     Definitions(
+         assets=[my_table],
+         resources={"io_manager": snowflake_pyspark_io_manager.configured(
+             {"database": "my_database", "schema": "my_schema", ...} # will be used as the schema
+         )}
+     )
+
+
+ On individual assets, you can also specify the schema where they should be stored using metadata or
+ by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
+ take precedence.
+
+ .. code-block:: python
+
+     @asset(
+         key_prefix=["my_schema"] # will be used as the schema in snowflake
+     )
+     def my_table() -> DataFrame:
+         ...
+
+     @asset(
+         metadata={"schema": "my_schema"} # will be used as the schema in snowflake
+     )
+     def my_other_table() -> DataFrame:
+         ...
+
+ For ops, the schema can be specified by including a "schema" entry in output metadata.

  .. code-block:: python

@@ -144,9 +186,10 @@ Examples:
          out={"my_table": Out(metadata={"schema": "my_schema"})}
      )
      def make_my_table() -> DataFrame:
-         # the returned value will be stored at my_schema.my_table
          ...

+ If none of these is provided, the schema will default to "public".
+
  To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
  In or AssetIn.

@@ -160,3 +203,108 @@ Examples:
          ...

  """
+
+
+ class SnowflakePySparkIOManager(SnowflakeIOManager):
+     """An I/O manager definition that reads inputs from and writes PySpark DataFrames to Snowflake. When
+     using the SnowflakePySparkIOManager, any inputs and outputs without type annotations will be loaded
+     as PySpark DataFrames.
+
+     Returns:
+         IOManagerDefinition
+
+     Examples:
+         .. code-block:: python
+
+             from dagster_snowflake_pyspark import SnowflakePySparkIOManager
+             from pyspark.sql import DataFrame
+             from dagster import Definitions, EnvVar
+
+             @asset(
+                 key_prefix=["my_schema"] # will be used as the schema in snowflake
+             )
+             def my_table() -> DataFrame: # the name of the asset will be the table name
+                 ...
+
+             Definitions(
+                 assets=[my_table],
+                 resources={
+                     "io_manager": SnowflakePySparkIOManager(
+                         database="my_database",
+                         warehouse="my_warehouse", # required for SnowflakePySparkIOManager
+                         account=EnvVar("SNOWFLAKE_ACCOUNT"),
+                         password=EnvVar("SNOWFLAKE_PASSWORD"),
+                         ...
+                     )
+                 }
+             )
+
+     Note that the warehouse configuration value is required when using the SnowflakePySparkIOManager
+
+     You can set a default schema to store the assets using the ``schema`` configuration value of the Snowflake I/O
+     Manager. This schema will be used if no other schema is specified directly on an asset or op.
+
+     .. code-block:: python
+
+         Definitions(
+             assets=[my_table],
+             resources={
+                 "io_manager": SnowflakePySparkIOManager(database="my_database", schema="my_schema", ...)
+             }
+         )
+
+
+     On individual assets, you can also specify the schema where they should be stored using metadata or
+     by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
+     take precedence.
+
+     .. code-block:: python
+
+         @asset(
+             key_prefix=["my_schema"] # will be used as the schema in snowflake
+         )
+         def my_table() -> DataFrame:
+             ...
+
+         @asset(
+             metadata={"schema": "my_schema"} # will be used as the schema in snowflake
+         )
+         def my_other_table() -> DataFrame:
+             ...
+
+     For ops, the schema can be specified by including a "schema" entry in output metadata.
+
+     .. code-block:: python
+
+         @op(
+             out={"my_table": Out(metadata={"schema": "my_schema"})}
+         )
+         def make_my_table() -> DataFrame:
+             ...
+
+     If none of these is provided, the schema will default to "public".
+     To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
+     In or AssetIn.
+
+     .. code-block:: python
+
+         @asset(
+             ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}
+         )
+         def my_table_a(my_table: DataFrame) -> DataFrame:
+             # my_table will just contain the data from column "a"
+             ...
+
+     """
+
+     @classmethod
+     def _is_dagster_maintained(cls) -> bool:
+         return True
+
+     @staticmethod
+     def type_handlers() -> Sequence[DbTypeHandler]:
+         return [SnowflakePySparkTypeHandler()]
+
+     @staticmethod
+     def default_load_type() -> Optional[type]:
+         return DataFrame
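
Both configuration styles now ship side by side: the configured function-style definition and the new Pythonic SnowflakePySparkIOManager. A migration sketch based on the two docstrings above (argument values are placeholders):

    from dagster import EnvVar
    from dagster_snowflake_pyspark import (
        SnowflakePySparkIOManager,
        snowflake_pyspark_io_manager,
    )

    # 0.17-era style, still exported:
    old_io_manager = snowflake_pyspark_io_manager.configured({
        "database": "my_database",
        "warehouse": "my_warehouse",
        "account": {"env": "SNOWFLAKE_ACCOUNT"},
    })

    # 0.28-era Pythonic style, equivalent wiring:
    new_io_manager = SnowflakePySparkIOManager(
        database="my_database",
        warehouse="my_warehouse",  # required, per the docstring above
        account=EnvVar("SNOWFLAKE_ACCOUNT"),
        password=EnvVar("SNOWFLAKE_PASSWORD"),
    )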
--- a/dagster_snowflake_pyspark/version.py
+++ b/dagster_snowflake_pyspark/version.py
@@ -1 +1 @@
- __version__ = "0.17.17"
+ __version__ = "0.28.2"
--- /dev/null
+++ b/dagster_snowflake_pyspark-0.28.2.dist-info/METADATA
@@ -0,0 +1,28 @@
+ Metadata-Version: 2.4
+ Name: dagster-snowflake-pyspark
+ Version: 0.28.2
+ Summary: Package for integrating Snowflake and PySpark with Dagster.
+ Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-snowflake-pyspark
+ Author: Dagster Labs
+ Author-email: hello@dagsterlabs.com
+ License: Apache-2.0
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.10,<3.14
+ License-File: LICENSE
+ Requires-Dist: dagster==1.12.2
+ Requires-Dist: dagster-snowflake==0.28.2
+ Requires-Dist: pyspark<4
+ Requires-Dist: requests
+ Requires-Dist: sqlalchemy!=1.4.42
+ Requires-Dist: snowflake-sqlalchemy>=1.2
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
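
The rebuilt metadata pins its Dagster dependencies exactly and narrows the supported interpreter range to Python 3.10-3.13. A requirements sketch satisfying the Requires-Dist lines above (pins copied verbatim):

    dagster==1.12.2
    dagster-snowflake==0.28.2
    dagster-snowflake-pyspark==0.28.2
    pyspark<4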
--- /dev/null
+++ b/dagster_snowflake_pyspark-0.28.2.dist-info/RECORD
@@ -0,0 +1,10 @@
+ dagster_snowflake_pyspark/__init__.py,sha256=6uFEmuB7ctAVeYqjIvlpUkS3H6NsfTkCTGxDCnFdDOk,472
+ dagster_snowflake_pyspark/constants.py,sha256=0GwhKlR3tzwIv2FbgK9e2D78iAPWXwhni_bSdfoFyNM,410
+ dagster_snowflake_pyspark/py.typed,sha256=la67KBlbjXN-_-DfGNcdOcjYumVpKG_Tkw-8n5dnGB4,8
+ dagster_snowflake_pyspark/snowflake_pyspark_type_handler.py,sha256=Hn3izqO4ctRBkFOYmocRsDtgWzyyqzEy0ZjvM1eSCcg,11157
+ dagster_snowflake_pyspark/version.py,sha256=K-TM2fq9AmH_Dk8Cadam72wILDZ_6qftLHvY9P1Fc3I,23
+ dagster_snowflake_pyspark-0.28.2.dist-info/licenses/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
+ dagster_snowflake_pyspark-0.28.2.dist-info/METADATA,sha256=DY1OKr4Dwnfn1Up_WY_4R_aJb3duDFOzzgJgnWi0VNs,918
+ dagster_snowflake_pyspark-0.28.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dagster_snowflake_pyspark-0.28.2.dist-info/top_level.txt,sha256=NH48Qcesg34H5Ih-KKuOhwmWzvcaqVkN9lvADwCJv8U,26
+ dagster_snowflake_pyspark-0.28.2.dist-info/RECORD,,
--- a/dagster_snowflake_pyspark-0.17.17.dist-info/WHEEL
+++ b/dagster_snowflake_pyspark-0.28.2.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.33.6)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

--- a/dagster_snowflake_pyspark-0.17.17.dist-info/METADATA
+++ /dev/null
@@ -1,23 +0,0 @@
- Metadata-Version: 2.1
- Name: dagster-snowflake-pyspark
- Version: 0.17.17
- Summary: Package for integrating Snowflake and PySpark with Dagster.
- Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-snowflake-pyspark
- Author: Elementl
- Author-email: hello@elementl.com
- License: Apache-2.0
- Platform: UNKNOWN
- Classifier: Programming Language :: Python :: 3.7
- Classifier: Programming Language :: Python :: 3.8
- Classifier: Programming Language :: Python :: 3.9
- Classifier: Programming Language :: Python :: 3.10
- Classifier: License :: OSI Approved :: Apache Software License
- Classifier: Operating System :: OS Independent
- License-File: LICENSE
- Requires-Dist: dagster (==1.1.17)
- Requires-Dist: dagster-snowflake (==0.17.17)
- Requires-Dist: pyspark
- Requires-Dist: requests
-
- UNKNOWN
-
--- a/dagster_snowflake_pyspark-0.17.17.dist-info/RECORD
+++ /dev/null
@@ -1,8 +0,0 @@
- dagster_snowflake_pyspark/__init__.py,sha256=y3sz5BMvWB5g3Ltrq1iIvCrfCwWCG-fRrPunjFB7zRk,362
- dagster_snowflake_pyspark/snowflake_pyspark_type_handler.py,sha256=2niwnj2mksSdoEObGrDW-Gk2hKhPpi5Ab40MCllUSSM,5522
- dagster_snowflake_pyspark/version.py,sha256=woU9IWXWHwuGWS0yp3QUg6UygmaKQdNq2tgod9BYEpM,24
- dagster_snowflake_pyspark-0.17.17.dist-info/LICENSE,sha256=tAkwu8-AdEyGxGoSvJ2gVmQdcicWw3j1ZZueVV74M-E,11357
- dagster_snowflake_pyspark-0.17.17.dist-info/METADATA,sha256=SYjGmXB0oPkMhJ8hOizMrXBYBc-TZgOYYYnfN3d1hOo,809
- dagster_snowflake_pyspark-0.17.17.dist-info/WHEEL,sha256=p46_5Uhzqz6AzeSosiOnxK-zmFja1i22CrQCjmYe8ec,92
- dagster_snowflake_pyspark-0.17.17.dist-info/top_level.txt,sha256=NH48Qcesg34H5Ih-KKuOhwmWzvcaqVkN9lvADwCJv8U,26
- dagster_snowflake_pyspark-0.17.17.dist-info/RECORD,,