dagster-duckdb-pyspark 0.18.6__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dagster-duckdb-pyspark might be problematic. See the package registry's advisory page for more details.

@@ -1,6 +1,7 @@
1
1
  from dagster._core.libraries import DagsterLibraryRegistry
2
2
 
3
3
  from .duckdb_pyspark_type_handler import (
4
+ DuckDBPySparkIOManager as DuckDBPySparkIOManager,
4
5
  DuckDBPySparkTypeHandler as DuckDBPySparkTypeHandler,
5
6
  duckdb_pyspark_io_manager as duckdb_pyspark_io_manager,
6
7
  )
@@ -1,8 +1,14 @@
1
+ from typing import Optional, Sequence, Type
2
+
1
3
  import pyspark
2
4
  import pyspark.sql
3
5
  from dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema
4
6
  from dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice
5
- from dagster_duckdb.io_manager import DuckDbClient, build_duckdb_io_manager
7
+ from dagster_duckdb.io_manager import (
8
+ DuckDbClient,
9
+ DuckDBIOManager,
10
+ build_duckdb_io_manager,
11
+ )
6
12
  from pyspark.sql import SparkSession
7
13
  from pyspark.sql.types import StructType
8
14
 
@@ -10,28 +16,29 @@ from pyspark.sql.types import StructType
10
16
  class DuckDBPySparkTypeHandler(DbTypeHandler[pyspark.sql.DataFrame]):
11
17
  """Stores PySpark DataFrames in DuckDB.
12
18
 
13
- **Note:** This type handler can only store outputs. It cannot currently load inputs.
14
-
15
- To use this type handler, pass it to ``build_duckdb_io_manager``
19
+ To use this type handler, return it from the ``type_handlers` method of an I/O manager that inherits from ``DuckDBIOManager``.
16
20
 
17
21
  Example:
18
22
  .. code-block:: python
19
23
 
20
- from dagster_duckdb import build_duckdb_io_manager
24
+ from dagster_duckdb import DuckDBIOManager
21
25
  from dagster_duckdb_pyspark import DuckDBPySparkTypeHandler
22
26
 
23
- @asset
24
- def my_table():
25
- ...
27
+ class MyDuckDBIOManager(DuckDBIOManager):
28
+ @staticmethod
29
+ def type_handlers() -> Sequence[DbTypeHandler]:
30
+ return [DuckDBPySparkTypeHandler()]
26
31
 
27
- duckdb_io_manager = build_duckdb_io_manager([DuckDBPySparkTypeHandler()])
32
+ @asset(
33
+ key_prefix=["my_schema"] # will be used as the schema in duckdb
34
+ )
35
+ def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name
36
+ ...
28
37
 
29
- @repository
30
- def my_repo():
31
- return with_resources(
32
- [my_table],
33
- {"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}
34
- )
38
+ defs = Definitions(
39
+ assets=[my_table],
40
+ resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}
41
+ )
35
42
  """
36
43
 
37
44
  def handle_output(
@@ -86,7 +93,7 @@ duckdb_pyspark_io_manager = build_duckdb_io_manager(
86
93
  [DuckDBPySparkTypeHandler()], default_load_type=pyspark.sql.DataFrame
87
94
  )
88
95
  duckdb_pyspark_io_manager.__doc__ = """
89
- An IO manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When
96
+ An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When
90
97
  using the duckdb_pyspark_io_manager, any inputs and outputs without type annotations will be loaded
91
98
  as PySpark DataFrames.
92
99
 
@@ -113,7 +120,7 @@ Examples:
113
120
  )
114
121
 
115
122
  If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
116
- the IO Manager. For assets, the schema will be determined from the asset key.
123
+ the I/O Manager. For assets, the schema will be determined from the asset key.
117
124
  For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
118
125
  via config or on the asset/op, "public" will be used for the schema.
119
126
 
@@ -139,3 +146,64 @@ Examples:
139
146
  ...
140
147
 
141
148
  """
149
+
150
+
151
+ class DuckDBPySparkIOManager(DuckDBIOManager):
152
+ """An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When
153
+ using the DuckDBPySparkIOManager, any inputs and outputs without type annotations will be loaded
154
+ as PySpark DataFrames.
155
+
156
+ Returns:
157
+ IOManagerDefinition
158
+
159
+ Examples:
160
+ .. code-block:: python
161
+
162
+ from dagster_duckdb_pyspark import DuckDBPySparkIOManager
163
+
164
+ @asset(
165
+ key_prefix=["my_schema"] # will be used as the schema in DuckDB
166
+ )
167
+ def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name
168
+ ...
169
+
170
+ defs = Definitions(
171
+ assets=[my_table],
172
+ resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}
173
+ )
174
+
175
+ If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
176
+ the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.
177
+ For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
178
+ via config or on the asset/op, "public" will be used for the schema.
179
+
180
+ .. code-block:: python
181
+
182
+ @op(
183
+ out={"my_table": Out(metadata={"schema": "my_schema"})}
184
+ )
185
+ def make_my_table() -> pyspark.sql.DataFrame:
186
+ # the returned value will be stored at my_schema.my_table
187
+ ...
188
+
189
+ To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
190
+ In or AssetIn.
191
+
192
+ .. code-block:: python
193
+
194
+ @asset(
195
+ ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}
196
+ )
197
+ def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
198
+ # my_table will just contain the data from column "a"
199
+ ...
200
+
201
+ """
202
+
203
+ @staticmethod
204
+ def type_handlers() -> Sequence[DbTypeHandler]:
205
+ return [DuckDBPySparkTypeHandler()]
206
+
207
+ @staticmethod
208
+ def default_load_type() -> Optional[Type]:
209
+ return pyspark.sql.DataFrame
@@ -1 +1 @@
1
- __version__ = "0.18.6"
1
+ __version__ = "0.19.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dagster-duckdb-pyspark
3
- Version: 0.18.6
3
+ Version: 0.19.0
4
4
  Summary: Package for storing PySpark DataFrames in DuckDB.
5
5
  Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckb-pyspark
6
6
  Author: Elementl
@@ -13,8 +13,8 @@ Classifier: Programming Language :: Python :: 3.10
13
13
  Classifier: License :: OSI Approved :: Apache Software License
14
14
  Classifier: Operating System :: OS Independent
15
15
  License-File: LICENSE
16
- Requires-Dist: dagster (==1.2.6)
17
- Requires-Dist: dagster-duckdb (==0.18.6)
16
+ Requires-Dist: dagster (==1.3.0)
17
+ Requires-Dist: dagster-duckdb (==0.19.0)
18
18
  Requires-Dist: pandas (<2)
19
19
  Requires-Dist: pyspark (>=2.0.2) ; python_version < "3.8"
20
20
  Requires-Dist: pyspark (>=3.0.0) ; python_version >= "3.8"
@@ -0,0 +1,9 @@
1
+ dagster_duckdb_pyspark/__init__.py,sha256=KjwD42HKQJslK2WPFg2F7mvHe1hPyrp02xSWM0Az39Y,382
2
+ dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py,sha256=kRDBdDr2xuHFJUR3UvQn1sSPMYEe7n7YHYhbwmmx5UM,7484
3
+ dagster_duckdb_pyspark/py.typed,sha256=mDShSrm8qg9qjacQc2F-rI8ATllqP6EdgHuEYxuCXZ0,7
4
+ dagster_duckdb_pyspark/version.py,sha256=IPTpw_ZRkJdPKjp9ROF6sfDyeEv2IvChuvliVauZWvE,23
5
+ dagster_duckdb_pyspark-0.19.0.dist-info/LICENSE,sha256=-gtoVIAZYUHYmNHISZg982FI4Oh19mV1nxgTVW8eCB8,11344
6
+ dagster_duckdb_pyspark-0.19.0.dist-info/METADATA,sha256=3dekfx2vpkMIlVyzlZZoFFj-CbEixlZtRES39zn2UxA,856
7
+ dagster_duckdb_pyspark-0.19.0.dist-info/WHEEL,sha256=p46_5Uhzqz6AzeSosiOnxK-zmFja1i22CrQCjmYe8ec,92
8
+ dagster_duckdb_pyspark-0.19.0.dist-info/top_level.txt,sha256=UYh0E2YiAlK01-DAkx0eikRaH-TIk0n9jijQK2joJBs,23
9
+ dagster_duckdb_pyspark-0.19.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- dagster_duckdb_pyspark/__init__.py,sha256=EsnAP_X64pR76FL4a6RicGeG6H_vl4l9k-tb441KA4M,328
2
- dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py,sha256=LwM8_6CDRw8n6XZI4VFliPxKfChwlQRawfPTtPuWasY,4970
3
- dagster_duckdb_pyspark/py.typed,sha256=mDShSrm8qg9qjacQc2F-rI8ATllqP6EdgHuEYxuCXZ0,7
4
- dagster_duckdb_pyspark/version.py,sha256=uKA6-JXiIkK71mAwJK5D762_yrliPWqnH0bkkAczVnU,23
5
- dagster_duckdb_pyspark-0.18.6.dist-info/LICENSE,sha256=-gtoVIAZYUHYmNHISZg982FI4Oh19mV1nxgTVW8eCB8,11344
6
- dagster_duckdb_pyspark-0.18.6.dist-info/METADATA,sha256=w7-WZPyK-6xIE64za0a-quraJsasl2Rssa76LSSmq9M,856
7
- dagster_duckdb_pyspark-0.18.6.dist-info/WHEEL,sha256=p46_5Uhzqz6AzeSosiOnxK-zmFja1i22CrQCjmYe8ec,92
8
- dagster_duckdb_pyspark-0.18.6.dist-info/top_level.txt,sha256=UYh0E2YiAlK01-DAkx0eikRaH-TIk0n9jijQK2joJBs,23
9
- dagster_duckdb_pyspark-0.18.6.dist-info/RECORD,,