PyPI - dagster-duckdb - Versions diffs - 0.20.3__tar.gz → 0.28.6__tar.gz - Mend

dagster-duckdb 0.20.3tar.gz → 0.28.6tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/LICENSE RENAMED Viewed

@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
-   Copyright 2023 Elementl, Inc.
+   Copyright 2025 Dagster Labs, Inc.
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

dagster_duckdb-0.28.6/PKG-INFO ADDED Viewed

@@ -0,0 +1,29 @@
+Metadata-Version: 2.4
+Name: dagster-duckdb
+Version: 0.28.6
+Summary: Package for DuckDB-specific Dagster framework op and resource components.
+Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckb
+Author: Dagster Labs
+Author-email: hello@dagsterlabs.com
+License: Apache-2.0
+Classifier: Programming Language :: Python :: 3.10
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.10,<3.14
+License-File: LICENSE
+Requires-Dist: duckdb
+Requires-Dist: dagster==1.12.6
+Provides-Extra: pandas
+Requires-Dist: pandas; extra == "pandas"
+Provides-Extra: pyspark
+Requires-Dist: pyspark<4,>=3; extra == "pyspark"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

dagster_duckdb-0.28.6/README.md ADDED Viewed

@@ -0,0 +1,4 @@
+# dagster-duckdb
+The docs for `dagster-duckdb` can be found
+[here](https://docs.dagster.io/api/python-api/libraries/dagster-duckdb).

dagster_duckdb-0.28.6/dagster_duckdb/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+from dagster_shared.libraries import DagsterLibraryRegistry
+from dagster_duckdb.io_manager import (
+    DuckDBIOManager as DuckDBIOManager,
+    build_duckdb_io_manager as build_duckdb_io_manager,
+)
+from dagster_duckdb.resource import DuckDBResource as DuckDBResource
+from dagster_duckdb.version import __version__
+DagsterLibraryRegistry.register("dagster-duckdb", __version__)

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/dagster_duckdb/io_manager.py RENAMED Viewed

@@ -1,13 +1,12 @@
 from abc import abstractmethod
+from collections.abc import Sequence
 from contextlib import contextmanager
-from typing import Optional, Sequence, Type, cast
+from typing import Any, Optional, cast
 import duckdb
 from dagster import IOManagerDefinition, OutputContext, io_manager
-from dagster._config.pythonic_config import (
-    ConfigurableIOManagerFactory,
-)
-from dagster._core.definitions.time_window_partitions import TimeWindow
+from dagster._config.pythonic_config import ConfigurableIOManagerFactory
+from dagster._core.definitions.partitions.utils import TimeWindow
 from dagster._core.storage.db_io_manager import (
     DbClient,
     DbIOManager,
@@ -17,20 +16,21 @@ from dagster._core.storage.db_io_manager import (
 )
 from dagster._core.storage.io_manager import dagster_maintained_io_manager
 from dagster._utils.backoff import backoff
+from packaging.version import Version
 from pydantic import Field
 DUCKDB_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
 def build_duckdb_io_manager(
-    type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[Type] = None
+    type_handlers: Sequence[DbTypeHandler], default_load_type: Optional[type] = None
 ) -> IOManagerDefinition:
     """Builds an IO manager definition that reads inputs from and writes outputs to DuckDB.
     Args:
         type_handlers (Sequence[DbTypeHandler]): Each handler defines how to translate between
             DuckDB tables and an in-memory type - e.g. a Pandas DataFrame. If only
-            one DbTypeHandler is provided, it will be used as teh default_load_type.
+            one DbTypeHandler is provided, it will be used as the default_load_type.
         default_load_type (Type): When an input has no type annotation, load it as this type.
     Returns:
@@ -50,17 +50,43 @@ def build_duckdb_io_manager(
             duckdb_io_manager = build_duckdb_io_manager([DuckDBPandasTypeHandler()])
-            @repository
-            def my_repo():
-                return with_resources(
-                    [my_table],
-                    {"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}
-                )
+            Definitions(
+                assets=[my_table],
+                resources={"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}
+            )
+    You can set a default schema to store the assets using the ``schema`` configuration value of the DuckDB I/O
+    Manager. This schema will be used if no other schema is specified directly on an asset or op.
+    .. code-block:: python
+        Definitions(
+            assets=[my_table],
+            resources={"io_manager": duckdb_io_manager.configured(
+                {"database": "my_db.duckdb", "schema": "my_schema"} # will be used as the schema
+            )}
+        )
-    If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
-    the IO Manager. For assets, the schema will be determined from the asset key. For ops, the schema can be
-    specified by including a "schema" entry in output metadata. If none of these is provided, the schema will
-    default to "public".
+    On individual assets, you can also specify the schema where they should be stored using metadata or
+    by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
+    take precedence.
+    .. code-block:: python
+        @asset(
+            key_prefix=["my_schema"]  # will be used as the schema in duckdb
+        )
+        def my_table() -> pd.DataFrame:
+            ...
+        @asset(
+            metadata={"schema": "my_schema"}  # will be used as the schema in duckdb
+        )
+        def my_other_table() -> pd.DataFrame:
+            ...
+    For ops, the schema can be specified by including a "schema" entry in output metadata.
     .. code-block:: python
@@ -70,6 +96,8 @@ def build_duckdb_io_manager(
         def make_my_table() -> pd.DataFrame:
             ...
+    If none of these is provided, the schema will default to "public".
     To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
     In or AssetIn.
@@ -126,15 +154,40 @@ class DuckDBIOManager(ConfigurableIOManagerFactory):
             def my_table() -> pd.DataFrame:  # the name of the asset will be the table name
                 ...
-            defs = Definitions(
+            Definitions(
                 assets=[my_table],
                 resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}
             )
-    If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
-    the IO Manager. For assets, the schema will be determined from the asset key, as in the above example.
-    For ops, the schema can be specified by including a "schema" entry in output metadata. If none
-    of these is provided, the schema will default to "public".
+    You can set a default schema to store the assets using the ``schema`` configuration value of the DuckDB I/O
+    Manager. This schema will be used if no other schema is specified directly on an asset or op.
+    .. code-block:: python
+        Definitions(
+            assets=[my_table],
+            resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb", schema="my_schema")}
+        )
+    On individual assets, you can also specify the schema where they should be stored using metadata or
+    by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
+    take precedence.
+    .. code-block:: python
+        @asset(
+            key_prefix=["my_schema"]  # will be used as the schema in duckdb
+        )
+        def my_table() -> pd.DataFrame:
+            ...
+        @asset(
+            metadata={"schema": "my_schema"}  # will be used as the schema in duckdb
+        )
+        def my_other_table() -> pd.DataFrame:
+            ...
+    For ops, the schema can be specified by including a "schema" entry in output metadata.
     .. code-block:: python
@@ -144,6 +197,8 @@ class DuckDBIOManager(ConfigurableIOManagerFactory):
         def make_my_table() -> pd.DataFrame:
             ...
+    If none of these is provided, the schema will default to "public".
     To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
     In or AssetIn.
@@ -156,20 +211,37 @@ class DuckDBIOManager(ConfigurableIOManagerFactory):
             # my_table will just contain the data from column "a"
             ...
+    Set DuckDB configuration options using the connection_config field. See
+    https://duckdb.org/docs/sql/configuration.html for all available settings.
+    .. code-block:: python
+        Definitions(
+            assets=[my_table],
+            resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb",
+                                                       connection_config={"arrow_large_buffer_size": True})}
+        )
     """
     database: str = Field(description="Path to the DuckDB database.")
+    connection_config: dict[str, Any] = Field(
+        description=(
+            "DuckDB connection configuration options. See"
+            " https://duckdb.org/docs/sql/configuration.html"
+        ),
+        default={},
+    )
     schema_: Optional[str] = Field(
         default=None, alias="schema", description="Name of the schema to use."
     )  # schema is a reserved word for pydantic
     @staticmethod
     @abstractmethod
-    def type_handlers() -> Sequence[DbTypeHandler]:
-        ...
+    def type_handlers() -> Sequence[DbTypeHandler]: ...
     @staticmethod
-    def default_load_type() -> Optional[Type]:
+    def default_load_type() -> Optional[type]:
         return None
     def create_io_manager(self, context) -> DbIOManager:
@@ -200,7 +272,7 @@ class DuckDbClient(DbClient):
     def get_select_statement(table_slice: TableSlice) -> str:
         col_str = ", ".join(table_slice.columns) if table_slice.columns else "*"
-        if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:
+        if table_slice.partition_dimensions:
             query = f"SELECT {col_str} FROM {table_slice.schema}.{table_slice.table} WHERE\n"
             return query + _partition_where_clause(table_slice.partition_dimensions)
         else:
@@ -208,11 +280,25 @@ class DuckDbClient(DbClient):
     @staticmethod
     @contextmanager
-    def connect(context, _):
+    def connect(context, _):  # pyright: ignore[reportIncompatibleMethodOverride]
+        config = context.resource_config["connection_config"]
+        # support for `custom_user_agent` was added in v1.0.0
+        # https://github.com/duckdb/duckdb/commit/0c66b6007b736ed2197bca54d20c9ad9a5eeef46
+        if Version(duckdb.__version__) >= Version("1.0.0"):
+            config = {
+                "custom_user_agent": "dagster",
+                **config,
+            }
         conn = backoff(
             fn=duckdb.connect,
             retry_on=(RuntimeError, duckdb.IOException),
-            kwargs={"database": context.resource_config["database"], "read_only": False},
+            kwargs={
+                "database": context.resource_config["database"],
+                "read_only": False,
+                "config": config,
+            },
             max_retries=10,
         )
@@ -225,7 +311,7 @@ def _get_cleanup_statement(table_slice: TableSlice) -> str:
     """Returns a SQL statement that deletes data in the given table to make way for the output data
     being written.
     """
-    if table_slice.partition_dimensions and len(table_slice.partition_dimensions) > 0:
+    if table_slice.partition_dimensions:
         query = f"DELETE FROM {table_slice.schema}.{table_slice.table} WHERE\n"
         return query + _partition_where_clause(table_slice.partition_dimensions)
     else:
@@ -234,15 +320,17 @@ def _get_cleanup_statement(table_slice: TableSlice) -> str:
 def _partition_where_clause(partition_dimensions: Sequence[TablePartitionDimension]) -> str:
     return " AND\n".join(
-        _time_window_where_clause(partition_dimension)
-        if isinstance(partition_dimension.partitions, TimeWindow)
-        else _static_where_clause(partition_dimension)
+        (
+            _time_window_where_clause(partition_dimension)
+            if isinstance(partition_dimension.partitions, TimeWindow)
+            else _static_where_clause(partition_dimension)
+        )
         for partition_dimension in partition_dimensions
     )
 def _time_window_where_clause(table_partition: TablePartitionDimension) -> str:
-    partition = cast(TimeWindow, table_partition.partitions)
+    partition = cast("TimeWindow", table_partition.partitions)
     start_dt, end_dt = partition
     start_dt_str = start_dt.strftime(DUCKDB_DATETIME_FORMAT)
     end_dt_str = end_dt.strftime(DUCKDB_DATETIME_FORMAT)

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/dagster_duckdb/resource.py RENAMED Viewed

@@ -1,8 +1,10 @@
 from contextlib import contextmanager
+from typing import Any
 import duckdb
 from dagster import ConfigurableResource
 from dagster._utils.backoff import backoff
+from packaging.version import Version
 from pydantic import Field
@@ -20,7 +22,7 @@ class DuckDBResource(ConfigurableResource):
                 with duckdb.get_connection() as conn:
                     conn.execute("SELECT * from MY_SCHEMA.MY_TABLE")
-            defs = Definitions(
+            Definitions(
                 assets=[my_table],
                 resources={"duckdb": DuckDBResource(database="path/to/db.duckdb")}
             )
@@ -33,6 +35,13 @@ class DuckDBResource(ConfigurableResource):
             " database "
         )
     )
+    connection_config: dict[str, Any] = Field(
+        description=(
+            "DuckDB connection configuration options. See"
+            " https://duckdb.org/docs/sql/configuration.html"
+        ),
+        default={},
+    )
     @classmethod
     def _is_dagster_maintained(cls) -> bool:
@@ -40,10 +49,24 @@ class DuckDBResource(ConfigurableResource):
     @contextmanager
     def get_connection(self):
+        config = self.connection_config
+        # support for `custom_user_agent` was added in v1.0.0
+        # https://github.com/duckdb/duckdb/commit/0c66b6007b736ed2197bca54d20c9ad9a5eeef46
+        if Version(duckdb.__version__) >= Version("1.0.0"):
+            config = {
+                "custom_user_agent": "dagster",
+                **config,
+            }
         conn = backoff(
             fn=duckdb.connect,
             retry_on=(RuntimeError, duckdb.IOException),
-            kwargs={"database": self.database, "read_only": False},
+            kwargs={
+                "database": self.database,
+                "read_only": False,
+                "config": config,
+            },
             max_retries=10,
         )

dagster_duckdb-0.28.6/dagster_duckdb/version.py ADDED Viewed

@@ -0,0 +1 @@
+__version__ = "0.28.6"

dagster_duckdb-0.28.6/dagster_duckdb.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,29 @@
+Metadata-Version: 2.4
+Name: dagster-duckdb
+Version: 0.28.6
+Summary: Package for DuckDB-specific Dagster framework op and resource components.
+Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckb
+Author: Dagster Labs
+Author-email: hello@dagsterlabs.com
+License: Apache-2.0
+Classifier: Programming Language :: Python :: 3.10
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.10,<3.14
+License-File: LICENSE
+Requires-Dist: duckdb
+Requires-Dist: dagster==1.12.6
+Provides-Extra: pandas
+Requires-Dist: pandas; extra == "pandas"
+Provides-Extra: pyspark
+Requires-Dist: pyspark<4,>=3; extra == "pyspark"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

dagster_duckdb-0.28.6/dagster_duckdb.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,8 @@
+duckdb
+dagster==1.12.6
+[pandas]
+pandas
+[pyspark]
+pyspark<4,>=3

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/setup.py RENAMED Viewed

@@ -1,11 +1,10 @@
 from pathlib import Path
-from typing import Dict
 from setuptools import find_packages, setup
 def get_version() -> str:
-    version: Dict[str, str] = {}
+    version: dict[str, str] = {}
     with open(Path(__file__).parent / "dagster_duckdb/version.py", encoding="utf8") as fp:
         exec(fp.read(), version)
@@ -18,31 +17,28 @@ pin = "" if ver == "1!0+dev" else f"=={ver}"
 setup(
     name="dagster-duckdb",
     version=ver,
-    author="Elementl",
-    author_email="hello@elementl.com",
+    author="Dagster Labs",
+    author_email="hello@dagsterlabs.com",
     license="Apache-2.0",
     description="Package for DuckDB-specific Dagster framework op and resource components.",
     url="https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckb",
     classifiers=[
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: OS Independent",
     ],
     packages=find_packages(exclude=["dagster_duckdb_tests*"]),
     include_package_data=True,
+    python_requires=">=3.10,<3.14",
     install_requires=[
         "duckdb",
-        "dagster==1.4.3",
+        "dagster==1.12.6",
     ],
     extras_require={
-        "pandas": ["pandas"],
-        # Pyspark 2.x is incompatible with Python 3.8+
-        "pyspark": [
-            'pyspark>=3.0.0; python_version >= "3.8"',
-            'pyspark>=2.0.2; python_version < "3.8"',
+        "pandas": [
+            "pandas",
         ],
+        "pyspark": ["pyspark>=3,<4"],
     },
     zip_safe=False,
 )

dagster-duckdb-0.20.3/PKG-INFO DELETED Viewed

@@ -1,16 +0,0 @@
-Metadata-Version: 2.1
-Name: dagster-duckdb
-Version: 0.20.3
-Summary: Package for DuckDB-specific Dagster framework op and resource components.
-Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckb
-Author: Elementl
-Author-email: hello@elementl.com
-License: Apache-2.0
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
-Provides-Extra: pandas
-Provides-Extra: pyspark
-License-File: LICENSE

dagster-duckdb-0.20.3/README.md DELETED Viewed

@@ -1,4 +0,0 @@
-# dagster-duckdb
-The docs for `dagster-duckdb` can be found
-[here](https://docs.dagster.io/_apidocs/libraries/dagster-duckdb).

dagster-duckdb-0.20.3/dagster_duckdb/__init__.py DELETED Viewed

@@ -1,10 +0,0 @@
-from dagster._core.libraries import DagsterLibraryRegistry
-from .io_manager import (
-    DuckDBIOManager as DuckDBIOManager,
-    build_duckdb_io_manager as build_duckdb_io_manager,
-)
-from .resource import DuckDBResource as DuckDBResource
-from .version import __version__
-DagsterLibraryRegistry.register("dagster-duckdb", __version__)

dagster-duckdb-0.20.3/dagster_duckdb/version.py DELETED Viewed

@@ -1 +0,0 @@
-__version__ = "0.20.3"

dagster-duckdb-0.20.3/dagster_duckdb.egg-info/PKG-INFO DELETED Viewed

@@ -1,16 +0,0 @@
-Metadata-Version: 2.1
-Name: dagster-duckdb
-Version: 0.20.3
-Summary: Package for DuckDB-specific Dagster framework op and resource components.
-Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckb
-Author: Elementl
-Author-email: hello@elementl.com
-License: Apache-2.0
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Operating System :: OS Independent
-Provides-Extra: pandas
-Provides-Extra: pyspark
-License-File: LICENSE

dagster-duckdb-0.20.3/dagster_duckdb.egg-info/requires.txt DELETED Viewed

@@ -1,13 +0,0 @@
-duckdb
-dagster==1.4.3
-[pandas]
-pandas
-[pyspark]
-[pyspark:python_version < "3.8"]
-pyspark>=2.0.2
-[pyspark:python_version >= "3.8"]
-pyspark>=3.0.0

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/MANIFEST.in RENAMED Viewed

File without changes

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/dagster_duckdb/py.typed RENAMED Viewed

File without changes

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/dagster_duckdb.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/dagster_duckdb.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/dagster_duckdb.egg-info/not-zip-safe RENAMED Viewed

File without changes

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/dagster_duckdb.egg-info/top_level.txt RENAMED Viewed

File without changes

{dagster-duckdb-0.20.3 → dagster_duckdb-0.28.6}/setup.cfg RENAMED Viewed

File without changes

dagster-duckdb 0.20.3__tar.gz → 0.28.6__tar.gz

dagster-duckdb 0.20.3tar.gz → 0.28.6tar.gz