dagster-duckdb-pyspark 0.18.6__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dagster-duckdb-pyspark might be problematic. Click here for more details.
- dagster_duckdb_pyspark/__init__.py +1 -0
- dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py +85 -17
- dagster_duckdb_pyspark/version.py +1 -1
- {dagster_duckdb_pyspark-0.18.6.dist-info → dagster_duckdb_pyspark-0.19.0.dist-info}/METADATA +3 -3
- dagster_duckdb_pyspark-0.19.0.dist-info/RECORD +9 -0
- dagster_duckdb_pyspark-0.18.6.dist-info/RECORD +0 -9
- {dagster_duckdb_pyspark-0.18.6.dist-info → dagster_duckdb_pyspark-0.19.0.dist-info}/LICENSE +0 -0
- {dagster_duckdb_pyspark-0.18.6.dist-info → dagster_duckdb_pyspark-0.19.0.dist-info}/WHEEL +0 -0
- {dagster_duckdb_pyspark-0.18.6.dist-info → dagster_duckdb_pyspark-0.19.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from dagster._core.libraries import DagsterLibraryRegistry
|
|
2
2
|
|
|
3
3
|
from .duckdb_pyspark_type_handler import (
|
|
4
|
+
DuckDBPySparkIOManager as DuckDBPySparkIOManager,
|
|
4
5
|
DuckDBPySparkTypeHandler as DuckDBPySparkTypeHandler,
|
|
5
6
|
duckdb_pyspark_io_manager as duckdb_pyspark_io_manager,
|
|
6
7
|
)
|
|
@@ -1,8 +1,14 @@
|
|
|
1
|
+
from typing import Optional, Sequence, Type
|
|
2
|
+
|
|
1
3
|
import pyspark
|
|
2
4
|
import pyspark.sql
|
|
3
5
|
from dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema
|
|
4
6
|
from dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice
|
|
5
|
-
from dagster_duckdb.io_manager import
|
|
7
|
+
from dagster_duckdb.io_manager import (
|
|
8
|
+
DuckDbClient,
|
|
9
|
+
DuckDBIOManager,
|
|
10
|
+
build_duckdb_io_manager,
|
|
11
|
+
)
|
|
6
12
|
from pyspark.sql import SparkSession
|
|
7
13
|
from pyspark.sql.types import StructType
|
|
8
14
|
|
|
@@ -10,28 +16,29 @@ from pyspark.sql.types import StructType
|
|
|
10
16
|
class DuckDBPySparkTypeHandler(DbTypeHandler[pyspark.sql.DataFrame]):
|
|
11
17
|
"""Stores PySpark DataFrames in DuckDB.
|
|
12
18
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
To use this type handler, pass it to ``build_duckdb_io_manager``
|
|
19
|
+
To use this type handler, return it from the ``type_handlers`` method of an I/O manager that inherits from ``DuckDBIOManager``.
|
|
16
20
|
|
|
17
21
|
Example:
|
|
18
22
|
.. code-block:: python
|
|
19
23
|
|
|
20
|
-
from dagster_duckdb import
|
|
24
|
+
from dagster_duckdb import DuckDBIOManager
|
|
21
25
|
from dagster_duckdb_pyspark import DuckDBPySparkTypeHandler
|
|
22
26
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
27
|
+
class MyDuckDBIOManager(DuckDBIOManager):
|
|
28
|
+
@staticmethod
|
|
29
|
+
def type_handlers() -> Sequence[DbTypeHandler]:
|
|
30
|
+
return [DuckDBPySparkTypeHandler()]
|
|
26
31
|
|
|
27
|
-
|
|
32
|
+
@asset(
|
|
33
|
+
key_prefix=["my_schema"] # will be used as the schema in duckdb
|
|
34
|
+
)
|
|
35
|
+
def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name
|
|
36
|
+
...
|
|
28
37
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
{"io_manager": duckdb_io_manager.configured({"database": "my_db.duckdb"})}
|
|
34
|
-
)
|
|
38
|
+
defs = Definitions(
|
|
39
|
+
assets=[my_table],
|
|
40
|
+
resources={"io_manager": MyDuckDBIOManager(database="my_db.duckdb")}
|
|
41
|
+
)
|
|
35
42
|
"""
|
|
36
43
|
|
|
37
44
|
def handle_output(
|
|
@@ -86,7 +93,7 @@ duckdb_pyspark_io_manager = build_duckdb_io_manager(
|
|
|
86
93
|
[DuckDBPySparkTypeHandler()], default_load_type=pyspark.sql.DataFrame
|
|
87
94
|
)
|
|
88
95
|
duckdb_pyspark_io_manager.__doc__ = """
|
|
89
|
-
An
|
|
96
|
+
An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When
|
|
90
97
|
using the duckdb_pyspark_io_manager, any inputs and outputs without type annotations will be loaded
|
|
91
98
|
as PySpark DataFrames.
|
|
92
99
|
|
|
@@ -113,7 +120,7 @@ Examples:
|
|
|
113
120
|
)
|
|
114
121
|
|
|
115
122
|
If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
|
|
116
|
-
the
|
|
123
|
+
the I/O Manager. For assets, the schema will be determined from the asset key.
|
|
117
124
|
For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
|
|
118
125
|
via config or on the asset/op, "public" will be used for the schema.
|
|
119
126
|
|
|
@@ -139,3 +146,64 @@ Examples:
|
|
|
139
146
|
...
|
|
140
147
|
|
|
141
148
|
"""
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class DuckDBPySparkIOManager(DuckDBIOManager):
|
|
152
|
+
"""An I/O manager definition that reads inputs from and writes PySpark DataFrames to DuckDB. When
|
|
153
|
+
using the DuckDBPySparkIOManager, any inputs and outputs without type annotations will be loaded
|
|
154
|
+
as PySpark DataFrames.
|
|
155
|
+
|
|
156
|
+
Returns:
|
|
157
|
+
IOManagerDefinition
|
|
158
|
+
|
|
159
|
+
Examples:
|
|
160
|
+
.. code-block:: python
|
|
161
|
+
|
|
162
|
+
from dagster_duckdb_pyspark import DuckDBPySparkIOManager
|
|
163
|
+
|
|
164
|
+
@asset(
|
|
165
|
+
key_prefix=["my_schema"] # will be used as the schema in DuckDB
|
|
166
|
+
)
|
|
167
|
+
def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name
|
|
168
|
+
...
|
|
169
|
+
|
|
170
|
+
defs = Definitions(
|
|
171
|
+
assets=[my_table],
|
|
172
|
+
resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
|
|
176
|
+
the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.
|
|
177
|
+
For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
|
|
178
|
+
via config or on the asset/op, "public" will be used for the schema.
|
|
179
|
+
|
|
180
|
+
.. code-block:: python
|
|
181
|
+
|
|
182
|
+
@op(
|
|
183
|
+
out={"my_table": Out(metadata={"schema": "my_schema"})}
|
|
184
|
+
)
|
|
185
|
+
def make_my_table() -> pyspark.sql.DataFrame:
|
|
186
|
+
# the returned value will be stored at my_schema.my_table
|
|
187
|
+
...
|
|
188
|
+
|
|
189
|
+
To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
|
|
190
|
+
In or AssetIn.
|
|
191
|
+
|
|
192
|
+
.. code-block:: python
|
|
193
|
+
|
|
194
|
+
@asset(
|
|
195
|
+
ins={"my_table": AssetIn("my_table", metadata={"columns": ["a"]})}
|
|
196
|
+
)
|
|
197
|
+
def my_table_a(my_table: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
|
|
198
|
+
# my_table will just contain the data from column "a"
|
|
199
|
+
...
|
|
200
|
+
|
|
201
|
+
"""
|
|
202
|
+
|
|
203
|
+
@staticmethod
|
|
204
|
+
def type_handlers() -> Sequence[DbTypeHandler]:
|
|
205
|
+
return [DuckDBPySparkTypeHandler()]
|
|
206
|
+
|
|
207
|
+
@staticmethod
|
|
208
|
+
def default_load_type() -> Optional[Type]:
|
|
209
|
+
return pyspark.sql.DataFrame
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.18.6"
|
|
1
|
+
__version__ = "0.19.0"
|
{dagster_duckdb_pyspark-0.18.6.dist-info → dagster_duckdb_pyspark-0.19.0.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: dagster-duckdb-pyspark
|
|
3
|
-
Version: 0.18.6
|
|
3
|
+
Version: 0.19.0
|
|
4
4
|
Summary: Package for storing PySpark DataFrames in DuckDB.
|
|
5
5
|
Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckdb-pyspark
|
|
6
6
|
Author: Elementl
|
|
@@ -13,8 +13,8 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
13
13
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
14
14
|
Classifier: Operating System :: OS Independent
|
|
15
15
|
License-File: LICENSE
|
|
16
|
-
Requires-Dist: dagster (==1.2.6)
|
|
17
|
-
Requires-Dist: dagster-duckdb (==0.18.6)
|
|
16
|
+
Requires-Dist: dagster (==1.3.0)
|
|
17
|
+
Requires-Dist: dagster-duckdb (==0.19.0)
|
|
18
18
|
Requires-Dist: pandas (<2)
|
|
19
19
|
Requires-Dist: pyspark (>=2.0.2) ; python_version < "3.8"
|
|
20
20
|
Requires-Dist: pyspark (>=3.0.0) ; python_version >= "3.8"
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
dagster_duckdb_pyspark/__init__.py,sha256=KjwD42HKQJslK2WPFg2F7mvHe1hPyrp02xSWM0Az39Y,382
|
|
2
|
+
dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py,sha256=kRDBdDr2xuHFJUR3UvQn1sSPMYEe7n7YHYhbwmmx5UM,7484
|
|
3
|
+
dagster_duckdb_pyspark/py.typed,sha256=mDShSrm8qg9qjacQc2F-rI8ATllqP6EdgHuEYxuCXZ0,7
|
|
4
|
+
dagster_duckdb_pyspark/version.py,sha256=IPTpw_ZRkJdPKjp9ROF6sfDyeEv2IvChuvliVauZWvE,23
|
|
5
|
+
dagster_duckdb_pyspark-0.19.0.dist-info/LICENSE,sha256=-gtoVIAZYUHYmNHISZg982FI4Oh19mV1nxgTVW8eCB8,11344
|
|
6
|
+
dagster_duckdb_pyspark-0.19.0.dist-info/METADATA,sha256=3dekfx2vpkMIlVyzlZZoFFj-CbEixlZtRES39zn2UxA,856
|
|
7
|
+
dagster_duckdb_pyspark-0.19.0.dist-info/WHEEL,sha256=p46_5Uhzqz6AzeSosiOnxK-zmFja1i22CrQCjmYe8ec,92
|
|
8
|
+
dagster_duckdb_pyspark-0.19.0.dist-info/top_level.txt,sha256=UYh0E2YiAlK01-DAkx0eikRaH-TIk0n9jijQK2joJBs,23
|
|
9
|
+
dagster_duckdb_pyspark-0.19.0.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
dagster_duckdb_pyspark/__init__.py,sha256=EsnAP_X64pR76FL4a6RicGeG6H_vl4l9k-tb441KA4M,328
|
|
2
|
-
dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py,sha256=LwM8_6CDRw8n6XZI4VFliPxKfChwlQRawfPTtPuWasY,4970
|
|
3
|
-
dagster_duckdb_pyspark/py.typed,sha256=mDShSrm8qg9qjacQc2F-rI8ATllqP6EdgHuEYxuCXZ0,7
|
|
4
|
-
dagster_duckdb_pyspark/version.py,sha256=uKA6-JXiIkK71mAwJK5D762_yrliPWqnH0bkkAczVnU,23
|
|
5
|
-
dagster_duckdb_pyspark-0.18.6.dist-info/LICENSE,sha256=-gtoVIAZYUHYmNHISZg982FI4Oh19mV1nxgTVW8eCB8,11344
|
|
6
|
-
dagster_duckdb_pyspark-0.18.6.dist-info/METADATA,sha256=w7-WZPyK-6xIE64za0a-quraJsasl2Rssa76LSSmq9M,856
|
|
7
|
-
dagster_duckdb_pyspark-0.18.6.dist-info/WHEEL,sha256=p46_5Uhzqz6AzeSosiOnxK-zmFja1i22CrQCjmYe8ec,92
|
|
8
|
-
dagster_duckdb_pyspark-0.18.6.dist-info/top_level.txt,sha256=UYh0E2YiAlK01-DAkx0eikRaH-TIk0n9jijQK2joJBs,23
|
|
9
|
-
dagster_duckdb_pyspark-0.18.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
{dagster_duckdb_pyspark-0.18.6.dist-info → dagster_duckdb_pyspark-0.19.0.dist-info}/top_level.txt
RENAMED
|
File without changes
|