dagster-duckdb-pyspark 0.20.15__py3-none-any.whl → 0.25.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dagster-duckdb-pyspark might be problematic. Click here for more details.
- dagster_duckdb_pyspark/__init__.py +2 -2
- dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py +69 -22
- dagster_duckdb_pyspark/version.py +1 -1
- {dagster_duckdb_pyspark-0.20.15.dist-info → dagster_duckdb_pyspark-0.25.9.dist-info}/METADATA +6 -6
- dagster_duckdb_pyspark-0.25.9.dist-info/RECORD +9 -0
- {dagster_duckdb_pyspark-0.20.15.dist-info → dagster_duckdb_pyspark-0.25.9.dist-info}/WHEEL +1 -1
- dagster_duckdb_pyspark-0.20.15.dist-info/RECORD +0 -9
- {dagster_duckdb_pyspark-0.20.15.dist-info → dagster_duckdb_pyspark-0.25.9.dist-info}/LICENSE +0 -0
- {dagster_duckdb_pyspark-0.20.15.dist-info → dagster_duckdb_pyspark-0.25.9.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
from dagster._core.libraries import DagsterLibraryRegistry
|
|
2
2
|
|
|
3
|
-
from .duckdb_pyspark_type_handler import (
|
|
3
|
+
from dagster_duckdb_pyspark.duckdb_pyspark_type_handler import (
|
|
4
4
|
DuckDBPySparkIOManager as DuckDBPySparkIOManager,
|
|
5
5
|
DuckDBPySparkTypeHandler as DuckDBPySparkTypeHandler,
|
|
6
6
|
duckdb_pyspark_io_manager as duckdb_pyspark_io_manager,
|
|
7
7
|
)
|
|
8
|
-
from .version import __version__
|
|
8
|
+
from dagster_duckdb_pyspark.version import __version__
|
|
9
9
|
|
|
10
10
|
DagsterLibraryRegistry.register("dagster-duckdb-pyspark", __version__)
|
|
@@ -1,15 +1,12 @@
|
|
|
1
|
-
from typing import Optional, Sequence, Type
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from typing import Optional
|
|
2
3
|
|
|
3
4
|
import pyarrow as pa
|
|
4
5
|
import pyspark
|
|
5
6
|
import pyspark.sql
|
|
6
7
|
from dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema
|
|
7
8
|
from dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice
|
|
8
|
-
from dagster_duckdb.io_manager import (
|
|
9
|
-
DuckDbClient,
|
|
10
|
-
DuckDBIOManager,
|
|
11
|
-
build_duckdb_io_manager,
|
|
12
|
-
)
|
|
9
|
+
from dagster_duckdb.io_manager import DuckDbClient, DuckDBIOManager, build_duckdb_io_manager
|
|
13
10
|
from pyspark.sql import SparkSession
|
|
14
11
|
from pyspark.sql.types import StructType
|
|
15
12
|
|
|
@@ -120,17 +117,40 @@ Examples:
|
|
|
120
117
|
def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name
|
|
121
118
|
...
|
|
122
119
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
120
|
+
defs = Definitions(
|
|
121
|
+
assets=[my_table],
|
|
122
|
+
resources={"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
You can set a default schema to store the assets using the ``schema`` configuration value of the DuckDB I/O
|
|
126
|
+
Manager. This schema will be used if no other schema is specified directly on an asset or op.
|
|
127
|
+
|
|
128
|
+
.. code-block:: python
|
|
129
|
+
|
|
130
|
+
defs = Definitions(
|
|
131
|
+
assets=[my_table],
|
|
132
|
+
resources={"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb", "schema": "my_schema"})}
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
On individual assets, you can also specify the schema where they should be stored using metadata or
|
|
136
|
+
by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
|
|
137
|
+
take precedence.
|
|
138
|
+
|
|
139
|
+
.. code-block:: python
|
|
140
|
+
|
|
141
|
+
@asset(
|
|
142
|
+
key_prefix=["my_schema"] # will be used as the schema in duckdb
|
|
143
|
+
)
|
|
144
|
+
def my_table() -> pyspark.sql.DataFrame:
|
|
145
|
+
...
|
|
146
|
+
|
|
147
|
+
@asset(
|
|
148
|
+
metadata={"schema": "my_schema"} # will be used as the schema in duckdb
|
|
128
149
|
)
|
|
150
|
+
def my_other_table() -> pyspark.sql.DataFrame:
|
|
151
|
+
...
|
|
129
152
|
|
|
130
|
-
|
|
131
|
-
the I/O Manager. For assets, the schema will be determined from the asset key.
|
|
132
|
-
For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
|
|
133
|
-
via config or on the asset/op, "public" will be used for the schema.
|
|
153
|
+
For ops, the schema can be specified by including a "schema" entry in output metadata.
|
|
134
154
|
|
|
135
155
|
.. code-block:: python
|
|
136
156
|
|
|
@@ -138,9 +158,10 @@ Examples:
|
|
|
138
158
|
out={"my_table": Out(metadata={"schema": "my_schema"})}
|
|
139
159
|
)
|
|
140
160
|
def make_my_table() -> pyspark.sql.DataFrame:
|
|
141
|
-
# the returned value will be stored at my_schema.my_table
|
|
142
161
|
...
|
|
143
162
|
|
|
163
|
+
If none of these is provided, the schema will default to "public".
|
|
164
|
+
|
|
144
165
|
To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
|
|
145
166
|
In or AssetIn.
|
|
146
167
|
|
|
@@ -180,10 +201,35 @@ class DuckDBPySparkIOManager(DuckDBIOManager):
|
|
|
180
201
|
resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}
|
|
181
202
|
)
|
|
182
203
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
204
|
+
You can set a default schema to store the assets using the ``schema`` configuration value of the DuckDB I/O
|
|
205
|
+
Manager. This schema will be used if no other schema is specified directly on an asset or op.
|
|
206
|
+
|
|
207
|
+
.. code-block:: python
|
|
208
|
+
|
|
209
|
+
defs = Definitions(
|
|
210
|
+
assets=[my_table],
|
|
211
|
+
resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb", schema="my_schema")}
|
|
212
|
+
)
|
|
213
|
+
|
|
214
|
+
On individual assets, you can also specify the schema where they should be stored using metadata or
|
|
215
|
+
by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
|
|
216
|
+
take precedence.
|
|
217
|
+
|
|
218
|
+
.. code-block:: python
|
|
219
|
+
|
|
220
|
+
@asset(
|
|
221
|
+
key_prefix=["my_schema"] # will be used as the schema in duckdb
|
|
222
|
+
)
|
|
223
|
+
def my_table() -> pyspark.sql.DataFrame:
|
|
224
|
+
...
|
|
225
|
+
|
|
226
|
+
@asset(
|
|
227
|
+
metadata={"schema": "my_schema"} # will be used as the schema in duckdb
|
|
228
|
+
)
|
|
229
|
+
def my_other_table() -> pyspark.sql.DataFrame:
|
|
230
|
+
...
|
|
231
|
+
|
|
232
|
+
For ops, the schema can be specified by including a "schema" entry in output metadata.
|
|
187
233
|
|
|
188
234
|
.. code-block:: python
|
|
189
235
|
|
|
@@ -191,9 +237,10 @@ class DuckDBPySparkIOManager(DuckDBIOManager):
|
|
|
191
237
|
out={"my_table": Out(metadata={"schema": "my_schema"})}
|
|
192
238
|
)
|
|
193
239
|
def make_my_table() -> pyspark.sql.DataFrame:
|
|
194
|
-
# the returned value will be stored at my_schema.my_table
|
|
195
240
|
...
|
|
196
241
|
|
|
242
|
+
If none of these is provided, the schema will default to "public".
|
|
243
|
+
|
|
197
244
|
To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
|
|
198
245
|
In or AssetIn.
|
|
199
246
|
|
|
@@ -217,5 +264,5 @@ class DuckDBPySparkIOManager(DuckDBIOManager):
|
|
|
217
264
|
return [DuckDBPySparkTypeHandler()]
|
|
218
265
|
|
|
219
266
|
@staticmethod
|
|
220
|
-
def default_load_type() -> Optional[Type]:
|
|
267
|
+
def default_load_type() -> Optional[type]:
|
|
221
268
|
return pyspark.sql.DataFrame
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.20.15"
|
|
1
|
+
__version__ = "0.25.9"
|
{dagster_duckdb_pyspark-0.20.15.dist-info → dagster_duckdb_pyspark-0.25.9.dist-info}/METADATA
RENAMED
|
@@ -1,20 +1,20 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: dagster-duckdb-pyspark
|
|
3
|
-
Version: 0.20.15
|
|
3
|
+
Version: 0.25.9
|
|
4
4
|
Summary: Package for storing PySpark DataFrames in DuckDB.
|
|
5
5
|
Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckb-pyspark
|
|
6
6
|
Author: Dagster Labs
|
|
7
7
|
Author-email: hello@dagsterlabs.com
|
|
8
8
|
License: Apache-2.0
|
|
9
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
10
9
|
Classifier: Programming Language :: Python :: 3.9
|
|
11
10
|
Classifier: Programming Language :: Python :: 3.10
|
|
12
11
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
12
|
Classifier: Operating System :: OS Independent
|
|
13
|
+
Requires-Python: >=3.9,<3.13
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist: dagster
|
|
16
|
-
Requires-Dist: dagster-duckdb
|
|
17
|
-
Requires-Dist: pyspark
|
|
18
|
-
Requires-Dist: pandas
|
|
15
|
+
Requires-Dist: dagster ==1.9.9
|
|
16
|
+
Requires-Dist: dagster-duckdb ==0.25.9
|
|
17
|
+
Requires-Dist: pyspark >=3
|
|
18
|
+
Requires-Dist: pandas
|
|
19
19
|
Requires-Dist: pyarrow
|
|
20
20
|
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
dagster_duckdb_pyspark/__init__.py,sha256=nNQtyXTaozhmDECy2dlb2OvT-6zz5A6l9By91wxG6y0,426
|
|
2
|
+
dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py,sha256=OJgWa3CMlE_LpSvqTGFzEdg_0zQnpQ_0ctrBYRNlkAU,9592
|
|
3
|
+
dagster_duckdb_pyspark/py.typed,sha256=mDShSrm8qg9qjacQc2F-rI8ATllqP6EdgHuEYxuCXZ0,7
|
|
4
|
+
dagster_duckdb_pyspark/version.py,sha256=NsKiCCQq5j7wW1paL-Bw27h63w_P0r0bIHvsX9TsjGY,23
|
|
5
|
+
dagster_duckdb_pyspark-0.25.9.dist-info/LICENSE,sha256=TMatHW4_G9ldRdodEAp-l2Xa2WvsdeOh60E3v1R2jis,11349
|
|
6
|
+
dagster_duckdb_pyspark-0.25.9.dist-info/METADATA,sha256=MytvcUUtHjknO6PoDWecMuDt60LExUrOMFqxV3e_Tzg,716
|
|
7
|
+
dagster_duckdb_pyspark-0.25.9.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
|
8
|
+
dagster_duckdb_pyspark-0.25.9.dist-info/top_level.txt,sha256=UYh0E2YiAlK01-DAkx0eikRaH-TIk0n9jijQK2joJBs,23
|
|
9
|
+
dagster_duckdb_pyspark-0.25.9.dist-info/RECORD,,
|
|
@@ -1,9 +0,0 @@
|
|
|
1
|
-
dagster_duckdb_pyspark/__init__.py,sha256=KjwD42HKQJslK2WPFg2F7mvHe1hPyrp02xSWM0Az39Y,382
|
|
2
|
-
dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py,sha256=t9lqCpo-ibaThEFzxjqownu_yF_tFpVvQO6_ITgPLlY,7980
|
|
3
|
-
dagster_duckdb_pyspark/py.typed,sha256=mDShSrm8qg9qjacQc2F-rI8ATllqP6EdgHuEYxuCXZ0,7
|
|
4
|
-
dagster_duckdb_pyspark/version.py,sha256=qx2qBaxijq_GGtI51BGXX5lmkBS7HlcGbPCmaum1Gf8,24
|
|
5
|
-
dagster_duckdb_pyspark-0.20.15.dist-info/LICENSE,sha256=TMatHW4_G9ldRdodEAp-l2Xa2WvsdeOh60E3v1R2jis,11349
|
|
6
|
-
dagster_duckdb_pyspark-0.20.15.dist-info/METADATA,sha256=-7wm5PZNbYKRTK9emLVdKn8zFWFFY6CzT-1ABRD6hZ0,753
|
|
7
|
-
dagster_duckdb_pyspark-0.20.15.dist-info/WHEEL,sha256=p46_5Uhzqz6AzeSosiOnxK-zmFja1i22CrQCjmYe8ec,92
|
|
8
|
-
dagster_duckdb_pyspark-0.20.15.dist-info/top_level.txt,sha256=UYh0E2YiAlK01-DAkx0eikRaH-TIk0n9jijQK2joJBs,23
|
|
9
|
-
dagster_duckdb_pyspark-0.20.15.dist-info/RECORD,,
|
{dagster_duckdb_pyspark-0.20.15.dist-info → dagster_duckdb_pyspark-0.25.9.dist-info}/LICENSE
RENAMED
|
File without changes
|
{dagster_duckdb_pyspark-0.20.15.dist-info → dagster_duckdb_pyspark-0.25.9.dist-info}/top_level.txt
RENAMED
|
File without changes
|