dagster-duckdb-pyspark 0.20.15__py3-none-any.whl → 0.25.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dagster-duckdb-pyspark might be problematic. Click here for more details.

@@ -1,10 +1,10 @@
1
1
  from dagster._core.libraries import DagsterLibraryRegistry
2
2
 
3
- from .duckdb_pyspark_type_handler import (
3
+ from dagster_duckdb_pyspark.duckdb_pyspark_type_handler import (
4
4
  DuckDBPySparkIOManager as DuckDBPySparkIOManager,
5
5
  DuckDBPySparkTypeHandler as DuckDBPySparkTypeHandler,
6
6
  duckdb_pyspark_io_manager as duckdb_pyspark_io_manager,
7
7
  )
8
- from .version import __version__
8
+ from dagster_duckdb_pyspark.version import __version__
9
9
 
10
10
  DagsterLibraryRegistry.register("dagster-duckdb-pyspark", __version__)
@@ -1,15 +1,12 @@
1
- from typing import Optional, Sequence, Type
1
+ from collections.abc import Sequence
2
+ from typing import Optional
2
3
 
3
4
  import pyarrow as pa
4
5
  import pyspark
5
6
  import pyspark.sql
6
7
  from dagster import InputContext, MetadataValue, OutputContext, TableColumn, TableSchema
7
8
  from dagster._core.storage.db_io_manager import DbTypeHandler, TableSlice
8
- from dagster_duckdb.io_manager import (
9
- DuckDbClient,
10
- DuckDBIOManager,
11
- build_duckdb_io_manager,
12
- )
9
+ from dagster_duckdb.io_manager import DuckDbClient, DuckDBIOManager, build_duckdb_io_manager
13
10
  from pyspark.sql import SparkSession
14
11
  from pyspark.sql.types import StructType
15
12
 
@@ -120,17 +117,40 @@ Examples:
120
117
  def my_table() -> pyspark.sql.DataFrame: # the name of the asset will be the table name
121
118
  ...
122
119
 
123
- @repository
124
- def my_repo():
125
- return with_resources(
126
- [my_table],
127
- {"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}
120
+ defs = Definitions(
121
+ assets=[my_table],
122
+ resources={"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb"})}
123
+ )
124
+
125
+ You can set a default schema to store the assets using the ``schema`` configuration value of the DuckDB I/O
126
+ Manager. This schema will be used if no other schema is specified directly on an asset or op.
127
+
128
+ .. code-block:: python
129
+
130
+ defs = Definitions(
131
+ assets=[my_table],
132
+ resources={"io_manager": duckdb_pyspark_io_manager.configured({"database": "my_db.duckdb", "schema": "my_schema"})}
133
+ )
134
+
135
+ On individual assets, you can also specify the schema where they should be stored using metadata or
136
+ by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
137
+ take precedence.
138
+
139
+ .. code-block:: python
140
+
141
+ @asset(
142
+ key_prefix=["my_schema"] # will be used as the schema in duckdb
143
+ )
144
+ def my_table() -> pyspark.sql.DataFrame:
145
+ ...
146
+
147
+ @asset(
148
+ metadata={"schema": "my_schema"} # will be used as the schema in duckdb
128
149
  )
150
+ def my_other_table() -> pyspark.sql.DataFrame:
151
+ ...
129
152
 
130
- If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
131
- the I/O Manager. For assets, the schema will be determined from the asset key.
132
- For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
133
- via config or on the asset/op, "public" will be used for the schema.
153
+ For ops, the schema can be specified by including a "schema" entry in output metadata.
134
154
 
135
155
  .. code-block:: python
136
156
 
@@ -138,9 +158,10 @@ Examples:
138
158
  out={"my_table": Out(metadata={"schema": "my_schema"})}
139
159
  )
140
160
  def make_my_table() -> pyspark.sql.DataFrame:
141
- # the returned value will be stored at my_schema.my_table
142
161
  ...
143
162
 
163
+ If none of these is provided, the schema will default to "public".
164
+
144
165
  To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
145
166
  In or AssetIn.
146
167
 
@@ -180,10 +201,35 @@ class DuckDBPySparkIOManager(DuckDBIOManager):
180
201
  resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb")}
181
202
  )
182
203
 
183
- If you do not provide a schema, Dagster will determine a schema based on the assets and ops using
184
- the I/O Manager. For assets, the schema will be determined from the asset key, as in the above example.
185
- For ops, the schema can be specified by including a "schema" entry in output metadata. If "schema" is not provided
186
- via config or on the asset/op, "public" will be used for the schema.
204
+ You can set a default schema to store the assets using the ``schema`` configuration value of the DuckDB I/O
205
+ Manager. This schema will be used if no other schema is specified directly on an asset or op.
206
+
207
+ .. code-block:: python
208
+
209
+ defs = Definitions(
210
+ assets=[my_table],
211
+ resources={"io_manager": DuckDBPySparkIOManager(database="my_db.duckdb", schema="my_schema")}
212
+ )
213
+
214
+ On individual assets, you can also specify the schema where they should be stored using metadata or
215
+ by adding a ``key_prefix`` to the asset key. If both ``key_prefix`` and metadata are defined, the metadata will
216
+ take precedence.
217
+
218
+ .. code-block:: python
219
+
220
+ @asset(
221
+ key_prefix=["my_schema"] # will be used as the schema in duckdb
222
+ )
223
+ def my_table() -> pyspark.sql.DataFrame:
224
+ ...
225
+
226
+ @asset(
227
+ metadata={"schema": "my_schema"} # will be used as the schema in duckdb
228
+ )
229
+ def my_other_table() -> pyspark.sql.DataFrame:
230
+ ...
231
+
232
+ For ops, the schema can be specified by including a "schema" entry in output metadata.
187
233
 
188
234
  .. code-block:: python
189
235
 
@@ -191,9 +237,10 @@ class DuckDBPySparkIOManager(DuckDBIOManager):
191
237
  out={"my_table": Out(metadata={"schema": "my_schema"})}
192
238
  )
193
239
  def make_my_table() -> pyspark.sql.DataFrame:
194
- # the returned value will be stored at my_schema.my_table
195
240
  ...
196
241
 
242
+ If none of these is provided, the schema will default to "public".
243
+
197
244
  To only use specific columns of a table as input to a downstream op or asset, add the metadata "columns" to the
198
245
  In or AssetIn.
199
246
 
@@ -217,5 +264,5 @@ class DuckDBPySparkIOManager(DuckDBIOManager):
217
264
  return [DuckDBPySparkTypeHandler()]
218
265
 
219
266
  @staticmethod
220
- def default_load_type() -> Optional[Type]:
267
+ def default_load_type() -> Optional[type]:
221
268
  return pyspark.sql.DataFrame
@@ -1 +1 @@
1
- __version__ = "0.20.15"
1
+ __version__ = "0.25.9"
@@ -1,20 +1,20 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dagster-duckdb-pyspark
3
- Version: 0.20.15
3
+ Version: 0.25.9
4
4
  Summary: Package for storing PySpark DataFrames in DuckDB.
5
5
  Home-page: https://github.com/dagster-io/dagster/tree/master/python_modules/libraries/dagster-duckb-pyspark
6
6
  Author: Dagster Labs
7
7
  Author-email: hello@dagsterlabs.com
8
8
  License: Apache-2.0
9
- Classifier: Programming Language :: Python :: 3.8
10
9
  Classifier: Programming Language :: Python :: 3.9
11
10
  Classifier: Programming Language :: Python :: 3.10
12
11
  Classifier: License :: OSI Approved :: Apache Software License
13
12
  Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.9,<3.13
14
14
  License-File: LICENSE
15
- Requires-Dist: dagster (==1.4.15)
16
- Requires-Dist: dagster-duckdb (==0.20.15)
17
- Requires-Dist: pyspark (>=3)
18
- Requires-Dist: pandas (<2.1)
15
+ Requires-Dist: dagster ==1.9.9
16
+ Requires-Dist: dagster-duckdb ==0.25.9
17
+ Requires-Dist: pyspark >=3
18
+ Requires-Dist: pandas
19
19
  Requires-Dist: pyarrow
20
20
 
@@ -0,0 +1,9 @@
1
+ dagster_duckdb_pyspark/__init__.py,sha256=nNQtyXTaozhmDECy2dlb2OvT-6zz5A6l9By91wxG6y0,426
2
+ dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py,sha256=OJgWa3CMlE_LpSvqTGFzEdg_0zQnpQ_0ctrBYRNlkAU,9592
3
+ dagster_duckdb_pyspark/py.typed,sha256=mDShSrm8qg9qjacQc2F-rI8ATllqP6EdgHuEYxuCXZ0,7
4
+ dagster_duckdb_pyspark/version.py,sha256=NsKiCCQq5j7wW1paL-Bw27h63w_P0r0bIHvsX9TsjGY,23
5
+ dagster_duckdb_pyspark-0.25.9.dist-info/LICENSE,sha256=TMatHW4_G9ldRdodEAp-l2Xa2WvsdeOh60E3v1R2jis,11349
6
+ dagster_duckdb_pyspark-0.25.9.dist-info/METADATA,sha256=MytvcUUtHjknO6PoDWecMuDt60LExUrOMFqxV3e_Tzg,716
7
+ dagster_duckdb_pyspark-0.25.9.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
8
+ dagster_duckdb_pyspark-0.25.9.dist-info/top_level.txt,sha256=UYh0E2YiAlK01-DAkx0eikRaH-TIk0n9jijQK2joJBs,23
9
+ dagster_duckdb_pyspark-0.25.9.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.33.6)
2
+ Generator: bdist_wheel (0.41.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,9 +0,0 @@
1
- dagster_duckdb_pyspark/__init__.py,sha256=KjwD42HKQJslK2WPFg2F7mvHe1hPyrp02xSWM0Az39Y,382
2
- dagster_duckdb_pyspark/duckdb_pyspark_type_handler.py,sha256=t9lqCpo-ibaThEFzxjqownu_yF_tFpVvQO6_ITgPLlY,7980
3
- dagster_duckdb_pyspark/py.typed,sha256=mDShSrm8qg9qjacQc2F-rI8ATllqP6EdgHuEYxuCXZ0,7
4
- dagster_duckdb_pyspark/version.py,sha256=qx2qBaxijq_GGtI51BGXX5lmkBS7HlcGbPCmaum1Gf8,24
5
- dagster_duckdb_pyspark-0.20.15.dist-info/LICENSE,sha256=TMatHW4_G9ldRdodEAp-l2Xa2WvsdeOh60E3v1R2jis,11349
6
- dagster_duckdb_pyspark-0.20.15.dist-info/METADATA,sha256=-7wm5PZNbYKRTK9emLVdKn8zFWFFY6CzT-1ABRD6hZ0,753
7
- dagster_duckdb_pyspark-0.20.15.dist-info/WHEEL,sha256=p46_5Uhzqz6AzeSosiOnxK-zmFja1i22CrQCjmYe8ec,92
8
- dagster_duckdb_pyspark-0.20.15.dist-info/top_level.txt,sha256=UYh0E2YiAlK01-DAkx0eikRaH-TIk0n9jijQK2joJBs,23
9
- dagster_duckdb_pyspark-0.20.15.dist-info/RECORD,,