dagster-pandas 0.13.12rc2__py3-none-any.whl → 0.27.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dagster_pandas/__init__.py +13 -13
- dagster_pandas/constraints.py +144 -181
- dagster_pandas/data_frame.py +82 -129
- dagster_pandas/examples/__init__.py +9 -9
- dagster_pandas/examples/pandas_hello_world/environments/pandas_hello_world_prod.yaml +1 -1
- dagster_pandas/examples/pandas_hello_world/environments/pandas_hello_world_test.yaml +1 -1
- dagster_pandas/examples/pandas_hello_world/environments/papermill_pandas_hello_world_prod.yaml +1 -1
- dagster_pandas/examples/pandas_hello_world/environments/papermill_pandas_hello_world_test.yaml +1 -1
- dagster_pandas/examples/pandas_hello_world/ops.py +7 -7
- dagster_pandas/examples/pandas_hello_world.yaml +1 -1
- dagster_pandas/py.typed +1 -0
- dagster_pandas/validation.py +27 -32
- dagster_pandas/version.py +1 -1
- dagster_pandas-0.27.15.dist-info/METADATA +36 -0
- dagster_pandas-0.27.15.dist-info/RECORD +22 -0
- {dagster_pandas-0.13.12rc2.dist-info → dagster_pandas-0.27.15.dist-info}/WHEEL +1 -1
- {dagster_pandas-0.13.12rc2.dist-info → dagster_pandas-0.27.15.dist-info/licenses}/LICENSE +1 -1
- dagster_pandas-0.27.15.dist-info/top_level.txt +1 -0
- dagster_pandas-0.13.12rc2.dist-info/METADATA +0 -24
- dagster_pandas-0.13.12rc2.dist-info/RECORD +0 -23
- dagster_pandas-0.13.12rc2.dist-info/top_level.txt +0 -2
- dagster_pandas_tests/pandas_hello_world/__init__.py +0 -0
- dagster_pandas_tests/pandas_hello_world/test_pandas_hello_world.py +0 -93
dagster_pandas/data_frame.py
CHANGED
|
@@ -1,23 +1,24 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from dagster import (
|
|
3
|
-
AssetMaterialization,
|
|
4
3
|
DagsterInvariantViolationError,
|
|
5
4
|
DagsterType,
|
|
6
|
-
EventMetadataEntry,
|
|
7
5
|
Field,
|
|
6
|
+
MetadataValue,
|
|
8
7
|
StringSource,
|
|
8
|
+
TableColumn,
|
|
9
|
+
TableSchema,
|
|
10
|
+
TableSchemaMetadataValue,
|
|
9
11
|
TypeCheck,
|
|
10
|
-
check,
|
|
12
|
+
_check as check,
|
|
11
13
|
dagster_type_loader,
|
|
12
|
-
dagster_type_materializer,
|
|
13
14
|
)
|
|
14
|
-
from dagster.
|
|
15
|
-
from dagster.
|
|
16
|
-
from dagster.
|
|
17
|
-
from dagster.
|
|
18
|
-
|
|
19
|
-
from dagster.utils.backcompat import experimental
|
|
15
|
+
from dagster._annotations import beta
|
|
16
|
+
from dagster._config import Selector
|
|
17
|
+
from dagster._core.definitions.metadata import normalize_metadata
|
|
18
|
+
from dagster._utils import dict_without_keys
|
|
19
|
+
|
|
20
20
|
from dagster_pandas.constraints import (
|
|
21
|
+
CONSTRAINT_METADATA_KEY,
|
|
21
22
|
ColumnDTypeFnConstraint,
|
|
22
23
|
ColumnDTypeInSetConstraint,
|
|
23
24
|
ConstraintViolationException,
|
|
@@ -27,38 +28,6 @@ from dagster_pandas.validation import PandasColumn, validate_constraints
|
|
|
27
28
|
CONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}
|
|
28
29
|
|
|
29
30
|
|
|
30
|
-
@dagster_type_materializer(
|
|
31
|
-
Selector(
|
|
32
|
-
{
|
|
33
|
-
"csv": {
|
|
34
|
-
"path": StringSource,
|
|
35
|
-
"sep": Field(StringSource, is_required=False, default_value=","),
|
|
36
|
-
},
|
|
37
|
-
"parquet": {"path": StringSource},
|
|
38
|
-
"table": {"path": StringSource},
|
|
39
|
-
"pickle": {"path": StringSource},
|
|
40
|
-
},
|
|
41
|
-
)
|
|
42
|
-
)
|
|
43
|
-
def dataframe_materializer(_context, config, pandas_df):
|
|
44
|
-
check.inst_param(pandas_df, "pandas_df", pd.DataFrame)
|
|
45
|
-
file_type, file_options = list(config.items())[0]
|
|
46
|
-
|
|
47
|
-
if file_type == "csv":
|
|
48
|
-
path = file_options["path"]
|
|
49
|
-
pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, "path"))
|
|
50
|
-
elif file_type == "parquet":
|
|
51
|
-
pandas_df.to_parquet(file_options["path"])
|
|
52
|
-
elif file_type == "table":
|
|
53
|
-
pandas_df.to_csv(file_options["path"], sep="\t", index=False)
|
|
54
|
-
elif file_type == "pickle":
|
|
55
|
-
pandas_df.to_pickle(file_options["path"])
|
|
56
|
-
else:
|
|
57
|
-
check.failed("Unsupported file_type {file_type}".format(file_type=file_type))
|
|
58
|
-
|
|
59
|
-
return AssetMaterialization.file(file_options["path"])
|
|
60
|
-
|
|
61
|
-
|
|
62
31
|
@dagster_type_loader(
|
|
63
32
|
Selector(
|
|
64
33
|
{
|
|
@@ -73,7 +42,7 @@ def dataframe_materializer(_context, config, pandas_df):
|
|
|
73
42
|
)
|
|
74
43
|
)
|
|
75
44
|
def dataframe_loader(_context, config):
|
|
76
|
-
file_type, file_options =
|
|
45
|
+
file_type, file_options = next(iter(config.items()))
|
|
77
46
|
|
|
78
47
|
if file_type == "csv":
|
|
79
48
|
path = file_options["path"]
|
|
@@ -85,9 +54,7 @@ def dataframe_loader(_context, config):
|
|
|
85
54
|
elif file_type == "pickle":
|
|
86
55
|
return pd.read_pickle(file_options["path"])
|
|
87
56
|
else:
|
|
88
|
-
raise DagsterInvariantViolationError(
|
|
89
|
-
"Unsupported file_type {file_type}".format(file_type=file_type)
|
|
90
|
-
)
|
|
57
|
+
raise DagsterInvariantViolationError(f"Unsupported file_type {file_type}")
|
|
91
58
|
|
|
92
59
|
|
|
93
60
|
def df_type_check(_, value):
|
|
@@ -95,11 +62,11 @@ def df_type_check(_, value):
|
|
|
95
62
|
return TypeCheck(success=False)
|
|
96
63
|
return TypeCheck(
|
|
97
64
|
success=True,
|
|
98
|
-
|
|
99
|
-
|
|
65
|
+
metadata={
|
|
66
|
+
"row_count": str(len(value)),
|
|
100
67
|
# string cast columns since they may be things like datetime
|
|
101
|
-
|
|
102
|
-
|
|
68
|
+
"metadata": {"columns": list(map(str, value.columns))},
|
|
69
|
+
},
|
|
103
70
|
)
|
|
104
71
|
|
|
105
72
|
|
|
@@ -109,16 +76,14 @@ DataFrame = DagsterType(
|
|
|
109
76
|
tabular data structure with labeled axes (rows and columns).
|
|
110
77
|
See http://pandas.pydata.org/""",
|
|
111
78
|
loader=dataframe_loader,
|
|
112
|
-
materializer=dataframe_materializer,
|
|
113
79
|
type_check_fn=df_type_check,
|
|
80
|
+
typing_type=pd.DataFrame,
|
|
114
81
|
)
|
|
115
82
|
|
|
116
83
|
|
|
117
84
|
def _construct_constraint_list(constraints):
|
|
118
85
|
def add_bullet(constraint_list, constraint_description):
|
|
119
|
-
return constraint_list + "+ {constraint_description}\n"
|
|
120
|
-
constraint_description=constraint_description
|
|
121
|
-
)
|
|
86
|
+
return constraint_list + f"+ {constraint_description}\n"
|
|
122
87
|
|
|
123
88
|
constraint_list = ""
|
|
124
89
|
for constraint in constraints:
|
|
@@ -128,17 +93,13 @@ def _construct_constraint_list(constraints):
|
|
|
128
93
|
|
|
129
94
|
|
|
130
95
|
def _build_column_header(column_name, constraints):
|
|
131
|
-
header = "**{column_name}**"
|
|
96
|
+
header = f"**{column_name}**"
|
|
132
97
|
for constraint in constraints:
|
|
133
98
|
if isinstance(constraint, ColumnDTypeInSetConstraint):
|
|
134
99
|
dtypes_tuple = tuple(constraint.expected_dtype_set)
|
|
135
|
-
return header + ": `{
|
|
136
|
-
expected_dtypes=dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]
|
|
137
|
-
)
|
|
100
|
+
return header + f": `{dtypes_tuple if len(dtypes_tuple) > 1 else dtypes_tuple[0]}`" # pyright: ignore[reportGeneralTypeIssues]
|
|
138
101
|
elif isinstance(constraint, ColumnDTypeFnConstraint):
|
|
139
|
-
return header + ": Validator `{
|
|
140
|
-
expected_dtype_fn=constraint.type_fn.__name__
|
|
141
|
-
)
|
|
102
|
+
return header + f": Validator `{constraint.type_fn.__name__}`"
|
|
142
103
|
return header
|
|
143
104
|
|
|
144
105
|
|
|
@@ -146,47 +107,64 @@ def create_dagster_pandas_dataframe_description(description, columns):
|
|
|
146
107
|
title = "\n".join([description, "### Columns", ""])
|
|
147
108
|
buildme = title
|
|
148
109
|
for column in columns:
|
|
149
|
-
buildme += "{}\n{}\n"
|
|
150
|
-
_build_column_header(column.name, column.constraints),
|
|
151
|
-
_construct_constraint_list(column.constraints),
|
|
152
|
-
)
|
|
110
|
+
buildme += f"{_build_column_header(column.name, column.constraints)}\n{_construct_constraint_list(column.constraints)}\n"
|
|
153
111
|
return buildme
|
|
154
112
|
|
|
155
113
|
|
|
114
|
+
def create_table_schema_metadata_from_dataframe(
|
|
115
|
+
pandas_df: pd.DataFrame,
|
|
116
|
+
) -> TableSchemaMetadataValue:
|
|
117
|
+
"""This function takes a pandas DataFrame and returns its metadata as a Dagster TableSchema.
|
|
118
|
+
|
|
119
|
+
Args:
|
|
120
|
+
pandas_df (pandas.DataFrame): A pandas DataFrame for which to create metadata.
|
|
121
|
+
|
|
122
|
+
Returns:
|
|
123
|
+
TableSchemaMetadataValue: returns an object with the TableSchema for the DataFrame.
|
|
124
|
+
"""
|
|
125
|
+
check.inst_param(
|
|
126
|
+
pandas_df, "pandas_df", pd.DataFrame, "Input must be a pandas DataFrame object"
|
|
127
|
+
)
|
|
128
|
+
return MetadataValue.table_schema(
|
|
129
|
+
TableSchema(
|
|
130
|
+
columns=[
|
|
131
|
+
TableColumn(name=str(name), type=str(dtype))
|
|
132
|
+
for name, dtype in pandas_df.dtypes.items()
|
|
133
|
+
]
|
|
134
|
+
)
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@beta
|
|
156
139
|
def create_dagster_pandas_dataframe_type(
|
|
157
140
|
name,
|
|
158
141
|
description=None,
|
|
159
142
|
columns=None,
|
|
160
|
-
|
|
143
|
+
metadata_fn=None,
|
|
161
144
|
dataframe_constraints=None,
|
|
162
145
|
loader=None,
|
|
163
|
-
materializer=None,
|
|
164
146
|
):
|
|
165
|
-
"""
|
|
166
|
-
Constructs a custom pandas dataframe dagster type.
|
|
147
|
+
"""Constructs a custom pandas dataframe dagster type.
|
|
167
148
|
|
|
168
149
|
Args:
|
|
169
150
|
name (str): Name of the dagster pandas type.
|
|
170
151
|
description (Optional[str]): A markdown-formatted string, displayed in tooling.
|
|
171
152
|
columns (Optional[List[PandasColumn]]): A list of :py:class:`~dagster.PandasColumn` objects
|
|
172
153
|
which express dataframe column schemas and constraints.
|
|
173
|
-
|
|
154
|
+
metadata_fn (Optional[Callable[[], Union[Dict[str, Union[str, float, int, Dict, MetadataValue]])
|
|
174
155
|
A callable which takes your dataframe and returns a dict with string label keys and
|
|
175
|
-
|
|
156
|
+
MetadataValue values.
|
|
176
157
|
dataframe_constraints (Optional[List[DataFrameConstraint]]): A list of objects that inherit from
|
|
177
158
|
:py:class:`~dagster.DataFrameConstraint`. This allows you to express dataframe-level constraints.
|
|
178
159
|
loader (Optional[DagsterTypeLoader]): An instance of a class that
|
|
179
160
|
inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default
|
|
180
161
|
to using `dataframe_loader`.
|
|
181
|
-
materializer (Optional[DagsterTypeMaterializer]): An instance of a class
|
|
182
|
-
that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will
|
|
183
|
-
default to using `dataframe_materializer`.
|
|
184
162
|
"""
|
|
185
|
-
# We allow for the plugging in of
|
|
186
|
-
#
|
|
187
|
-
#
|
|
163
|
+
# We allow for the plugging in of a dagster_type_loader so that users can load their custom
|
|
164
|
+
# dataframes via configuration their own way if the default configs don't suffice. This is
|
|
165
|
+
# purely optional.
|
|
188
166
|
check.str_param(name, "name")
|
|
189
|
-
|
|
167
|
+
metadata_fn = check.opt_callable_param(metadata_fn, "metadata_fn")
|
|
190
168
|
description = create_dagster_pandas_dataframe_description(
|
|
191
169
|
check.opt_str_param(description, "description", default=""),
|
|
192
170
|
check.opt_list_param(columns, "columns", of_type=PandasColumn),
|
|
@@ -196,35 +174,35 @@ def create_dagster_pandas_dataframe_type(
|
|
|
196
174
|
if not isinstance(value, pd.DataFrame):
|
|
197
175
|
return TypeCheck(
|
|
198
176
|
success=False,
|
|
199
|
-
description=
|
|
200
|
-
|
|
177
|
+
description=(
|
|
178
|
+
f"Must be a pandas.DataFrame. Got value of type. {type(value).__name__}"
|
|
201
179
|
),
|
|
202
180
|
)
|
|
203
181
|
|
|
204
182
|
try:
|
|
205
183
|
validate_constraints(
|
|
206
|
-
value,
|
|
184
|
+
value,
|
|
185
|
+
pandas_columns=columns,
|
|
186
|
+
dataframe_constraints=dataframe_constraints,
|
|
207
187
|
)
|
|
208
188
|
except ConstraintViolationException as e:
|
|
209
189
|
return TypeCheck(success=False, description=str(e))
|
|
210
190
|
|
|
211
191
|
return TypeCheck(
|
|
212
192
|
success=True,
|
|
213
|
-
|
|
214
|
-
if event_metadata_fn
|
|
215
|
-
else None,
|
|
193
|
+
metadata=_execute_summary_stats(name, value, metadata_fn) if metadata_fn else None, # pyright: ignore[reportArgumentType]
|
|
216
194
|
)
|
|
217
195
|
|
|
218
196
|
return DagsterType(
|
|
219
197
|
name=name,
|
|
220
198
|
type_check_fn=_dagster_type_check,
|
|
221
199
|
loader=loader if loader else dataframe_loader,
|
|
222
|
-
materializer=materializer if materializer else dataframe_materializer,
|
|
223
200
|
description=description,
|
|
201
|
+
typing_type=pd.DataFrame,
|
|
224
202
|
)
|
|
225
203
|
|
|
226
204
|
|
|
227
|
-
@
|
|
205
|
+
@beta
|
|
228
206
|
def create_structured_dataframe_type(
|
|
229
207
|
name,
|
|
230
208
|
description=None,
|
|
@@ -232,11 +210,8 @@ def create_structured_dataframe_type(
|
|
|
232
210
|
columns_aggregate_validator=None,
|
|
233
211
|
dataframe_validator=None,
|
|
234
212
|
loader=None,
|
|
235
|
-
materializer=None,
|
|
236
213
|
):
|
|
237
|
-
"""
|
|
238
|
-
|
|
239
|
-
Args:
|
|
214
|
+
"""Args:
|
|
240
215
|
name (str): the name of the new type
|
|
241
216
|
description (Optional[str]): the description of the new type
|
|
242
217
|
columns_validator (Optional[Union[ColumnConstraintWithMetadata, MultiColumnConstraintWithMetadata]]):
|
|
@@ -252,9 +227,6 @@ def create_structured_dataframe_type(
|
|
|
252
227
|
loader (Optional[DagsterTypeLoader]): An instance of a class that
|
|
253
228
|
inherits from :py:class:`~dagster.DagsterTypeLoader`. If None, we will default
|
|
254
229
|
to using `dataframe_loader`.
|
|
255
|
-
materializer (Optional[DagsterTypeMaterializer]): An instance of a class
|
|
256
|
-
that inherits from :py:class:`~dagster.DagsterTypeMaterializer`. If None, we will
|
|
257
|
-
default to using `dataframe_materializer`.
|
|
258
230
|
|
|
259
231
|
Returns:
|
|
260
232
|
a DagsterType with the corresponding name and packaged validation.
|
|
@@ -265,8 +237,8 @@ def create_structured_dataframe_type(
|
|
|
265
237
|
if not isinstance(value, pd.DataFrame):
|
|
266
238
|
return TypeCheck(
|
|
267
239
|
success=False,
|
|
268
|
-
description=
|
|
269
|
-
|
|
240
|
+
description=(
|
|
241
|
+
f"Must be a pandas.DataFrame. Got value of type. {type(value).__name__}"
|
|
270
242
|
),
|
|
271
243
|
)
|
|
272
244
|
individual_result_dict = {}
|
|
@@ -282,7 +254,7 @@ def create_structured_dataframe_type(
|
|
|
282
254
|
)
|
|
283
255
|
|
|
284
256
|
typechecks_succeeded = True
|
|
285
|
-
metadata =
|
|
257
|
+
metadata = {}
|
|
286
258
|
overall_description = "Failed Constraints: {}"
|
|
287
259
|
constraint_clauses = []
|
|
288
260
|
for key, result in individual_result_dict.items():
|
|
@@ -290,19 +262,14 @@ def create_structured_dataframe_type(
|
|
|
290
262
|
if result_val:
|
|
291
263
|
continue
|
|
292
264
|
typechecks_succeeded = typechecks_succeeded and result_val
|
|
293
|
-
result_dict = result.
|
|
294
|
-
metadata.
|
|
295
|
-
|
|
296
|
-
result_dict,
|
|
297
|
-
"{}-constraint-metadata".format(key),
|
|
298
|
-
)
|
|
299
|
-
)
|
|
300
|
-
constraint_clauses.append("{} failing constraints, {}".format(key, result.description))
|
|
265
|
+
result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data
|
|
266
|
+
metadata[f"{key}-constraint-metadata"] = MetadataValue.json(result_dict)
|
|
267
|
+
constraint_clauses.append(f"{key} failing constraints, {result.description}")
|
|
301
268
|
# returns aggregates, then column, then dataframe
|
|
302
269
|
return TypeCheck(
|
|
303
270
|
success=typechecks_succeeded,
|
|
304
271
|
description=overall_description.format(constraint_clauses),
|
|
305
|
-
|
|
272
|
+
metadata=metadata,
|
|
306
273
|
)
|
|
307
274
|
|
|
308
275
|
description = check.opt_str_param(description, "description", default="")
|
|
@@ -310,34 +277,20 @@ def create_structured_dataframe_type(
|
|
|
310
277
|
name=name,
|
|
311
278
|
type_check_fn=_dagster_type_check,
|
|
312
279
|
loader=loader if loader else dataframe_loader,
|
|
313
|
-
materializer=materializer if loader else dataframe_materializer,
|
|
314
280
|
description=description,
|
|
315
281
|
)
|
|
316
282
|
|
|
317
283
|
|
|
318
|
-
def _execute_summary_stats(type_name, value,
|
|
319
|
-
if not
|
|
284
|
+
def _execute_summary_stats(type_name, value, metadata_fn):
|
|
285
|
+
if not metadata_fn:
|
|
320
286
|
return []
|
|
321
287
|
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
invalid_message = (
|
|
325
|
-
"The return value of the user-defined summary_statistics function for pandas "
|
|
326
|
-
f"data frame type {type_name} returned {value}. This function must return "
|
|
327
|
-
"Union[Dict[str, Union[str, float, int, Dict, EventMetadata]], List[EventMetadataEntry]]"
|
|
328
|
-
)
|
|
329
|
-
|
|
330
|
-
metadata = None
|
|
331
|
-
metadata_entries = None
|
|
332
|
-
|
|
333
|
-
if isinstance(metadata_or_metadata_entries, list):
|
|
334
|
-
metadata_entries = metadata_or_metadata_entries
|
|
335
|
-
elif isinstance(metadata_or_metadata_entries, dict):
|
|
336
|
-
metadata = metadata_or_metadata_entries
|
|
337
|
-
else:
|
|
338
|
-
raise DagsterInvariantViolationError(invalid_message)
|
|
339
|
-
|
|
288
|
+
user_metadata = metadata_fn(value)
|
|
340
289
|
try:
|
|
341
|
-
return
|
|
342
|
-
except
|
|
343
|
-
raise DagsterInvariantViolationError(
|
|
290
|
+
return normalize_metadata(user_metadata)
|
|
291
|
+
except:
|
|
292
|
+
raise DagsterInvariantViolationError(
|
|
293
|
+
"The return value of the user-defined summary_statistics function for pandas "
|
|
294
|
+
f"data frame type {type_name} returned {value}. This function must return "
|
|
295
|
+
"Dict[str, RawMetadataValue]."
|
|
296
|
+
)
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
from dagster import (
|
|
2
|
-
|
|
3
|
-
InputDefinition,
|
|
4
|
-
Out,
|
|
5
|
-
OutputDefinition,
|
|
2
|
+
FilesystemIOManager,
|
|
6
3
|
config_from_files,
|
|
7
4
|
file_relative_path,
|
|
8
|
-
fs_io_manager,
|
|
9
5
|
graph,
|
|
10
6
|
in_process_executor,
|
|
11
7
|
repository,
|
|
12
8
|
)
|
|
13
9
|
|
|
14
|
-
from
|
|
15
|
-
|
|
10
|
+
from dagster_pandas.examples.pandas_hello_world.ops import (
|
|
11
|
+
always_fails_op,
|
|
12
|
+
papermill_pandas_hello_world,
|
|
13
|
+
sum_op,
|
|
14
|
+
sum_sq_op,
|
|
15
|
+
)
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
@graph
|
|
@@ -56,7 +56,7 @@ def papermill_pandas_hello_world_graph():
|
|
|
56
56
|
|
|
57
57
|
|
|
58
58
|
papermill_pandas_hello_world_test = papermill_pandas_hello_world_graph.to_job(
|
|
59
|
-
resource_defs={"io_manager":
|
|
59
|
+
resource_defs={"io_manager": FilesystemIOManager()},
|
|
60
60
|
config=config_from_files(
|
|
61
61
|
[
|
|
62
62
|
file_relative_path(
|
|
@@ -68,7 +68,7 @@ papermill_pandas_hello_world_test = papermill_pandas_hello_world_graph.to_job(
|
|
|
68
68
|
)
|
|
69
69
|
|
|
70
70
|
papermill_pandas_hello_world_prod = papermill_pandas_hello_world_graph.to_job(
|
|
71
|
-
resource_defs={"io_manager":
|
|
71
|
+
resource_defs={"io_manager": FilesystemIOManager()},
|
|
72
72
|
config=config_from_files(
|
|
73
73
|
[
|
|
74
74
|
file_relative_path(
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import dagster_pandas as dagster_pd
|
|
2
1
|
import dagstermill
|
|
3
|
-
from dagster import In,
|
|
2
|
+
from dagster import In, Out, file_relative_path, op
|
|
4
3
|
|
|
5
|
-
|
|
4
|
+
import dagster_pandas as dagster_pd
|
|
5
|
+
from dagster_pandas.data_frame import DataFrame
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
@op(
|
|
@@ -34,12 +34,12 @@ def always_fails_op(**_kwargs):
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
def nb_test_path(name):
|
|
37
|
-
return file_relative_path(__file__, "../notebooks/{name}.ipynb"
|
|
37
|
+
return file_relative_path(__file__, f"../notebooks/{name}.ipynb")
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
papermill_pandas_hello_world = dagstermill.
|
|
40
|
+
papermill_pandas_hello_world = dagstermill.factory.define_dagstermill_op(
|
|
41
41
|
name="papermill_pandas_hello_world",
|
|
42
42
|
notebook_path=nb_test_path("papermill_pandas_hello_world"),
|
|
43
|
-
|
|
44
|
-
|
|
43
|
+
ins={"df": In(DataFrame)},
|
|
44
|
+
outs={"result": Out(DataFrame)},
|
|
45
45
|
)
|
dagster_pandas/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
partial
|
dagster_pandas/validation.py
CHANGED
|
@@ -1,4 +1,17 @@
|
|
|
1
|
-
from dagster import
|
|
1
|
+
from dagster import (
|
|
2
|
+
DagsterInvariantViolationError,
|
|
3
|
+
_check as check,
|
|
4
|
+
)
|
|
5
|
+
from dagster._annotations import beta
|
|
6
|
+
from pandas import DataFrame, Timestamp
|
|
7
|
+
from pandas.core.dtypes.common import (
|
|
8
|
+
is_bool_dtype,
|
|
9
|
+
is_float_dtype,
|
|
10
|
+
is_integer_dtype,
|
|
11
|
+
is_numeric_dtype,
|
|
12
|
+
is_string_dtype,
|
|
13
|
+
)
|
|
14
|
+
|
|
2
15
|
from dagster_pandas.constraints import (
|
|
3
16
|
CategoricalColumnConstraint,
|
|
4
17
|
ColumnDTypeFnConstraint,
|
|
@@ -10,14 +23,6 @@ from dagster_pandas.constraints import (
|
|
|
10
23
|
NonNullableColumnConstraint,
|
|
11
24
|
UniqueColumnConstraint,
|
|
12
25
|
)
|
|
13
|
-
from pandas import DataFrame, Timestamp
|
|
14
|
-
from pandas.core.dtypes.common import (
|
|
15
|
-
is_bool_dtype,
|
|
16
|
-
is_float_dtype,
|
|
17
|
-
is_integer_dtype,
|
|
18
|
-
is_numeric_dtype,
|
|
19
|
-
is_string_dtype,
|
|
20
|
-
)
|
|
21
26
|
|
|
22
27
|
PANDAS_NUMERIC_TYPES = {"int64", "float"}
|
|
23
28
|
|
|
@@ -38,9 +43,9 @@ def _construct_keyword_constraints(non_nullable, unique, ignore_missing_vals):
|
|
|
38
43
|
return constraints
|
|
39
44
|
|
|
40
45
|
|
|
46
|
+
@beta
|
|
41
47
|
class PandasColumn:
|
|
42
|
-
"""
|
|
43
|
-
The main API for expressing column level schemas and constraints for your custom dataframe
|
|
48
|
+
"""The main API for expressing column level schemas and constraints for your custom dataframe
|
|
44
49
|
types.
|
|
45
50
|
|
|
46
51
|
Args:
|
|
@@ -62,18 +67,15 @@ class PandasColumn:
|
|
|
62
67
|
# Ignore validation if column is missing from dataframe and is not required
|
|
63
68
|
if self.is_required:
|
|
64
69
|
raise ConstraintViolationException(
|
|
65
|
-
"Required column {
|
|
66
|
-
column_name=self.name, dataframe_columns=dataframe.columns
|
|
67
|
-
)
|
|
70
|
+
f"Required column {self.name} not in dataframe with columns {dataframe.columns}"
|
|
68
71
|
)
|
|
69
72
|
else:
|
|
70
73
|
for constraint in self.constraints:
|
|
71
|
-
constraint.validate(dataframe, self.name)
|
|
74
|
+
constraint.validate(dataframe, self.name) # pyright: ignore[reportAttributeAccessIssue]
|
|
72
75
|
|
|
73
76
|
@staticmethod
|
|
74
77
|
def exists(name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None):
|
|
75
|
-
"""
|
|
76
|
-
Simple constructor for PandasColumns that expresses existence constraints.
|
|
78
|
+
"""Simple constructor for PandasColumns that expresses existence constraints.
|
|
77
79
|
|
|
78
80
|
Args:
|
|
79
81
|
name (str): Name of the column. This must match up with the column name in the dataframe you
|
|
@@ -98,8 +100,7 @@ class PandasColumn:
|
|
|
98
100
|
def boolean_column(
|
|
99
101
|
name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None
|
|
100
102
|
):
|
|
101
|
-
"""
|
|
102
|
-
Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.
|
|
103
|
+
"""Simple constructor for PandasColumns that expresses boolean constraints on boolean dtypes.
|
|
103
104
|
|
|
104
105
|
Args:
|
|
105
106
|
name (str): Name of the column. This must match up with the column name in the dataframe you
|
|
@@ -131,8 +132,7 @@ class PandasColumn:
|
|
|
131
132
|
ignore_missing_vals=False,
|
|
132
133
|
is_required=None,
|
|
133
134
|
):
|
|
134
|
-
"""
|
|
135
|
-
Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.
|
|
135
|
+
"""Simple constructor for PandasColumns that expresses numeric constraints numeric dtypes.
|
|
136
136
|
|
|
137
137
|
Args:
|
|
138
138
|
name (str): Name of the column. This must match up with the column name in the dataframe you
|
|
@@ -173,8 +173,7 @@ class PandasColumn:
|
|
|
173
173
|
ignore_missing_vals=False,
|
|
174
174
|
is_required=None,
|
|
175
175
|
):
|
|
176
|
-
"""
|
|
177
|
-
Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.
|
|
176
|
+
"""Simple constructor for PandasColumns that expresses numeric constraints on integer dtypes.
|
|
178
177
|
|
|
179
178
|
Args:
|
|
180
179
|
name (str): Name of the column. This must match up with the column name in the dataframe you
|
|
@@ -215,8 +214,7 @@ class PandasColumn:
|
|
|
215
214
|
ignore_missing_vals=False,
|
|
216
215
|
is_required=None,
|
|
217
216
|
):
|
|
218
|
-
"""
|
|
219
|
-
Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.
|
|
217
|
+
"""Simple constructor for PandasColumns that expresses numeric constraints on float dtypes.
|
|
220
218
|
|
|
221
219
|
Args:
|
|
222
220
|
name (str): Name of the column. This must match up with the column name in the dataframe you
|
|
@@ -258,8 +256,7 @@ class PandasColumn:
|
|
|
258
256
|
is_required=None,
|
|
259
257
|
tz=None,
|
|
260
258
|
):
|
|
261
|
-
"""
|
|
262
|
-
Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.
|
|
259
|
+
"""Simple constructor for PandasColumns that expresses datetime constraints on 'datetime64[ns]' dtypes.
|
|
263
260
|
|
|
264
261
|
Args:
|
|
265
262
|
name (str): Name of the column. This must match up with the column name in the dataframe you
|
|
@@ -311,8 +308,7 @@ class PandasColumn:
|
|
|
311
308
|
def string_column(
|
|
312
309
|
name, non_nullable=False, unique=False, ignore_missing_vals=False, is_required=None
|
|
313
310
|
):
|
|
314
|
-
"""
|
|
315
|
-
Simple constructor for PandasColumns that expresses constraints on string dtypes.
|
|
311
|
+
"""Simple constructor for PandasColumns that expresses constraints on string dtypes.
|
|
316
312
|
|
|
317
313
|
Args:
|
|
318
314
|
name (str): Name of the column. This must match up with the column name in the dataframe you
|
|
@@ -338,14 +334,13 @@ class PandasColumn:
|
|
|
338
334
|
def categorical_column(
|
|
339
335
|
name,
|
|
340
336
|
categories,
|
|
341
|
-
of_types="object",
|
|
337
|
+
of_types=frozenset({"category", "object"}),
|
|
342
338
|
non_nullable=False,
|
|
343
339
|
unique=False,
|
|
344
340
|
ignore_missing_vals=False,
|
|
345
341
|
is_required=None,
|
|
346
342
|
):
|
|
347
|
-
"""
|
|
348
|
-
Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.
|
|
343
|
+
"""Simple constructor for PandasColumns that expresses categorical constraints on specified dtypes.
|
|
349
344
|
|
|
350
345
|
Args:
|
|
351
346
|
name (str): Name of the column. This must match up with the column name in the dataframe you
|
dagster_pandas/version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.
|
|
1
|
+
__version__ = "0.27.15"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dagster-pandas
|
|
3
|
+
Version: 0.27.15
|
|
4
|
+
Summary: Utilities and examples for working with pandas and dagster, an opinionated framework for expressing data pipelines
|
|
5
|
+
Home-page: https://github.com/dagster-io/dagster
|
|
6
|
+
Author: Dagster Labs
|
|
7
|
+
Author-email: hello@dagsterlabs.com
|
|
8
|
+
License: Apache-2.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Requires-Python: >=3.9,<3.14
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: dagster==1.11.15
|
|
20
|
+
Requires-Dist: pandas
|
|
21
|
+
Dynamic: author
|
|
22
|
+
Dynamic: author-email
|
|
23
|
+
Dynamic: classifier
|
|
24
|
+
Dynamic: description
|
|
25
|
+
Dynamic: description-content-type
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: license
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
Dynamic: requires-dist
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
Dynamic: summary
|
|
32
|
+
|
|
33
|
+
# dagster-pandas
|
|
34
|
+
|
|
35
|
+
The docs for `dagster-pandas` can be found
|
|
36
|
+
[here](https://docs.dagster.io/api/python-api/libraries/dagster-pandas).
|