dagster-pandas 0.13.18rc0__py3-none-any.whl → 0.27.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dagster_pandas/__init__.py +13 -13
- dagster_pandas/constraints.py +144 -181
- dagster_pandas/data_frame.py +82 -129
- dagster_pandas/examples/__init__.py +9 -9
- dagster_pandas/examples/pandas_hello_world/environments/pandas_hello_world_prod.yaml +1 -1
- dagster_pandas/examples/pandas_hello_world/environments/pandas_hello_world_test.yaml +1 -1
- dagster_pandas/examples/pandas_hello_world/environments/papermill_pandas_hello_world_prod.yaml +1 -1
- dagster_pandas/examples/pandas_hello_world/environments/papermill_pandas_hello_world_test.yaml +1 -1
- dagster_pandas/examples/pandas_hello_world/ops.py +7 -7
- dagster_pandas/examples/pandas_hello_world.yaml +1 -1
- dagster_pandas/py.typed +1 -0
- dagster_pandas/validation.py +26 -31
- dagster_pandas/version.py +1 -1
- dagster_pandas-0.27.15.dist-info/METADATA +36 -0
- dagster_pandas-0.27.15.dist-info/RECORD +22 -0
- {dagster_pandas-0.13.18rc0.dist-info → dagster_pandas-0.27.15.dist-info}/WHEEL +1 -1
- {dagster_pandas-0.13.18rc0.dist-info → dagster_pandas-0.27.15.dist-info/licenses}/LICENSE +1 -1
- dagster_pandas-0.27.15.dist-info/top_level.txt +1 -0
- dagster_pandas-0.13.18rc0.dist-info/METADATA +0 -24
- dagster_pandas-0.13.18rc0.dist-info/RECORD +0 -23
- dagster_pandas-0.13.18rc0.dist-info/top_level.txt +0 -2
- dagster_pandas_tests/pandas_hello_world/__init__.py +0 -0
- dagster_pandas_tests/pandas_hello_world/test_pandas_hello_world.py +0 -93
dagster_pandas/constraints.py
CHANGED
|
@@ -2,20 +2,27 @@ import sys
|
|
|
2
2
|
from collections import defaultdict
|
|
3
3
|
from datetime import datetime
|
|
4
4
|
from functools import wraps
|
|
5
|
+
from typing import Final
|
|
5
6
|
|
|
6
7
|
import pandas as pd
|
|
7
|
-
from dagster import
|
|
8
|
-
|
|
8
|
+
from dagster import (
|
|
9
|
+
DagsterType,
|
|
10
|
+
TypeCheck,
|
|
11
|
+
_check as check,
|
|
12
|
+
)
|
|
13
|
+
from dagster._annotations import beta
|
|
9
14
|
from pandas import DataFrame
|
|
10
15
|
|
|
16
|
+
CONSTRAINT_METADATA_KEY: Final = "constraint_metadata"
|
|
17
|
+
|
|
11
18
|
|
|
12
19
|
class ConstraintViolationException(Exception):
|
|
13
20
|
"""Indicates that a constraint has been violated."""
|
|
14
21
|
|
|
15
22
|
|
|
23
|
+
@beta
|
|
16
24
|
class ConstraintWithMetadataException(Exception):
|
|
17
|
-
"""
|
|
18
|
-
This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a
|
|
25
|
+
"""This class defines the response generated when a pandas DF fails validation -- it can be used to generate either a
|
|
19
26
|
failed typecheck or an exception.
|
|
20
27
|
|
|
21
28
|
Args:
|
|
@@ -39,31 +46,30 @@ class ConstraintWithMetadataException(Exception):
|
|
|
39
46
|
self.expectation = check.opt_inst_param(expectation, "expectation", (dict, list, str, set))
|
|
40
47
|
self.offending = check.opt_inst_param(offending, "offending", (dict, list, str, set))
|
|
41
48
|
self.actual = check.opt_inst_param(actual, "actual", (dict, list, str, set))
|
|
42
|
-
super(
|
|
43
|
-
"Violated {} - {}, {} was/were expected, but we received {} which was/were {}"
|
|
44
|
-
constraint_name,
|
|
45
|
-
constraint_description,
|
|
46
|
-
expectation,
|
|
47
|
-
offending,
|
|
48
|
-
actual,
|
|
49
|
-
)
|
|
49
|
+
super().__init__(
|
|
50
|
+
f"Violated {constraint_name} - {constraint_description}, {expectation} was/were expected, but we received {offending} which was/were {actual}"
|
|
50
51
|
)
|
|
51
52
|
|
|
53
|
+
def normalize_metadata_json_value(self, val):
|
|
54
|
+
if isinstance(val, set):
|
|
55
|
+
return list(val)
|
|
56
|
+
else:
|
|
57
|
+
return val
|
|
58
|
+
|
|
52
59
|
def convert_to_metadata(self):
|
|
53
|
-
return
|
|
54
|
-
{
|
|
60
|
+
return {
|
|
61
|
+
CONSTRAINT_METADATA_KEY: {
|
|
55
62
|
"constraint_name": self.constraint_name,
|
|
56
63
|
"constraint_description": self.constraint_description,
|
|
57
|
-
"expected": self.expectation,
|
|
58
|
-
"offending": self.offending,
|
|
59
|
-
"actual": self.actual,
|
|
64
|
+
"expected": self.normalize_metadata_json_value(self.expectation),
|
|
65
|
+
"offending": self.normalize_metadata_json_value(self.offending),
|
|
66
|
+
"actual": self.normalize_metadata_json_value(self.actual),
|
|
60
67
|
},
|
|
61
|
-
|
|
62
|
-
)
|
|
68
|
+
}
|
|
63
69
|
|
|
64
70
|
def return_as_typecheck(self):
|
|
65
71
|
return TypeCheck(
|
|
66
|
-
success=False, description=self.args[0],
|
|
72
|
+
success=False, description=self.args[0], metadata=self.convert_to_metadata()
|
|
67
73
|
)
|
|
68
74
|
|
|
69
75
|
|
|
@@ -71,16 +77,12 @@ class DataFrameConstraintViolationException(ConstraintViolationException):
|
|
|
71
77
|
"""Indicates a dataframe level constraint has been violated."""
|
|
72
78
|
|
|
73
79
|
def __init__(self, constraint_name, constraint_description):
|
|
74
|
-
super(
|
|
75
|
-
"Violated {constraint_name} - {constraint_description}".format(
|
|
76
|
-
constraint_name=constraint_name, constraint_description=constraint_description
|
|
77
|
-
)
|
|
78
|
-
)
|
|
80
|
+
super().__init__(f"Violated {constraint_name} - {constraint_description}")
|
|
79
81
|
|
|
80
82
|
|
|
81
83
|
class DataFrameWithMetadataException(ConstraintWithMetadataException):
|
|
82
84
|
def __init__(self, constraint_name, constraint_description, expectation, actual):
|
|
83
|
-
super(
|
|
85
|
+
super().__init__(
|
|
84
86
|
constraint_name, constraint_description, expectation, "a malformed dataframe", actual
|
|
85
87
|
)
|
|
86
88
|
|
|
@@ -93,24 +95,21 @@ class ColumnConstraintViolationException(ConstraintViolationException):
|
|
|
93
95
|
self.constraint_description = constraint_description
|
|
94
96
|
self.column_name = column_name
|
|
95
97
|
self.offending_rows = offending_rows
|
|
96
|
-
super(
|
|
98
|
+
super().__init__(self.construct_message())
|
|
97
99
|
|
|
98
100
|
def construct_message(self):
|
|
99
|
-
base_message = 'Violated "{constraint_name}" for column "{column_name}" - {constraint_description}'
|
|
100
|
-
constraint_name=self.constraint_name,
|
|
101
|
-
constraint_description=self.constraint_description,
|
|
102
|
-
column_name=self.column_name,
|
|
103
|
-
)
|
|
101
|
+
base_message = f'Violated "{self.constraint_name}" for column "{self.column_name}" - {self.constraint_description}'
|
|
104
102
|
if self.offending_rows is not None:
|
|
105
|
-
base_message +=
|
|
106
|
-
self.offending_rows
|
|
103
|
+
base_message += (
|
|
104
|
+
f"The offending (index, row values) are the following: {self.offending_rows}"
|
|
107
105
|
)
|
|
108
106
|
return base_message
|
|
109
107
|
|
|
110
108
|
|
|
109
|
+
@beta
|
|
111
110
|
class ColumnWithMetadataException(ConstraintWithMetadataException):
|
|
112
111
|
def __init__(self, constraint_name, constraint_description, expectation, offending, actual):
|
|
113
|
-
super(
|
|
112
|
+
super().__init__(
|
|
114
113
|
"the column constraint " + constraint_name,
|
|
115
114
|
constraint_description,
|
|
116
115
|
expectation,
|
|
@@ -120,12 +119,11 @@ class ColumnWithMetadataException(ConstraintWithMetadataException):
|
|
|
120
119
|
|
|
121
120
|
|
|
122
121
|
class Constraint:
|
|
123
|
-
"""
|
|
124
|
-
Base constraint object that all constraints inherit from.
|
|
122
|
+
"""Base constraint object that all constraints inherit from.
|
|
125
123
|
|
|
126
124
|
Args:
|
|
127
125
|
error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.
|
|
128
|
-
markdown_description (Optional[str]): A markdown supported description that is
|
|
126
|
+
markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.
|
|
129
127
|
"""
|
|
130
128
|
|
|
131
129
|
def __init__(self, error_description=None, markdown_description=None):
|
|
@@ -134,11 +132,11 @@ class Constraint:
|
|
|
134
132
|
self.error_description = check.str_param(error_description, "error_description")
|
|
135
133
|
|
|
136
134
|
|
|
135
|
+
@beta
|
|
137
136
|
class ConstraintWithMetadata:
|
|
138
|
-
"""
|
|
139
|
-
This class defines a base constraint over pandas DFs with organized metadata
|
|
137
|
+
"""This class defines a base constraint over pandas DFs with organized metadata.
|
|
140
138
|
|
|
141
|
-
|
|
139
|
+
Args:
|
|
142
140
|
description (str): description of the constraint
|
|
143
141
|
validation_fn (Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:
|
|
144
142
|
the validation function to run over inputted data
|
|
@@ -156,7 +154,6 @@ class ConstraintWithMetadata:
|
|
|
156
154
|
def __init__(
|
|
157
155
|
self, description, validation_fn, resulting_exception, raise_or_typecheck=True, name=None
|
|
158
156
|
):
|
|
159
|
-
experimental_class_warning(self.__class__.__name__)
|
|
160
157
|
if name is None:
|
|
161
158
|
self.name = self.__class__.__name__
|
|
162
159
|
else:
|
|
@@ -191,19 +188,17 @@ class ConstraintWithMetadata:
|
|
|
191
188
|
)
|
|
192
189
|
return DagsterType(
|
|
193
190
|
name=self.name,
|
|
194
|
-
description="A Pandas DataFrame with the following validation: {}"
|
|
195
|
-
|
|
196
|
-
),
|
|
197
|
-
type_check_fn=lambda x: self.validate(x, *args),
|
|
191
|
+
description=f"A Pandas DataFrame with the following validation: {self.description}",
|
|
192
|
+
type_check_fn=lambda x: self.validate(x, *args), # pyright: ignore[reportArgumentType]
|
|
198
193
|
**kwargs,
|
|
199
194
|
)
|
|
200
195
|
|
|
201
196
|
|
|
197
|
+
@beta
|
|
202
198
|
class MultiConstraintWithMetadata(ConstraintWithMetadata):
|
|
203
|
-
"""
|
|
204
|
-
Use this class if you have multiple constraints to check over the entire dataframe
|
|
199
|
+
"""Use this class if you have multiple constraints to check over the entire dataframe.
|
|
205
200
|
|
|
206
|
-
|
|
201
|
+
Args:
|
|
207
202
|
description (str): description of the constraint
|
|
208
203
|
validation_fn_arr(List[Callable[[DataFrame], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):
|
|
209
204
|
a list of the validation functions to run over inputted data
|
|
@@ -227,7 +222,6 @@ class MultiConstraintWithMetadata(ConstraintWithMetadata):
|
|
|
227
222
|
validation_fn_arr = check.list_param(validation_fn_arr, "validation_fn_arr")
|
|
228
223
|
|
|
229
224
|
def validation_fn(data, *args, **kwargs):
|
|
230
|
-
|
|
231
225
|
results = [f(data, *args, **kwargs) for f in validation_fn_arr]
|
|
232
226
|
truthparam = all(item[0] for item in results)
|
|
233
227
|
metadict = defaultdict(dict)
|
|
@@ -237,7 +231,7 @@ class MultiConstraintWithMetadata(ConstraintWithMetadata):
|
|
|
237
231
|
metadict[key][validation_fn_arr[i].__name__] = dicta[key]
|
|
238
232
|
return (truthparam, metadict)
|
|
239
233
|
|
|
240
|
-
super(
|
|
234
|
+
super().__init__(
|
|
241
235
|
description,
|
|
242
236
|
validation_fn,
|
|
243
237
|
resulting_exception,
|
|
@@ -246,6 +240,7 @@ class MultiConstraintWithMetadata(ConstraintWithMetadata):
|
|
|
246
240
|
)
|
|
247
241
|
|
|
248
242
|
|
|
243
|
+
@beta
|
|
249
244
|
class StrictColumnsWithMetadata(ConstraintWithMetadata):
|
|
250
245
|
def __init__(self, column_list, enforce_ordering=False, raise_or_typecheck=True, name=None):
|
|
251
246
|
self.enforce_ordering = check.bool_param(enforce_ordering, "enforce_ordering")
|
|
@@ -270,10 +265,10 @@ class StrictColumnsWithMetadata(ConstraintWithMetadata):
|
|
|
270
265
|
}
|
|
271
266
|
return (False, resdict)
|
|
272
267
|
|
|
273
|
-
basestr = "ensuring that the right columns, {} were present"
|
|
268
|
+
basestr = f"ensuring that the right columns, {self.column_list} were present"
|
|
274
269
|
if enforce_ordering:
|
|
275
270
|
basestr += " in the right order"
|
|
276
|
-
super(
|
|
271
|
+
super().__init__(
|
|
277
272
|
basestr,
|
|
278
273
|
validation_fcn,
|
|
279
274
|
DataFrameWithMetadataException,
|
|
@@ -283,16 +278,15 @@ class StrictColumnsWithMetadata(ConstraintWithMetadata):
|
|
|
283
278
|
|
|
284
279
|
|
|
285
280
|
class DataFrameConstraint(Constraint):
|
|
286
|
-
"""
|
|
287
|
-
Base constraint object that represent Dataframe shape constraints.
|
|
281
|
+
"""Base constraint object that represent Dataframe shape constraints.
|
|
288
282
|
|
|
289
283
|
Args:
|
|
290
284
|
error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.
|
|
291
|
-
markdown_description (Optional[str]): A markdown supported description that is
|
|
285
|
+
markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.
|
|
292
286
|
"""
|
|
293
287
|
|
|
294
288
|
def __init__(self, error_description=None, markdown_description=None):
|
|
295
|
-
super(
|
|
289
|
+
super().__init__(
|
|
296
290
|
error_description=error_description, markdown_description=markdown_description
|
|
297
291
|
)
|
|
298
292
|
|
|
@@ -300,9 +294,9 @@ class DataFrameConstraint(Constraint):
|
|
|
300
294
|
raise NotImplementedError()
|
|
301
295
|
|
|
302
296
|
|
|
297
|
+
@beta
|
|
303
298
|
class StrictColumnsConstraint(DataFrameConstraint):
|
|
304
|
-
"""
|
|
305
|
-
A dataframe constraint that validates column existence and ordering.
|
|
299
|
+
"""A dataframe constraint that validates column existence and ordering.
|
|
306
300
|
|
|
307
301
|
Args:
|
|
308
302
|
strict_column_list (List[str]): The exact list of columns that your dataframe must have.
|
|
@@ -315,12 +309,10 @@ class StrictColumnsConstraint(DataFrameConstraint):
|
|
|
315
309
|
self.strict_column_list = check.list_param(
|
|
316
310
|
strict_column_list, "strict_column_list", of_type=str
|
|
317
311
|
)
|
|
318
|
-
description = "No columns outside of {
|
|
312
|
+
description = f"No columns outside of {self.strict_column_list} allowed. "
|
|
319
313
|
if enforce_ordering:
|
|
320
314
|
description += "Columns must be in that order."
|
|
321
|
-
super(
|
|
322
|
-
error_description=description, markdown_description=description
|
|
323
|
-
)
|
|
315
|
+
super().__init__(error_description=description, markdown_description=description)
|
|
324
316
|
|
|
325
317
|
def validate(self, dataframe):
|
|
326
318
|
check.inst_param(dataframe, "dataframe", DataFrame)
|
|
@@ -329,23 +321,22 @@ class StrictColumnsConstraint(DataFrameConstraint):
|
|
|
329
321
|
if self.strict_column_list != columns_received:
|
|
330
322
|
raise DataFrameConstraintViolationException(
|
|
331
323
|
constraint_name=self.name,
|
|
332
|
-
constraint_description=
|
|
333
|
-
|
|
324
|
+
constraint_description=(
|
|
325
|
+
f"Expected the following ordering of columns {self.strict_column_list}. Received:"
|
|
326
|
+
f" {columns_received}"
|
|
334
327
|
),
|
|
335
328
|
)
|
|
336
329
|
for column in columns_received:
|
|
337
330
|
if column not in self.strict_column_list:
|
|
338
331
|
raise DataFrameConstraintViolationException(
|
|
339
332
|
constraint_name=self.name,
|
|
340
|
-
constraint_description="Expected {}. Recevied {}."
|
|
341
|
-
self.strict_column_list, columns_received
|
|
342
|
-
),
|
|
333
|
+
constraint_description=f"Expected {self.strict_column_list}. Recevied {columns_received}.",
|
|
343
334
|
)
|
|
344
335
|
|
|
345
336
|
|
|
337
|
+
@beta
|
|
346
338
|
class RowCountConstraint(DataFrameConstraint):
|
|
347
|
-
"""
|
|
348
|
-
A dataframe constraint that validates the expected count of rows.
|
|
339
|
+
"""A dataframe constraint that validates the expected count of rows.
|
|
349
340
|
|
|
350
341
|
Args:
|
|
351
342
|
num_allowed_rows (int): The number of allowed rows in your dataframe.
|
|
@@ -357,12 +348,8 @@ class RowCountConstraint(DataFrameConstraint):
|
|
|
357
348
|
self.error_tolerance = abs(check.int_param(error_tolerance, "error_tolerance"))
|
|
358
349
|
if self.error_tolerance > self.num_allowed_rows:
|
|
359
350
|
raise ValueError("Tolerance can't be greater than the number of rows you expect.")
|
|
360
|
-
description = "Dataframe must have {} +- {} rows."
|
|
361
|
-
|
|
362
|
-
)
|
|
363
|
-
super(RowCountConstraint, self).__init__(
|
|
364
|
-
error_description=description, markdown_description=description
|
|
365
|
-
)
|
|
351
|
+
description = f"Dataframe must have {self.num_allowed_rows} +- {self.error_tolerance} rows."
|
|
352
|
+
super().__init__(error_description=description, markdown_description=description)
|
|
366
353
|
|
|
367
354
|
def validate(self, dataframe):
|
|
368
355
|
check.inst_param(dataframe, "dataframe", DataFrame)
|
|
@@ -374,10 +361,8 @@ class RowCountConstraint(DataFrameConstraint):
|
|
|
374
361
|
):
|
|
375
362
|
raise DataFrameConstraintViolationException(
|
|
376
363
|
constraint_name=self.name,
|
|
377
|
-
constraint_description=
|
|
378
|
-
|
|
379
|
-
tolerance=self.error_tolerance,
|
|
380
|
-
received=len(dataframe),
|
|
364
|
+
constraint_description=(
|
|
365
|
+
f"Expected {self.num_allowed_rows} +- {self.error_tolerance} rows. Got {len(dataframe)}"
|
|
381
366
|
),
|
|
382
367
|
)
|
|
383
368
|
|
|
@@ -387,9 +372,9 @@ def apply_ignore_missing_data_to_mask(mask, column):
|
|
|
387
372
|
|
|
388
373
|
|
|
389
374
|
class ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):
|
|
390
|
-
"""
|
|
391
|
-
|
|
392
|
-
|
|
375
|
+
"""Similar to the base class, but now your validation functions should take in columns (pd.Series) not Dataframes.
|
|
376
|
+
|
|
377
|
+
Args:
|
|
393
378
|
description (str): description of the constraint
|
|
394
379
|
validation_fn (Callable[[pd.Series], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:
|
|
395
380
|
the validation function to run over inputted data
|
|
@@ -415,7 +400,7 @@ class ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):
|
|
|
415
400
|
res = self.validation_fn(relevant_data[column])
|
|
416
401
|
if not res[0]:
|
|
417
402
|
offending_columns.add(column)
|
|
418
|
-
if
|
|
403
|
+
if res[1].get("actual") is not None:
|
|
419
404
|
offending_values[column] = [x.item() for x in res[1].get("actual").to_numpy()]
|
|
420
405
|
else:
|
|
421
406
|
offending_values[column] = [x.item() for x in relevant_data[column].to_numpy()]
|
|
@@ -438,12 +423,13 @@ class ColumnAggregateConstraintWithMetadata(ConstraintWithMetadata):
|
|
|
438
423
|
|
|
439
424
|
|
|
440
425
|
class ColumnConstraintWithMetadata(ConstraintWithMetadata):
|
|
441
|
-
"""
|
|
442
|
-
|
|
443
|
-
|
|
426
|
+
"""This class is useful for constructing single constraints that you want to apply to multiple
|
|
427
|
+
columns of your dataframe.
|
|
428
|
+
|
|
444
429
|
The main difference from the base class in terms of construction is that now, your validation_fns should operate on
|
|
445
430
|
individual values.
|
|
446
|
-
|
|
431
|
+
|
|
432
|
+
Args:
|
|
447
433
|
description (str): description of the constraint
|
|
448
434
|
validation_fn (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]:
|
|
449
435
|
the validation function to run over inputted data
|
|
@@ -490,12 +476,12 @@ class ColumnConstraintWithMetadata(ConstraintWithMetadata):
|
|
|
490
476
|
return exc.return_as_typecheck()
|
|
491
477
|
|
|
492
478
|
|
|
479
|
+
@beta
|
|
493
480
|
class MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):
|
|
494
|
-
"""
|
|
495
|
-
This class is useful for constructing more complicated relationships between columns
|
|
481
|
+
"""This class is useful for constructing more complicated relationships between columns
|
|
496
482
|
and expectations -- i.e. you want some validations on column A, others on column B, etc.
|
|
497
|
-
This lets you package up the metadata neatly,
|
|
498
|
-
|
|
483
|
+
This lets you package up the metadata neatly, and also allows for cases like 'fail if any one of
|
|
484
|
+
these constraints fails but still run all of them'.
|
|
499
485
|
|
|
500
486
|
Args:
|
|
501
487
|
description (str): description of the overall set of validations
|
|
@@ -539,10 +525,10 @@ class MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):
|
|
|
539
525
|
result = new_validator.validate(
|
|
540
526
|
DataFrame(data[column]), column, *args, **kwargs
|
|
541
527
|
)
|
|
542
|
-
result_val = result.success
|
|
528
|
+
result_val = result.success # pyright: ignore[reportOptionalMemberAccess]
|
|
543
529
|
if result_val:
|
|
544
530
|
continue
|
|
545
|
-
result_dict = result.
|
|
531
|
+
result_dict = result.metadata[CONSTRAINT_METADATA_KEY].data # pyright: ignore[reportAttributeAccessIssue,reportOptionalMemberAccess]
|
|
546
532
|
truthparam = truthparam and result_val
|
|
547
533
|
for key in result_dict.keys():
|
|
548
534
|
if "constraint" not in key:
|
|
@@ -561,7 +547,7 @@ class MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):
|
|
|
561
547
|
metadict[key][column][fn.__name__] = "a violation"
|
|
562
548
|
return truthparam, metadict
|
|
563
549
|
|
|
564
|
-
super(
|
|
550
|
+
super().__init__(
|
|
565
551
|
description,
|
|
566
552
|
validation_fn,
|
|
567
553
|
resulting_exception,
|
|
@@ -573,9 +559,9 @@ class MultiColumnConstraintWithMetadata(ColumnConstraintWithMetadata):
|
|
|
573
559
|
return ConstraintWithMetadata.validate(self, data, *args, **kwargs)
|
|
574
560
|
|
|
575
561
|
|
|
562
|
+
@beta
|
|
576
563
|
class MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):
|
|
577
|
-
"""
|
|
578
|
-
This class is similar to multicolumn, but takes in functions that operate on the whole column at once
|
|
564
|
+
"""This class is similar to multicolumn, but takes in functions that operate on the whole column at once
|
|
579
565
|
rather than ones that operate on each value --
|
|
580
566
|
consider this similar to the difference between apply-map and apply aggregate.
|
|
581
567
|
|
|
@@ -601,19 +587,20 @@ class MultiAggregateConstraintWithMetadata(MultiColumnConstraintWithMetadata):
|
|
|
601
587
|
raise_or_typecheck=True,
|
|
602
588
|
name=None,
|
|
603
589
|
):
|
|
604
|
-
super(
|
|
590
|
+
super().__init__(
|
|
605
591
|
description,
|
|
606
592
|
fn_and_columns_dict,
|
|
607
593
|
resulting_exception,
|
|
608
594
|
raise_or_typecheck=raise_or_typecheck,
|
|
609
|
-
type_for_internal=ColumnAggregateConstraintWithMetadata,
|
|
595
|
+
type_for_internal=ColumnAggregateConstraintWithMetadata, # pyright: ignore[reportArgumentType]
|
|
610
596
|
name=name,
|
|
611
597
|
)
|
|
612
598
|
|
|
613
599
|
|
|
600
|
+
@beta
|
|
614
601
|
def non_null_validation(x):
|
|
615
|
-
"""
|
|
616
|
-
|
|
602
|
+
"""Validates that a particular value in a column is not null.
|
|
603
|
+
|
|
617
604
|
Usage:
|
|
618
605
|
pass this as a column validator to
|
|
619
606
|
:py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'
|
|
@@ -624,10 +611,11 @@ def non_null_validation(x):
|
|
|
624
611
|
return not pd.isnull(x), {}
|
|
625
612
|
|
|
626
613
|
|
|
614
|
+
@beta
|
|
627
615
|
def all_unique_validator(column, ignore_missing_vals=False):
|
|
628
|
-
"""
|
|
629
|
-
|
|
630
|
-
Returns duplicated values as metadata
|
|
616
|
+
"""Validates that all values in an iterable are unique.
|
|
617
|
+
|
|
618
|
+
Returns duplicated values as metadata.
|
|
631
619
|
|
|
632
620
|
Usage:
|
|
633
621
|
As a validation function for a
|
|
@@ -662,16 +650,17 @@ def all_unique_validator(column, ignore_missing_vals=False):
|
|
|
662
650
|
return not duplicated.any(), {"actual": column[duplicated]}
|
|
663
651
|
|
|
664
652
|
|
|
653
|
+
@beta
|
|
665
654
|
def nonnull(func):
|
|
666
|
-
"""
|
|
667
|
-
|
|
655
|
+
"""Decorator for column validation functions to make them error on nulls.
|
|
656
|
+
|
|
668
657
|
Usage:
|
|
669
658
|
pass decorated functions as column validators to
|
|
670
659
|
:py:class:'~dagster_pandas.constraints.ColumnConstraintWithMetadata'
|
|
671
660
|
or :py:class:'~dagster_pandas.constraints.MultiColumnConstraintWithMetadata'
|
|
672
661
|
Args:
|
|
673
662
|
func (Callable[[Any], Tuple[bool, dict[str, Union[dict,list, str, set]]]]]):
|
|
674
|
-
the column validator you want to error on nulls
|
|
663
|
+
the column validator you want to error on nulls.
|
|
675
664
|
"""
|
|
676
665
|
|
|
677
666
|
@wraps(func)
|
|
@@ -680,18 +669,19 @@ def nonnull(func):
|
|
|
680
669
|
nval = non_null_validation(val)
|
|
681
670
|
return origval[0] and nval[0], {}
|
|
682
671
|
|
|
683
|
-
nvalidator.__doc__ += " and ensures no values are null"
|
|
672
|
+
nvalidator.__doc__ += " and ensures no values are null" # pyright: ignore[reportOperatorIssue]
|
|
684
673
|
|
|
685
674
|
return nvalidator
|
|
686
675
|
|
|
687
676
|
|
|
677
|
+
@beta
|
|
688
678
|
def column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=False):
|
|
689
|
-
"""
|
|
690
|
-
|
|
679
|
+
"""Factory for validators testing if column values are within a range.
|
|
680
|
+
|
|
691
681
|
Args:
|
|
692
682
|
minim(Optional[Comparable]): the low end of the range
|
|
693
683
|
maxim(Optional[Comparable]): the high end of the range
|
|
694
|
-
ignore_missing_vals(Optional[bool]): whether to ignore nulls
|
|
684
|
+
ignore_missing_vals(Optional[bool]): whether to ignore nulls.
|
|
695
685
|
|
|
696
686
|
Returns: a validation function for this constraint
|
|
697
687
|
Usage:
|
|
@@ -738,21 +728,20 @@ def column_range_validation_factory(minim=None, maxim=None, ignore_missing_vals=
|
|
|
738
728
|
return True, {}
|
|
739
729
|
return (isinstance(x, (type(minim), type(maxim)))) and (x <= maxim) and (x >= minim), {}
|
|
740
730
|
|
|
741
|
-
in_range_validation_fn.__doc__ = "checks whether values are between {} and {}"
|
|
742
|
-
minim, maxim
|
|
743
|
-
)
|
|
731
|
+
in_range_validation_fn.__doc__ = f"checks whether values are between {minim} and {maxim}"
|
|
744
732
|
if ignore_missing_vals:
|
|
745
733
|
in_range_validation_fn.__doc__ += ", ignoring nulls"
|
|
746
734
|
|
|
747
735
|
return in_range_validation_fn
|
|
748
736
|
|
|
749
737
|
|
|
738
|
+
@beta
|
|
750
739
|
def categorical_column_validator_factory(categories, ignore_missing_vals=False):
|
|
751
|
-
"""
|
|
752
|
-
|
|
740
|
+
"""Factory for validators testing if all values are in some set.
|
|
741
|
+
|
|
753
742
|
Args:
|
|
754
743
|
categories(Union[Sequence, set]): the set of allowed values
|
|
755
|
-
ignore_missing_vals(Optional[bool]): whether to ignore nulls
|
|
744
|
+
ignore_missing_vals(Optional[bool]): whether to ignore nulls.
|
|
756
745
|
|
|
757
746
|
Returns: a validation function for this constraint
|
|
758
747
|
|
|
@@ -785,7 +774,6 @@ def categorical_column_validator_factory(categories, ignore_missing_vals=False):
|
|
|
785
774
|
metadata['actual'] == {'foo': {'categorical_validation_fn': [7]}}
|
|
786
775
|
|
|
787
776
|
"""
|
|
788
|
-
|
|
789
777
|
categories = set(categories)
|
|
790
778
|
|
|
791
779
|
def categorical_validation_fn(x):
|
|
@@ -794,7 +782,7 @@ def categorical_column_validator_factory(categories, ignore_missing_vals=False):
|
|
|
794
782
|
return (x in categories), {}
|
|
795
783
|
|
|
796
784
|
categorical_validation_fn.__doc__ = (
|
|
797
|
-
"checks whether values are within this set of values: {}"
|
|
785
|
+
f"checks whether values are within this set of values: {categories}"
|
|
798
786
|
)
|
|
799
787
|
if ignore_missing_vals:
|
|
800
788
|
categorical_validation_fn.__doc__ += ", ignoring nulls"
|
|
@@ -802,9 +790,10 @@ def categorical_column_validator_factory(categories, ignore_missing_vals=False):
|
|
|
802
790
|
return categorical_validation_fn
|
|
803
791
|
|
|
804
792
|
|
|
793
|
+
@beta
|
|
805
794
|
def dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):
|
|
806
|
-
"""
|
|
807
|
-
|
|
795
|
+
"""Factory for testing if the dtype of a val falls within some allowed set.
|
|
796
|
+
|
|
808
797
|
Args:
|
|
809
798
|
datatypes(Union[set[type], type]): which datatype/datatypes are allowed
|
|
810
799
|
ignore_missing_vals(Optional[bool]): whether to ignore nulls
|
|
@@ -846,9 +835,7 @@ def dtype_in_set_validation_factory(datatypes, ignore_missing_vals=False):
|
|
|
846
835
|
return True, {}
|
|
847
836
|
return isinstance(x, datatypes), {}
|
|
848
837
|
|
|
849
|
-
dtype_in_set_validation_fn.__doc__ = "checks whether values are this type/types: {}"
|
|
850
|
-
datatypes
|
|
851
|
-
)
|
|
838
|
+
dtype_in_set_validation_fn.__doc__ = f"checks whether values are this type/types: {datatypes}"
|
|
852
839
|
if ignore_missing_vals:
|
|
853
840
|
dtype_in_set_validation_fn.__doc__ += ", ignoring nulls"
|
|
854
841
|
|
|
@@ -859,8 +846,8 @@ class ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):
|
|
|
859
846
|
def __init__(self, minim=None, maxim=None, columns=None, raise_or_typecheck=True):
|
|
860
847
|
self.name = self.__class__.__name__
|
|
861
848
|
|
|
862
|
-
description = "Confirms values are between {} and {}"
|
|
863
|
-
super(
|
|
849
|
+
description = f"Confirms values are between {minim} and {maxim}"
|
|
850
|
+
super().__init__(
|
|
864
851
|
description=description,
|
|
865
852
|
validation_fn=column_range_validation_factory(minim=minim, maxim=maxim),
|
|
866
853
|
resulting_exception=ColumnWithMetadataException,
|
|
@@ -872,22 +859,19 @@ class ColumnRangeConstraintWithMetadata(ColumnConstraintWithMetadata):
|
|
|
872
859
|
if self.columns is None:
|
|
873
860
|
self.columns = list(data.columns)
|
|
874
861
|
self.columns.extend(args)
|
|
875
|
-
return super(
|
|
876
|
-
data, *self.columns, **kwargs
|
|
877
|
-
)
|
|
862
|
+
return super().validate(data, *self.columns, **kwargs)
|
|
878
863
|
|
|
879
864
|
|
|
880
865
|
class ColumnConstraint(Constraint):
|
|
881
|
-
"""
|
|
882
|
-
Base constraint object that represent dataframe column shape constraints.
|
|
866
|
+
"""Base constraint object that represent dataframe column shape constraints.
|
|
883
867
|
|
|
884
868
|
Args:
|
|
885
869
|
error_description (Optional[str]): The plain string description that is output in the terminal if the constraint fails.
|
|
886
|
-
markdown_description (Optional[str]): A markdown supported description that is
|
|
870
|
+
markdown_description (Optional[str]): A markdown supported description that is shown in the Dagster UI if the constraint fails.
|
|
887
871
|
"""
|
|
888
872
|
|
|
889
873
|
def __init__(self, error_description=None, markdown_description=None):
|
|
890
|
-
super(
|
|
874
|
+
super().__init__(
|
|
891
875
|
error_description=error_description, markdown_description=markdown_description
|
|
892
876
|
)
|
|
893
877
|
|
|
@@ -900,8 +884,7 @@ class ColumnConstraint(Constraint):
|
|
|
900
884
|
|
|
901
885
|
|
|
902
886
|
class ColumnDTypeFnConstraint(ColumnConstraint):
|
|
903
|
-
"""
|
|
904
|
-
A column constraint that applies a pandas dtype validation function to a columns dtype.
|
|
887
|
+
"""A column constraint that applies a pandas dtype validation function to a columns dtype.
|
|
905
888
|
|
|
906
889
|
Args:
|
|
907
890
|
type_fn (Callable[[Set[str]], bool]): This is a function that takes the pandas columns dtypes and
|
|
@@ -911,9 +894,7 @@ class ColumnDTypeFnConstraint(ColumnConstraint):
|
|
|
911
894
|
def __init__(self, type_fn):
|
|
912
895
|
self.type_fn = check.callable_param(type_fn, "type_fn")
|
|
913
896
|
description = f'Dtype must satisfy "{self.type_fn.__name__}"'
|
|
914
|
-
super(
|
|
915
|
-
error_description=description, markdown_description=description
|
|
916
|
-
)
|
|
897
|
+
super().__init__(error_description=description, markdown_description=description)
|
|
917
898
|
|
|
918
899
|
def validate(self, dataframe, column_name):
|
|
919
900
|
column_dtype = dataframe[column_name].dtype
|
|
@@ -926,8 +907,7 @@ class ColumnDTypeFnConstraint(ColumnConstraint):
|
|
|
926
907
|
|
|
927
908
|
|
|
928
909
|
class ColumnDTypeInSetConstraint(ColumnConstraint):
|
|
929
|
-
"""
|
|
930
|
-
A column constraint that validates the pandas column dtypes based on the expected set of dtypes.
|
|
910
|
+
"""A column constraint that validates the pandas column dtypes based on the expected set of dtypes.
|
|
931
911
|
|
|
932
912
|
Args:
|
|
933
913
|
expected_dtype_set (Set[str]): The set of pandas dtypes that the pandas column dtypes must match.
|
|
@@ -935,35 +915,27 @@ class ColumnDTypeInSetConstraint(ColumnConstraint):
|
|
|
935
915
|
|
|
936
916
|
def __init__(self, expected_dtype_set):
|
|
937
917
|
self.expected_dtype_set = check.set_param(expected_dtype_set, "expected_dtype_set")
|
|
938
|
-
description = "Column dtype must be in the following set {}."
|
|
939
|
-
|
|
940
|
-
)
|
|
941
|
-
super(ColumnDTypeInSetConstraint, self).__init__(
|
|
942
|
-
error_description=description, markdown_description=description
|
|
943
|
-
)
|
|
918
|
+
description = f"Column dtype must be in the following set {self.expected_dtype_set}."
|
|
919
|
+
super().__init__(error_description=description, markdown_description=description)
|
|
944
920
|
|
|
945
921
|
def validate(self, dataframe, column_name):
|
|
946
922
|
received_dtypes = dataframe[column_name].dtype
|
|
947
923
|
if str(received_dtypes) not in self.expected_dtype_set:
|
|
948
924
|
raise ColumnConstraintViolationException(
|
|
949
925
|
constraint_name=self.name,
|
|
950
|
-
constraint_description=
|
|
951
|
-
|
|
926
|
+
constraint_description=(
|
|
927
|
+
f"{self.error_description}. DTypes received: {received_dtypes}"
|
|
952
928
|
),
|
|
953
929
|
column_name=column_name,
|
|
954
930
|
)
|
|
955
931
|
|
|
956
932
|
|
|
957
933
|
class NonNullableColumnConstraint(ColumnConstraint):
|
|
958
|
-
"""
|
|
959
|
-
A column constraint that ensures all values in a pandas column are not null.
|
|
960
|
-
"""
|
|
934
|
+
"""A column constraint that ensures all values in a pandas column are not null."""
|
|
961
935
|
|
|
962
936
|
def __init__(self):
|
|
963
937
|
description = "No Null values allowed."
|
|
964
|
-
super(
|
|
965
|
-
error_description=description, markdown_description=description
|
|
966
|
-
)
|
|
938
|
+
super().__init__(error_description=description, markdown_description=description)
|
|
967
939
|
|
|
968
940
|
def validate(self, dataframe, column_name):
|
|
969
941
|
rows_with_null_columns = dataframe[dataframe[column_name].isna()]
|
|
@@ -977,8 +949,7 @@ class NonNullableColumnConstraint(ColumnConstraint):
|
|
|
977
949
|
|
|
978
950
|
|
|
979
951
|
class UniqueColumnConstraint(ColumnConstraint):
|
|
980
|
-
"""
|
|
981
|
-
A column constraint that ensures all values in a pandas column are unique.
|
|
952
|
+
"""A column constraint that ensures all values in a pandas column are unique.
|
|
982
953
|
|
|
983
954
|
Args:
|
|
984
955
|
ignore_missing_vals (bool): If true, this constraint will enforce the constraint on non missing values.
|
|
@@ -987,9 +958,7 @@ class UniqueColumnConstraint(ColumnConstraint):
|
|
|
987
958
|
def __init__(self, ignore_missing_vals):
|
|
988
959
|
description = "Column must be unique."
|
|
989
960
|
self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
|
|
990
|
-
super(
|
|
991
|
-
error_description=description, markdown_description=description
|
|
992
|
-
)
|
|
961
|
+
super().__init__(error_description=description, markdown_description=description)
|
|
993
962
|
|
|
994
963
|
def validate(self, dataframe, column_name):
|
|
995
964
|
invalid = dataframe[column_name].duplicated()
|
|
@@ -1006,8 +975,7 @@ class UniqueColumnConstraint(ColumnConstraint):
|
|
|
1006
975
|
|
|
1007
976
|
|
|
1008
977
|
class CategoricalColumnConstraint(ColumnConstraint):
|
|
1009
|
-
"""
|
|
1010
|
-
A column constraint that ensures all values in a pandas column are a valid category.
|
|
978
|
+
"""A column constraint that ensures all values in a pandas column are a valid category.
|
|
1011
979
|
|
|
1012
980
|
Args:
|
|
1013
981
|
categories (Set[str]): Set of categories that values in your pandas column must match.
|
|
@@ -1017,9 +985,9 @@ class CategoricalColumnConstraint(ColumnConstraint):
|
|
|
1017
985
|
def __init__(self, categories, ignore_missing_vals):
|
|
1018
986
|
self.categories = list(check.set_param(categories, "categories", of_type=str))
|
|
1019
987
|
self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
|
|
1020
|
-
super(
|
|
1021
|
-
error_description="Expected Categories are {
|
|
1022
|
-
markdown_description="Category examples are {
|
|
988
|
+
super().__init__(
|
|
989
|
+
error_description=f"Expected Categories are {self.categories}",
|
|
990
|
+
markdown_description=f"Category examples are {self.categories[:5]}...",
|
|
1023
991
|
)
|
|
1024
992
|
|
|
1025
993
|
def validate(self, dataframe, column_name):
|
|
@@ -1037,8 +1005,7 @@ class CategoricalColumnConstraint(ColumnConstraint):
|
|
|
1037
1005
|
|
|
1038
1006
|
|
|
1039
1007
|
class MinValueColumnConstraint(ColumnConstraint):
|
|
1040
|
-
"""
|
|
1041
|
-
A column constraint that ensures all values in a pandas column are greater than the provided
|
|
1008
|
+
"""A column constraint that ensures all values in a pandas column are greater than the provided
|
|
1042
1009
|
lower bound [inclusive].
|
|
1043
1010
|
|
|
1044
1011
|
Args:
|
|
@@ -1049,9 +1016,9 @@ class MinValueColumnConstraint(ColumnConstraint):
|
|
|
1049
1016
|
def __init__(self, min_value, ignore_missing_vals):
|
|
1050
1017
|
self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))
|
|
1051
1018
|
self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
|
|
1052
|
-
super(
|
|
1053
|
-
markdown_description="values > {
|
|
1054
|
-
error_description="Column must have values > {
|
|
1019
|
+
super().__init__(
|
|
1020
|
+
markdown_description=f"values > {self.min_value}",
|
|
1021
|
+
error_description=f"Column must have values > {self.min_value}",
|
|
1055
1022
|
)
|
|
1056
1023
|
|
|
1057
1024
|
def validate(self, dataframe, column_name):
|
|
@@ -1069,8 +1036,7 @@ class MinValueColumnConstraint(ColumnConstraint):
|
|
|
1069
1036
|
|
|
1070
1037
|
|
|
1071
1038
|
class MaxValueColumnConstraint(ColumnConstraint):
|
|
1072
|
-
"""
|
|
1073
|
-
A column constraint that ensures all values in a pandas column are less than the provided
|
|
1039
|
+
"""A column constraint that ensures all values in a pandas column are less than the provided
|
|
1074
1040
|
upper bound [inclusive].
|
|
1075
1041
|
|
|
1076
1042
|
Args:
|
|
@@ -1081,9 +1047,9 @@ class MaxValueColumnConstraint(ColumnConstraint):
|
|
|
1081
1047
|
def __init__(self, max_value, ignore_missing_vals):
|
|
1082
1048
|
self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))
|
|
1083
1049
|
self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
|
|
1084
|
-
super(
|
|
1085
|
-
markdown_description="values < {
|
|
1086
|
-
error_description="Column must have values < {
|
|
1050
|
+
super().__init__(
|
|
1051
|
+
markdown_description=f"values < {self.max_value}",
|
|
1052
|
+
error_description=f"Column must have values < {self.max_value}",
|
|
1087
1053
|
)
|
|
1088
1054
|
|
|
1089
1055
|
def validate(self, dataframe, column_name):
|
|
@@ -1101,8 +1067,7 @@ class MaxValueColumnConstraint(ColumnConstraint):
|
|
|
1101
1067
|
|
|
1102
1068
|
|
|
1103
1069
|
class InRangeColumnConstraint(ColumnConstraint):
|
|
1104
|
-
"""
|
|
1105
|
-
A column constraint that ensures all values in a pandas column are between the lower and upper
|
|
1070
|
+
"""A column constraint that ensures all values in a pandas column are between the lower and upper
|
|
1106
1071
|
bound [inclusive].
|
|
1107
1072
|
|
|
1108
1073
|
Args:
|
|
@@ -1116,11 +1081,9 @@ class InRangeColumnConstraint(ColumnConstraint):
|
|
|
1116
1081
|
self.min_value = check.inst_param(min_value, "min_value", (int, float, datetime))
|
|
1117
1082
|
self.max_value = check.inst_param(max_value, "max_value", (int, float, datetime))
|
|
1118
1083
|
self.ignore_missing_vals = check.bool_param(ignore_missing_vals, "ignore_missing_vals")
|
|
1119
|
-
super(
|
|
1120
|
-
markdown_description="{} < values < {
|
|
1121
|
-
error_description="Column must have values between {} and {} inclusive."
|
|
1122
|
-
self.min_value, self.max_value
|
|
1123
|
-
),
|
|
1084
|
+
super().__init__(
|
|
1085
|
+
markdown_description=f"{self.min_value} < values < {self.max_value}",
|
|
1086
|
+
error_description=f"Column must have values between {self.min_value} and {self.max_value} inclusive.",
|
|
1124
1087
|
)
|
|
1125
1088
|
|
|
1126
1089
|
def validate(self, dataframe, column_name):
|