pointblank 0.11.3__py3-none-any.whl → 0.11.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +9 -0
- pointblank/cli.py +226 -77
- pointblank/yaml.py +1386 -0
- {pointblank-0.11.3.dist-info → pointblank-0.11.5.dist-info}/METADATA +2 -1
- {pointblank-0.11.3.dist-info → pointblank-0.11.5.dist-info}/RECORD +9 -8
- {pointblank-0.11.3.dist-info → pointblank-0.11.5.dist-info}/WHEEL +0 -0
- {pointblank-0.11.3.dist-info → pointblank-0.11.5.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.3.dist-info → pointblank-0.11.5.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.3.dist-info → pointblank-0.11.5.dist-info}/top_level.txt +0 -0
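The headline addition in 0.11.5 is the new `pointblank/yaml.py` module (shown in full below), which introduces YAML-driven validation workflows via `yaml_interrogate()`, `validate_yaml()`, `load_yaml_config()`, and `yaml_to_python()`; the `__init__.py` changes above appear to re-export them, since the docstrings below call them as `pb.yaml_interrogate()` and so on. As a quick orientation before the full diff, here is a minimal usage sketch assembled from the module's own docstring examples (it assumes pointblank 0.11.5 with PyYAML installed, and is not itself part of the diff):

```python
import pointblank as pb

# A YAML workflow: a data source plus a list of validation steps
yaml_config = '''
tbl: small_table
steps:
- rows_distinct
- col_vals_gt:
    columns: [d]
    value: 100
'''

# Check the configuration's structure first (raises YAMLValidationError
# on problems), then build the plan, interrogate, and inspect the results
pb.validate_yaml(yaml_config)
result = pb.yaml_interrogate(yaml_config)
```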
pointblank/yaml.py
ADDED
@@ -0,0 +1,1386 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Union
+
+import yaml
+
+from pointblank.thresholds import Actions
+from pointblank.validate import Validate, load_dataset
+
+
+class YAMLValidationError(Exception):
+    """Exception raised for YAML validation errors."""
+
+    pass
+
+
+def _safe_eval_python_code(code: str) -> Any:
+    """Safely evaluate Python code with restricted namespace.
+
+    This function provides a controlled environment for executing Python code embedded in YAML
+    configurations. It includes common libraries and functions while restricting access to
+    dangerous operations.
+
+    Parameters
+    ----------
+    code
+        The Python code to evaluate.
+
+    Returns
+    -------
+    Any
+        The result of evaluating the Python code.
+
+    Raises
+    ------
+    YAMLValidationError
+        If the code execution fails or contains unsafe operations.
+    """
+    import ast
+    import re
+    from pathlib import Path
+
+    from pointblank._utils import _is_lib_present
+
+    # Create a safe namespace with commonly needed imports
+    safe_namespace = {
+        "Path": Path,  # pathlib.Path
+        "__builtins__": {
+            # Allow basic built-in functions
+            "len": len,
+            "str": str,
+            "int": int,
+            "float": float,
+            "bool": bool,
+            "list": list,
+            "dict": dict,
+            "tuple": tuple,
+            "set": set,
+            "range": range,
+            "enumerate": enumerate,
+            "zip": zip,
+            "sum": sum,
+            "min": min,
+            "max": max,
+            "abs": abs,
+            "round": round,
+            "print": print,
+        },
+    }
+
+    # Add pointblank itself to the namespace
+    import pointblank as pb
+
+    safe_namespace["pb"] = pb
+
+    # Add polars if available
+    if _is_lib_present("polars"):
+        import polars as pl
+
+        safe_namespace["pl"] = pl
+
+    # Add pandas if available
+    if _is_lib_present("pandas"):
+        import pandas as pd
+
+        safe_namespace["pd"] = pd
+
+    # Check for dangerous patterns
+    dangerous_patterns = [
+        r"import\s+os",
+        r"import\s+sys",
+        r"import\s+subprocess",
+        r"__import__",
+        r"exec\s*\(",
+        r"eval\s*\(",
+        r"open\s*\(",
+        r"file\s*\(",
+        r"input\s*\(",
+        r"raw_input\s*\(",
+    ]
+
+    for pattern in dangerous_patterns:
+        if re.search(pattern, code, re.IGNORECASE):
+            raise YAMLValidationError(
+                f"Potentially unsafe Python code detected: '{code}'. "
+                f"Pattern '{pattern}' is not allowed."
+            )
+
+    try:
+        # First try to parse as expression for simple cases
+        try:
+            parsed = ast.parse(code, mode="eval")
+            return eval(compile(parsed, "<string>", "eval"), safe_namespace)
+        except SyntaxError:
+            # If that fails, try as a statement (for more complex code)
+            # For multi-statement code, we need to capture the result of the last expression
+            parsed = ast.parse(code, mode="exec")
+
+            # Check if the last node is an expression
+            if parsed.body and isinstance(parsed.body[-1], ast.Expr):
+                # Split the last expression from the statements
+                statements = parsed.body[:-1]
+                last_expr = parsed.body[-1].value
+
+                # Execute the statements first
+                if statements:
+                    statements_module = ast.Module(body=statements, type_ignores=[])
+                    exec(compile(statements_module, "<string>", "exec"), safe_namespace)
+
+                # Then evaluate the last expression and return its value
+                expr_module = ast.Expression(body=last_expr)
+                return eval(compile(expr_module, "<string>", "eval"), safe_namespace)
+            else:
+                # No expression at the end, just execute statements
+                exec(compile(parsed, "<string>", "exec"), safe_namespace)
+                return None
+
+    except Exception as e:
+        raise YAMLValidationError(f"Error executing Python code '{code}': {e}")
+
+
+def _process_python_expressions(value: Any) -> Any:
+    """Process Python code snippets embedded in YAML values.
+
+    This function supports the python: block syntax for embedding Python code:
+
+    python: |
+      import polars as pl
+      pl.scan_csv("data.csv").head(10)
+
+    Note: col_vals_expr() also supports a shortcut syntax where the expr parameter
+    can be written directly without the python: wrapper:
+
+    col_vals_expr:
+      expr: |
+        pl.col("column") > 0
+
+    Parameters
+    ----------
+    value
+        The value to process, can be any YAML type.
+
+    Returns
+    -------
+    Any
+        The processed value with Python expressions evaluated.
+
+    Examples
+    --------
+    >>> _process_python_expressions({"python": "pl.scan_csv('data.csv').head(10)"})
+    # Returns the result of the Python expression
+
+    >>> _process_python_expressions({"python": "import polars as pl\\npl.scan_csv('data.csv')"})
+    # Returns the result of multiline Python code
+    """
+    if isinstance(value, dict):
+        # Handle python: block syntax
+        if "python" in value and len(value) == 1:
+            code = value["python"]
+            return _safe_eval_python_code(code)
+
+        # Recursively process dictionary values
+        return {k: _process_python_expressions(v) for k, v in value.items()}
+
+    elif isinstance(value, list):
+        # Recursively process list items
+        return [_process_python_expressions(item) for item in value]
+
+    else:
+        # Return primitive types unchanged
+        return value
+
+
+class YAMLValidator:
+    """Validates YAML configuration and converts to Validate objects."""
+
+    # Map YAML method names to Python method names
+    validation_method_map = {
+        "col_exists": "col_exists",
+        "col_vals_gt": "col_vals_gt",
+        "col_vals_ge": "col_vals_ge",
+        "col_vals_lt": "col_vals_lt",
+        "col_vals_le": "col_vals_le",
+        "col_vals_eq": "col_vals_eq",
+        "col_vals_ne": "col_vals_ne",
+        "col_vals_between": "col_vals_between",
+        "col_vals_outside": "col_vals_outside",
+        "col_vals_regex": "col_vals_regex",
+        "col_vals_in_set": "col_vals_in_set",
+        "col_vals_not_in_set": "col_vals_not_in_set",
+        "col_vals_not_null": "col_vals_not_null",
+        "col_vals_null": "col_vals_null",
+        "col_vals_expr": "col_vals_expr",
+        "rows_distinct": "rows_distinct",
+        "rows_complete": "rows_complete",
+        "col_count_match": "col_count_match",
+        "row_count_match": "row_count_match",
+        "col_schema_match": "col_schema_match",
+    }
+
+    def __init__(self):
+        """Initialize the YAML validator."""
+        pass
+
+    def load_config(self, source: Union[str, Path]) -> dict:
+        """Load and validate YAML configuration.
+
+        Parameters
+        ----------
+        source
+            YAML string or Path to YAML file.
+
+        Returns
+        -------
+        dict
+            Parsed and validated configuration dictionary.
+
+        Raises
+        ------
+        YAMLValidationError
+            If the YAML is invalid or malformed.
+        """
+        try:
+            if isinstance(source, (str, Path)):
+                if isinstance(source, Path):
+                    # It's definitely a file path
+                    with open(source, "r", encoding="utf-8") as f:
+                        config = yaml.safe_load(f)
+                elif isinstance(source, str):
+                    # Check if it looks like YAML content
+                    stripped = source.strip()
+                    if (
+                        stripped.startswith(("tbl:", "steps:"))
+                        or "\n" in stripped
+                        or ":" in stripped
+                    ):
+                        # Looks like YAML content
+                        config = yaml.safe_load(source)
+                    else:
+                        # Assume it's a file path
+                        with open(source, "r", encoding="utf-8") as f:
+                            config = yaml.safe_load(f)
+            else:
+                raise YAMLValidationError(
+                    f"Invalid source type: {type(source)}. Only YAML strings and file paths supported."
+                )
+
+            if not isinstance(config, dict):
+                raise YAMLValidationError("YAML must contain a dictionary at the root level")
+
+            self._validate_schema(config)
+            return config
+
+        except yaml.YAMLError as e:
+            raise YAMLValidationError(f"Invalid YAML syntax: {e}")
+        except Exception as e:
+            raise YAMLValidationError(f"Error loading YAML configuration: {e}")
+
+    def _validate_schema(self, config: dict) -> None:
+        """Validate the YAML configuration schema.
+
+        Parameters
+        ----------
+        config
+            Configuration dictionary to validate.
+
+        Raises
+        ------
+        YAMLValidationError
+            If the schema is invalid.
+        """
+        # Check required fields
+        if "tbl" not in config:
+            raise YAMLValidationError("YAML must contain 'tbl' field")
+
+        if "steps" not in config:
+            raise YAMLValidationError("YAML must contain 'steps' field")
+
+        if not isinstance(config["steps"], list):
+            raise YAMLValidationError("'steps' must be a list")
+
+        if len(config["steps"]) == 0:
+            raise YAMLValidationError("'steps' cannot be empty")
+
+        # Validate thresholds if present
+        if "thresholds" in config:
+            thresholds = config["thresholds"]
+            if not isinstance(thresholds, dict):
+                raise YAMLValidationError("'thresholds' must be a dictionary")
+
+            for key, value in thresholds.items():
+                if key not in ["warning", "error", "critical"]:
+                    raise YAMLValidationError(
+                        f"Invalid threshold key: {key}. Must be 'warning', 'error', or 'critical'"
+                    )
+
+                if not isinstance(value, (int, float)):
+                    raise YAMLValidationError(f"Threshold '{key}' must be a number")
+
+                if value < 0:
+                    raise YAMLValidationError(f"Threshold '{key}' must be non-negative")
+
+        # Validate actions if present
+        if "actions" in config:
+            actions = config["actions"]
+            if not isinstance(actions, dict):
+                raise YAMLValidationError("'actions' must be a dictionary")
+
+            for key, value in actions.items():
+                if key not in ["warning", "error", "critical", "default", "highest_only"]:
+                    raise YAMLValidationError(
+                        f"Invalid action key: {key}. Must be 'warning', 'error', 'critical', "
+                        f"'default', or 'highest_only'"
+                    )
+
+                if key == "highest_only":
+                    if not isinstance(value, bool):
+                        raise YAMLValidationError(f"Action '{key}' must be a boolean")
+                else:
+                    # Action values can be strings or have python: block syntax for callables
+                    if not isinstance(value, (str, dict, list)):
+                        raise YAMLValidationError(
+                            f"Action '{key}' must be a string, dictionary (for python: block), "
+                            f"or list of strings/dictionaries"
+                        )
+
+    def _load_data_source(self, tbl_spec: str) -> Any:
+        """Load data source based on table specification.
+
+        Parameters
+        ----------
+        tbl_spec
+            Data source specification. Can be (1) a dataset name for `load_dataset()`, (2) a CSV file
+            path (relative or absolute), (3) a Parquet file path (relative or absolute), or (4) a
+            Python code snippet to be executed for dynamic data loading.
+
+        Returns
+        -------
+        Loaded data object.
+
+        Raises
+        ------
+        YAMLValidationError
+            If data source cannot be loaded.
+        """
+        from pointblank.validate import _process_data
+
+        try:
+            # First, try to process as Python expression
+            processed_tbl_spec = _process_python_expressions(tbl_spec)
+
+            # If processing returned a different object (not a string), use it directly
+            if processed_tbl_spec is not tbl_spec or not isinstance(processed_tbl_spec, str):
+                return processed_tbl_spec
+
+            # Use the centralized data processing pipeline from validate.py
+            # This handles CSV files, Parquet files, and other data sources
+            processed_data = _process_data(processed_tbl_spec)
+
+            # If _process_data returns the original string unchanged,
+            # then it's not a file path, so try load_dataset
+            if processed_data is processed_tbl_spec and isinstance(processed_tbl_spec, str):
+                return load_dataset(processed_tbl_spec)
+            else:
+                return processed_data
+
+        except Exception as e:
+            raise YAMLValidationError(f"Failed to load data source '{tbl_spec}': {e}")
+
+    def _parse_column_spec(self, columns_expr: Any) -> list[str]:
+        """Parse column specification from YAML.
+
+        Handles standard YAML syntax for columns.
+
+        Parameters
+        ----------
+        columns_expr
+            Column specification (list or string).
+
+        Returns
+        -------
+        list[str]
+            List of column names.
+        """
+        if isinstance(columns_expr, list):
+            return [str(col) for col in columns_expr]
+
+        if isinstance(columns_expr, str):
+            # Single column name
+            return [columns_expr]
+
+        # Fallback: convert to string
+        return [str(columns_expr)]
+
+    def _parse_schema_spec(self, schema_spec: Any) -> Any:
+        """Parse schema specification from YAML.
+
+        Converts dictionary-based schema definitions into Schema objects.
+
+        Column specifications support multiple formats:
+        - Scalar strings: "column_name" (name only, no type checking)
+        - Lists with name and type: ["column_name", "data_type"]
+        - Lists with name only: ["column_name"] (equivalent to scalar)
+
+        Parameters
+        ----------
+        schema_spec
+            Schema specification as a dictionary with 'columns' field.
+
+        Returns
+        -------
+        Schema
+            A Schema object created from the specification.
+
+        Raises
+        ------
+        YAMLValidationError
+            If schema specification is invalid.
+        """
+        from pointblank.schema import Schema
+
+        # Handle dictionary specification only
+        if isinstance(schema_spec, dict):
+            if "columns" in schema_spec:
+                # Convert columns list to a `Schema` object
+                columns_spec = schema_spec["columns"]
+
+                if not isinstance(columns_spec, list):
+                    raise YAMLValidationError(
+                        "Schema 'columns' must be a list of column specifications"
+                    )
+
+                # Convert YAML column specs to `Schema` format
+                schema_columns = []
+                for col_spec in columns_spec:
+                    if isinstance(col_spec, list):
+                        if len(col_spec) == 1:
+                            # Column name only: ["column_name"]
+                            schema_columns.append((col_spec[0],))
+                        elif len(col_spec) == 2:
+                            # Column name and type: ["column_name", "type"]
+                            schema_columns.append((col_spec[0], col_spec[1]))
+                        else:
+                            raise YAMLValidationError(
+                                f"Column specification must have 1-2 elements, got: {col_spec}"
+                            )
+                    elif isinstance(col_spec, str):
+                        # Just column name as string
+                        schema_columns.append((col_spec,))
+                    else:
+                        raise YAMLValidationError(
+                            f"Invalid column specification type: {type(col_spec)}"
+                        )
+
+                # Create Schema object
+                return Schema(columns=schema_columns)
+            else:
+                raise YAMLValidationError("Schema specification must contain 'columns' field")
+        else:
+            raise YAMLValidationError(
+                f"Schema specification must be a dictionary, got: {type(schema_spec)}"
+            )
+
+    def _parse_validation_step(self, step_config: Union[str, dict]) -> tuple[str, dict]:
+        """Parse a single validation step from YAML configuration.
+
+        Parameters
+        ----------
+        step_config
+            Step configuration (string for parameterless steps, dict for others).
+
+        Returns
+        -------
+        tuple[str, dict]
+            Tuple of (method_name, parameters).
+
+        Raises
+        ------
+        YAMLValidationError
+            If step configuration is invalid.
+        """
+        if isinstance(step_config, str):
+            # Simple step with no parameters (e.g., "rows_distinct")
+            method_name = step_config
+            parameters = {}
+        elif isinstance(step_config, dict):
+            # Step with parameters
+            if len(step_config) != 1:
+                raise YAMLValidationError(
+                    "Step configuration must contain exactly one validation method, "
+                    f"got: {list(step_config.keys())}"
+                )
+
+            method_name = list(step_config.keys())[0]
+            parameters = step_config[method_name] or {}
+
+            if not isinstance(parameters, dict):
+                raise YAMLValidationError(f"Parameters for '{method_name}' must be a dictionary")
+        else:
+            raise YAMLValidationError(f"Invalid step configuration type: {type(step_config)}")
+
+        # Validate that we know this method
+        if method_name not in self.validation_method_map:
+            available_methods = list(self.validation_method_map.keys())
+            raise YAMLValidationError(
+                f"Unknown validation method '{method_name}'. Available methods: {available_methods}"
+            )
+
+        # Process Python expressions in all parameters
+        processed_parameters = {}
+        for key, value in parameters.items():
+            # Special case: `col_vals_expr()`'s `expr=` parameter can use shortcut syntax
+            if method_name == "col_vals_expr" and key == "expr" and isinstance(value, str):
+                # Treat string directly as Python code (shortcut syntax)
+                processed_parameters[key] = _safe_eval_python_code(value)
+            # Special case: `pre=` parameter can use shortcut syntax (like `expr=`)
+            elif key == "pre" and isinstance(value, str):
+                # Treat string directly as Python code (shortcut syntax)
+                processed_parameters[key] = _safe_eval_python_code(value)
+            else:
+                # Normal processing (requires python: block syntax)
+                processed_parameters[key] = _process_python_expressions(value)
+        parameters = processed_parameters
+
+        # Convert `columns=` specification
+        if "columns" in parameters:
+            parameters["columns"] = self._parse_column_spec(parameters["columns"])
+
+        #
+        # Convert special parameter formats
+        #
+
+        # Convert `columns_subset=` if present (from `rows_[distinct|complete]()`)
+        if "columns_subset" in parameters:
+            parameters["columns_subset"] = self._parse_column_spec(parameters["columns_subset"])
+
+        # Convert `schema=` if present (for `col_schema_match()`)
+        if "schema" in parameters and method_name == "col_schema_match":
+            parameters["schema"] = self._parse_schema_spec(parameters["schema"])
+
+        # Convert `actions=` if present (ensure it's an Actions object)
+        if "actions" in parameters:
+            if isinstance(parameters["actions"], dict):
+                parameters["actions"] = Actions(**parameters["actions"])
+
+        # Handle `inclusive=` parameter for `col_vals_[between|outside]()` (convert list to tuple)
+        if "inclusive" in parameters and isinstance(parameters["inclusive"], list):
+            parameters["inclusive"] = tuple(parameters["inclusive"])
+
+        return self.validation_method_map[method_name], parameters
+
+    def build_validation(self, config: dict) -> Validate:
+        """Convert YAML config to Validate object.
+
+        Parameters
+        ----------
+        config
+            Validated configuration dictionary.
+
+        Returns
+        -------
+        Validate
+            Validate object with configured validation steps.
+        """
+        # Load data source
+        data = self._load_data_source(config["tbl"])
+
+        # Create Validate object
+        validate_kwargs = {}
+
+        # Set table name if provided
+        if "tbl_name" in config:
+            validate_kwargs["tbl_name"] = config["tbl_name"]
+
+        # Set label if provided
+        if "label" in config:
+            validate_kwargs["label"] = config["label"]
+
+        # Set thresholds if provided
+        if "thresholds" in config:
+            validate_kwargs["thresholds"] = config["thresholds"]
+
+        # Set actions if provided
+        if "actions" in config:
+            # Process actions - handle python: block syntax for callables
+            processed_actions = _process_python_expressions(config["actions"])
+            # Convert to Actions object
+            validate_kwargs["actions"] = Actions(**processed_actions)
+
+        # Set language if provided
+        if "lang" in config:
+            validate_kwargs["lang"] = config["lang"]
+
+        # Set locale if provided
+        if "locale" in config:
+            validate_kwargs["locale"] = config["locale"]
+
+        # Set global brief if provided
+        if "brief" in config:
+            validate_kwargs["brief"] = config["brief"]
+
+        validation = Validate(data, **validate_kwargs)
+
+        # Add validation steps
+        for step_config in config["steps"]:
+            method_name, parameters = self._parse_validation_step(step_config)
+
+            # Get the method from the validation object
+            method = getattr(validation, method_name)
+
+            # Call the method with parameters
+            validation = method(**parameters)
+
+        return validation
+
+    def execute_workflow(self, config: dict) -> Validate:
+        """Execute a complete YAML validation workflow.
+
+        Parameters
+        ----------
+        config
+            Validated configuration dictionary.
+
+        Returns
+        -------
+        Validate
+            Interrogated Validate object with results.
+        """
+        # Build the validation plan
+        validation = self.build_validation(config)
+
+        # Execute interrogation to get results
+        validation = validation.interrogate()
+
+        return validation
+
+
+def yaml_interrogate(yaml: Union[str, Path]) -> Validate:
+    """Execute a YAML-based validation workflow.
+
+    This is the main entry point for YAML-based validation workflows. It takes YAML configuration
+    (as a string or file path) and returns a validated `Validate` object with interrogation results.
+
+    The YAML configuration defines the data source, validation steps, and optional settings like
+    thresholds and labels. This function automatically loads the data, builds the validation plan,
+    executes all validation steps, and returns the interrogated results.
+
+    Parameters
+    ----------
+    yaml
+        YAML configuration as string or file path. Can be: (1) a YAML string containing the
+        validation configuration, or (2) a Path object or string path to a YAML file.
+
+    Returns
+    -------
+    Validate
+        An instance of the `Validate` class that has been configured based on the YAML input.
+        This object contains the results of the validation steps defined in the YAML configuration.
+        It includes metadata like table name, label, language, and thresholds if specified.
+
+    Raises
+    ------
+    YAMLValidationError
+        If the YAML is invalid, malformed, or execution fails. This includes syntax errors, missing
+        required fields, unknown validation methods, or data loading failures.
+
+    Examples
+    --------
+    ```{python}
+    #| echo: false
+    #| output: false
+    import pointblank as pb
+    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+    ```
+    For the examples here, we'll use YAML configurations to define validation workflows. Let's start
+    with a basic YAML workflow that validates the built-in `small_table` dataset.
+
+    ```{python}
+    import pointblank as pb
+
+    # Define a basic YAML validation workflow
+    yaml_config = '''
+    tbl: small_table
+    steps:
+    - rows_distinct
+    - col_exists:
+        columns: [date, a, b]
+    '''
+
+    # Execute the validation workflow
+    result = pb.yaml_interrogate(yaml_config)
+    result
+    ```
+
+    The validation table shows the results of our YAML-defined workflow. We can see that the
+    `rows_distinct()` validation failed (because there are duplicate rows in the table), while the
+    column existence checks passed.
+
+    Now let's create a more comprehensive validation workflow with thresholds and metadata:
+
+    ```{python}
+    # Advanced YAML configuration with thresholds and metadata
+    yaml_config = '''
+    tbl: small_table
+    tbl_name: small_table_demo
+    label: Comprehensive data validation
+    thresholds:
+      warning: 0.1
+      error: 0.25
+      critical: 0.35
+    steps:
+    - col_vals_gt:
+        columns: [d]
+        value: 100
+    - col_vals_regex:
+        columns: [b]
+        pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
+    - col_vals_not_null:
+        columns: [date, a]
+    '''
+
+    # Execute the validation workflow
+    result = pb.yaml_interrogate(yaml_config)
+    print(f"Table name: {result.tbl_name}")
+    print(f"Label: {result.label}")
+    print(f"Total validation steps: {len(result.validation_info)}")
+    ```
+
+    The validation results now include our custom table name and label. The thresholds we defined
+    will determine when validation steps are marked as warnings, errors, or critical failures.
+
+    You can also load YAML configurations from files. Here's how you would work with a YAML file:
+
+    ```{python}
+    from pathlib import Path
+    import tempfile
+
+    # Create a temporary YAML file for demonstration
+    yaml_content = '''
+    tbl: small_table
+    tbl_name: File-based Validation
+    steps:
+    - col_vals_between:
+        columns: [c]
+        left: 1
+        right: 10
+    - col_vals_in_set:
+        columns: [f]
+        set: [low, mid, high]
+    '''
+
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f:
+        f.write(yaml_content)
+        yaml_file_path = Path(f.name)
+
+    # Load and execute validation from file
+    result = pb.yaml_interrogate(yaml_file_path)
+    result
+    ```
+
+    This approach is particularly useful for storing validation configurations as part of your data
+    pipeline or version control system, allowing you to maintain validation rules alongside your
+    code.
+    """
+    validator = YAMLValidator()
+    config = validator.load_config(yaml)
+    return validator.execute_workflow(config)
+
+
+def load_yaml_config(file_path: Union[str, Path]) -> dict:
+    """Load YAML configuration from file or string.
+
+    Parameters
+    ----------
+    file_path
+        Path to YAML file or YAML content string
+
+    Returns
+    -------
+    dict
+        Parsed configuration dictionary
+
+    Raises
+    ------
+    YAMLValidationError
+        If the file cannot be loaded or is invalid
+    """
+    validator = YAMLValidator()
+    return validator.load_config(file_path)
+
+
+def validate_yaml(yaml: Union[str, Path]) -> None:
+    """Validate YAML configuration against the expected structure.
+
+    This function validates that a YAML configuration conforms to the expected structure for
+    validation workflows. It checks for required fields, proper data types, and valid
+    validation method names. This is useful for validating configurations before execution or
+    for building configuration editors and validators.
+
+    The function performs comprehensive validation including:
+
+    - required fields ('tbl' and 'steps')
+    - proper data types for all fields
+    - valid threshold configurations
+    - known validation method names
+    - proper step configuration structure
+
+    Parameters
+    ----------
+    yaml
+        YAML configuration as string or file path. Can be: (1) a YAML string containing the
+        validation configuration, or (2) a Path object or string path to a YAML file.
+
+    Raises
+    ------
+    YAMLValidationError
+        If the YAML is invalid, malformed, or execution fails. This includes syntax errors,
+        missing required fields, unknown validation methods, or data loading failures.
+
+    Examples
+    --------
+    ```{python}
+    #| echo: false
+    #| output: false
+    import pointblank as pb
+    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+    ```
+    For the examples here, we'll demonstrate how to validate YAML configurations before using them
+    with validation workflows. This is particularly useful for building robust data validation
+    systems where you want to catch configuration errors early.
+
+    Let's start with validating a basic configuration:
+
+    ```{python}
+    import pointblank as pb
+
+    # Define a basic YAML validation configuration
+    yaml_config = '''
+    tbl: small_table
+    steps:
+    - rows_distinct
+    - col_exists:
+        columns: [a, b]
+    '''
+
+    # Validate the configuration: no exception means it's valid
+    pb.validate_yaml(yaml_config)
+    print("Basic YAML configuration is valid")
+    ```
+
+    The function completed without raising an exception, which means our configuration is valid and
+    follows the expected structure.
+
+    Now let's validate a more complex configuration with thresholds and metadata:
+
+    ```{python}
+    # Complex YAML configuration with all optional fields
+    yaml_config = '''
+    tbl: small_table
+    tbl_name: My Dataset
+    label: Quality check
+    lang: en
+    locale: en
+    thresholds:
+      warning: 0.1
+      error: 0.25
+      critical: 0.35
+    steps:
+    - rows_distinct
+    - col_vals_gt:
+        columns: [d]
+        value: 100
+    - col_vals_regex:
+        columns: [b]
+        pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
+    '''
+
+    # Validate the configuration
+    pb.validate_yaml(yaml_config)
+    print("Complex YAML configuration is valid")
+
+    # Count the validation steps
+    import pointblank.yaml as pby
+    config = pby.load_yaml_config(yaml_config)
+    print(f"Configuration has {len(config['steps'])} validation steps")
+    ```
+
+    This configuration includes all the optional metadata fields and complex validation steps,
+    demonstrating that the validation handles the full range of supported options.
+
+    Let's see what happens when we try to validate an invalid configuration:
+
+    ```{python}
+    # Invalid YAML configuration: missing required 'tbl' field
+    invalid_yaml = '''
+    steps:
+    - rows_distinct
+    '''
+
+    try:
+        pb.validate_yaml(invalid_yaml)
+    except pb.yaml.YAMLValidationError as e:
+        print(f"Validation failed: {e}")
+    ```
+
+    The validation correctly identifies that our configuration is missing the required `'tbl'`
+    field.
+
+    Here's a practical example of using validation in a workflow builder:
+
+    ```{python}
+    def safe_yaml_interrogate(yaml_config):
+        \"\"\"Safely execute a YAML configuration after validation.\"\"\"
+        try:
+            # Validate the YAML configuration first
+            pb.validate_yaml(yaml_config)
+            print("✓ YAML configuration is valid")
+
+            # Then execute the workflow
+            result = pb.yaml_interrogate(yaml_config)
+            print(f"Validation completed with {len(result.validation_info)} steps")
+            return result
+
+        except pb.yaml.YAMLValidationError as e:
+            print(f"Configuration error: {e}")
+            return None
+
+    # Test with a valid YAML configuration
+    test_yaml = '''
+    tbl: small_table
+    steps:
+    - col_vals_between:
+        columns: [c]
+        left: 1
+        right: 10
+    '''
+
+    result = safe_yaml_interrogate(test_yaml)
+    ```
+
+    This pattern of validating before executing helps build more reliable data validation pipelines
+    by catching configuration errors early in the process.
+
+    Note that this function only validates the structure and does not check if the specified data
+    source ('tbl') exists or is accessible. Data source validation occurs during execution with
+    `yaml_interrogate()`.
+
+    See Also
+    --------
+    yaml_interrogate : execute YAML-based validation workflows
+    """
+    validator = YAMLValidator()
+    config = validator.load_config(yaml)
+    # Only validate, don't execute the workflow
+    return None
+
+
+def yaml_to_python(yaml: Union[str, Path]) -> str:
+    """Convert YAML validation configuration to equivalent Python code.
+
+    This function takes a YAML validation configuration and generates the equivalent Python code
+    that would produce the same validation workflow. This is useful for documentation, code
+    generation, or learning how to translate YAML workflows into programmatic workflows.
+
+    The generated Python code includes all necessary imports, data loading, validation steps,
+    and interrogation execution, formatted as executable Python code.
+
+    Parameters
+    ----------
+    yaml
+        YAML configuration as string or file path. Can be: (1) a YAML string containing the
+        validation configuration, or (2) a Path object or string path to a YAML file.
+
+    Returns
+    -------
+    str
+        A formatted Python code string enclosed in markdown code blocks that replicates the YAML
+        workflow. The code includes import statements, data loading, validation method calls, and
+        interrogation execution.
+
+    Raises
+    ------
+    YAMLValidationError
+        If the YAML is invalid, malformed, or contains unknown validation methods.
+
+    Examples
+    --------
+    ```{python}
+    #| echo: false
+    #| output: false
+    import pointblank as pb
+    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
+    ```
+
+    Convert a basic YAML configuration to Python code:
+
+    ```{python}
+    import pointblank as pb
+
+    # Define a YAML validation workflow
+    yaml_config = '''
+    tbl: small_table
+    tbl_name: Data Quality Check
+    steps:
+    - col_vals_not_null:
+        columns: [a, b]
+    - col_vals_gt:
+        columns: [c]
+        value: 0
+    '''
+
+    # Generate equivalent Python code
+    python_code = pb.yaml_to_python(yaml_config)
+    print(python_code)
+    ```
+
+    The generated Python code shows exactly how to replicate the YAML workflow programmatically.
+    This is particularly useful when transitioning from YAML-based workflows to code-based
+    workflows, or when generating documentation that shows both YAML and Python approaches.
+
+    For more complex workflows with thresholds and metadata:
+
+    ```{python}
+    # Advanced YAML configuration
+    yaml_config = '''
+    tbl: small_table
+    tbl_name: Advanced Validation
+    label: Production data check
+    thresholds:
+      warning: 0.1
+      error: 0.2
+    steps:
+    - col_vals_between:
+        columns: [c]
+        left: 1
+        right: 10
+    - col_vals_regex:
+        columns: [b]
+        pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
+    '''
+
+    # Generate the equivalent Python code
+    python_code = pb.yaml_to_python(yaml_config)
+    print(python_code)
+    ```
+
+    The generated code includes all configuration parameters, thresholds, and maintains the exact
+    same validation logic as the original YAML workflow.
+
+    This function is also useful for educational purposes, helping users understand how YAML
+    configurations map to the underlying Python API calls.
+    """
+    # First, parse the raw YAML to detect Polars/Pandas expressions in the source code
+    if isinstance(yaml, Path):
+        yaml_content = yaml.read_text()
+    elif isinstance(yaml, str):
+        # Check if it's a file path (single line, reasonable length, no newlines)
+        if len(yaml) < 260 and "\n" not in yaml and Path(yaml).exists():
+            yaml_content = Path(yaml).read_text()
+        else:
+            yaml_content = yaml
+    else:
+        yaml_content = str(yaml)
+
+    # Track whether we need to import Polars and Pandas by analyzing the raw YAML content
+    needs_polars_import = False
+    needs_pandas_import = False
+
+    # Check for polars/pandas patterns in the raw YAML content
+    if "pd." in yaml_content or "pandas" in yaml_content:
+        needs_pandas_import = True
+    if "pl." in yaml_content or "polars" in yaml_content:
+        needs_polars_import = True
+
+    # Parse the raw YAML to extract original Python expressions before they get processed
+    import yaml as yaml_module
+
+    raw_config = yaml_module.safe_load(yaml_content)
+
+    # Extract the original tbl python expression if it exists
+    original_tbl_expression = None
+    if isinstance(raw_config.get("tbl"), dict) and "python" in raw_config["tbl"]:
+        original_tbl_expression = raw_config["tbl"]["python"].strip()
+
+    # Extract original Actions expressions if they exist
+    original_actions_expressions = {}
+    if "actions" in raw_config:
+        for key, value in raw_config["actions"].items():
+            if isinstance(value, dict) and "python" in value:
+                original_actions_expressions[key] = value["python"].strip()
+
+    # Define a function to recursively extract original Python expressions from step parameters
+    def extract_python_expressions(obj, path=""):
+        expressions = {}
+        if isinstance(obj, dict):
+            if "python" in obj and len(obj) == 1:
+                expressions[path] = obj["python"].strip()
+            else:
+                for key, value in obj.items():
+                    new_path = f"{path}.{key}" if path else key
+                    # Special handling for `expr=` and `pre=` parameters that
+                    # can use shortcut syntax
+                    if key in ["expr", "pre"] and isinstance(value, str):
+                        expressions[new_path] = value.strip()
+                    # Special handling for actions that might contain python: expressions
+                    elif key == "actions" and isinstance(value, dict):
+                        for action_key, action_value in value.items():
+                            if isinstance(action_value, dict) and "python" in action_value:
+                                expressions[f"{new_path}.{action_key}"] = action_value[
+                                    "python"
+                                ].strip()
+                    else:
+                        expressions.update(extract_python_expressions(value, new_path))
+        elif isinstance(obj, list):
+            for i, item in enumerate(obj):
+                new_path = f"{path}[{i}]"
+                expressions.update(extract_python_expressions(item, new_path))
+        return expressions
+
+    step_expressions = {}
+    if "steps" in raw_config:
+        for i, step in enumerate(raw_config["steps"]):
+            if isinstance(step, dict):
+                step_expressions.update(extract_python_expressions(step, f"steps[{i}]"))
+
+    # Load and validate the YAML configuration
+    validator = YAMLValidator()
+    config = validator.load_config(yaml)
+
+    # Start building the Python code
+    code_lines = []
+
+    # Add imports (we'll determine Polars/Pandas import need during processing)
+    imports = ["import pointblank as pb"]
+
+    # Build the chained validation call
+    code_lines.append("(")
+
+    # Build validation initialization arguments
+    validate_args = []
+
+    # Add data loading as first argument
+    tbl_spec = config["tbl"]
+    if isinstance(tbl_spec, str):
+        if tbl_spec.endswith((".csv", ".parquet")):
+            # File loading
+            validate_args.append(f'data=pb.load_dataset("{tbl_spec}")')
+        else:
+            # Dataset loading
+            validate_args.append(f'data=pb.load_dataset("{tbl_spec}")')
+    else:
+        # Use the original Python expression if we extracted it
+        if original_tbl_expression:
+            validate_args.append(f"data={original_tbl_expression}")
+        else:
+            # Fallback to placeholder if we couldn't extract the original expression
+            validate_args.append("data=<python_expression_result>")
+
+    # Add table name if present
+    if "tbl_name" in config:
+        validate_args.append(f'tbl_name="{config["tbl_name"]}"')
+
+    # Add `label=` if present
+    if "label" in config:
+        validate_args.append(f'label="{config["label"]}"')
+
+    # Add `thresholds=` if present: format as `pb.Thresholds()` for an idiomatic style
+    if "thresholds" in config:
+        thresholds_dict = config["thresholds"]
+        threshold_params = []
+        for key, value in thresholds_dict.items():
+            threshold_params.append(f"{key}={value}")
+        thresholds_str = "pb.Thresholds(" + ", ".join(threshold_params) + ")"
+        validate_args.append(f"thresholds={thresholds_str}")
+
+    # Add `actions=` if present: format as `pb.Actions()` for an idiomatic style
+    if "actions" in config:
+        actions_dict = config["actions"]
+        action_params = []
+        for key, value in actions_dict.items():
+            if key == "highest_only":
+                action_params.append(f"{key}={value}")
+            elif key in original_actions_expressions:
+                # Use the original Python expression for callables
+                action_params.append(f"{key}={original_actions_expressions[key]}")
+            elif isinstance(value, str):
+                action_params.append(f'{key}="{value}"')
+            else:
+                # For callables or complex expressions, use placeholder
+                action_params.append(f"{key}={value}")
+        actions_str = "pb.Actions(" + ", ".join(action_params) + ")"
+        validate_args.append(f"actions={actions_str}")
+
+    # Add language if present
+    if "lang" in config:
+        validate_args.append(f'lang="{config["lang"]}"')
+
+    # Add locale if present
+    if "locale" in config:
+        validate_args.append(f'locale="{config["locale"]}"')
+
+    # Add global brief if present
+    if "brief" in config:
+        if isinstance(config["brief"], bool):
+            validate_args.append(f"brief={str(config['brief'])}")
+        else:
+            validate_args.append(f'brief="{config["brief"]}"')
+
+    # Create the `pb.Validate()` call
+    if len(validate_args) == 1:
+        # Single argument fits on one line
+        code_lines.append(f"    pb.Validate({validate_args[0]})")
+    else:
+        # Multiple arguments: format each on its own line
+        code_lines.append("    pb.Validate(")
+        for i, arg in enumerate(validate_args):
+            if i == len(validate_args) - 1:
+                code_lines.append(f"        {arg},")
+            else:
+                code_lines.append(f"        {arg},")
+        code_lines.append("    )")
+
+    # Add validation steps as chained method calls
+    for step_index, step_config in enumerate(config["steps"]):
+        method_name, parameters = validator._parse_validation_step(step_config)
+
+        # Format parameters
+        param_parts = []
+        for key, value in parameters.items():
+            # Check if we have an original expression for this parameter
+            expression_path = f"steps[{step_index}].{list(step_config.keys())[0]}.{key}"
+            if expression_path in step_expressions:
+                # Use the original Python expression
+                param_parts.append(f"{key}={step_expressions[expression_path]}")
+            elif key in ["columns", "columns_subset"]:
+                if isinstance(value, list):
+                    if len(value) == 1:
+                        # Single column as string
+                        param_parts.append(f'{key}="{value[0]}"')
+                    else:
+                        # Multiple columns as list
+                        columns_str = "[" + ", ".join([f'"{col}"' for col in value]) + "]"
+                        param_parts.append(f"{key}={columns_str}")
+                else:
+                    param_parts.append(f'{key}="{value}"')
+            elif key == "brief":
+                # Handle `brief=` parameter: can be a boolean or a string
+                if isinstance(value, bool):
+                    param_parts.append(f"brief={str(value)}")
+                else:
+                    param_parts.append(f'brief="{value}"')
+            elif key == "actions":
+                # Handle actions parameter: format as `pb.Actions()`
+                if isinstance(value, Actions):
+                    # Already an `Actions` object, format its attributes
+                    action_params = []
+
+                    # Check for original expressions for each action level
+                    step_action_base = f"steps[{step_index}].{list(step_config.keys())[0]}.actions"
+
+                    if value.warning is not None:
+                        warning_expr_path = f"{step_action_base}.warning"
+                        if warning_expr_path in step_expressions:
+                            action_params.append(f"warning={step_expressions[warning_expr_path]}")
+                        elif isinstance(value.warning, list) and len(value.warning) == 1:
+                            action_params.append(f'warning="{value.warning[0]}"')
+                        else:
+                            action_params.append(f"warning={value.warning}")
+
+                    if value.error is not None:
+                        error_expr_path = f"{step_action_base}.error"
+                        if error_expr_path in step_expressions:
+                            action_params.append(f"error={step_expressions[error_expr_path]}")
+                        elif isinstance(value.error, list) and len(value.error) == 1:
+                            action_params.append(f'error="{value.error[0]}"')
+                        else:
+                            action_params.append(f"error={value.error}")
+
+                    if value.critical is not None:
+                        critical_expr_path = f"{step_action_base}.critical"
+                        if critical_expr_path in step_expressions:
+                            action_params.append(f"critical={step_expressions[critical_expr_path]}")
+                        elif isinstance(value.critical, list) and len(value.critical) == 1:
+                            action_params.append(f'critical="{value.critical[0]}"')
+                        else:
+                            action_params.append(f"critical={value.critical}")
+
+                    if hasattr(value, "highest_only") and value.highest_only is not True:
+                        action_params.append(f"highest_only={value.highest_only}")
+                    actions_str = "pb.Actions(" + ", ".join(action_params) + ")"
+                    param_parts.append(f"actions={actions_str}")
+                elif isinstance(value, dict):
+                    action_params = []
+                    step_action_base = f"steps[{step_index}].{list(step_config.keys())[0]}.actions"
+                    for action_key, action_value in value.items():
+                        if action_key == "highest_only":
+                            action_params.append(f"{action_key}={action_value}")
+                        else:
+                            # Check if we have an original expression for this action
+                            action_expr_path = f"{step_action_base}.{action_key}"
+                            if action_expr_path in step_expressions:
+                                action_params.append(
+                                    f"{action_key}={step_expressions[action_expr_path]}"
+                                )
+                            elif isinstance(action_value, str):
+                                action_params.append(f'{action_key}="{action_value}"')
+                            else:
+                                # For callables or complex expressions
+                                action_params.append(f"{action_key}={action_value}")
+                    actions_str = "pb.Actions(" + ", ".join(action_params) + ")"
+                    param_parts.append(f"actions={actions_str}")
+                else:
+                    param_parts.append(f"actions={value}")
+            elif key == "thresholds":
+                # Handle thresholds parameter: format as `pb.Thresholds()`
+                if isinstance(value, dict):
+                    threshold_params = []
+                    for threshold_key, threshold_value in value.items():
+                        threshold_params.append(f"{threshold_key}={threshold_value}")
+                    thresholds_str = "pb.Thresholds(" + ", ".join(threshold_params) + ")"
+                    param_parts.append(f"thresholds={thresholds_str}")
+                else:
+                    param_parts.append(f"thresholds={value}")
+            elif isinstance(value, str):
+                param_parts.append(f'{key}="{value}"')
+            elif isinstance(value, bool):
+                param_parts.append(f"{key}={str(value)}")
+            elif isinstance(value, tuple):
+                # Handle tuples like `inclusive=(False, True)`
+                tuple_str = "(" + ", ".join([str(item) for item in value]) + ")"
+                param_parts.append(f"{key}={tuple_str}")
+            elif isinstance(value, list):
+                # Handle lists/tuples (like `set=` parameter)
+                if all(isinstance(item, str) for item in value):
+                    list_str = "[" + ", ".join([f'"{item}"' for item in value]) + "]"
+                else:
+                    list_str = str(list(value))
+                param_parts.append(f"{key}={list_str}")
+            else:
+                # Handle complex objects (like polars/pandas expressions from python: blocks)
+                # For these, we'll use a placeholder since they can't be easily converted back
+                param_parts.append(f"{key}={value}")
+
+        if param_parts:
+            params_str = ", ".join(param_parts)
+            code_lines.append(f"    .{method_name}({params_str})")
+        else:
+            code_lines.append(f"    .{method_name}()")
+
+    # Add interrogation method call
+    code_lines.append("    .interrogate()")
+    code_lines.append(")")
+
+    # Add imports at the beginning
+    if needs_polars_import:
+        imports.append("import polars as pl")
+    if needs_pandas_import:
+        imports.append("import pandas as pd")
+
+    # Build final code with imports
+    final_code_lines = imports + [""] + code_lines
+
+    # Join all code lines and wrap in single markdown code block
+    python_code = "\n".join(final_code_lines)
+    return f"```python\n{python_code}\n```"
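To make the tail end of `yaml_to_python()` concrete: it emits a fluent `pb.Validate(...)` chain wrapped in a markdown fence, using the multi-line constructor form whenever more than one `Validate` argument is present. Hand-tracing the generator above for a small configuration suggests output along these lines (a sketch of the expected string, not captured from an actual run):

```python
import pointblank as pb

yaml_config = '''
tbl: small_table
tbl_name: demo
steps:
- col_vals_gt:
    columns: [d]
    value: 100
'''

print(pb.yaml_to_python(yaml_config))
# Hand-traced expectation (two Validate args trigger the multi-line form,
# and a single-column list collapses to columns="d"):
# ```python
# import pointblank as pb
#
# (
#     pb.Validate(
#         data=pb.load_dataset("small_table"),
#         tbl_name="demo",
#     )
#     .col_vals_gt(columns="d", value=100)
#     .interrogate()
# )
# ```
```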