pointblank 0.11.3__py3-none-any.whl → 0.11.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/yaml.py ADDED
@@ -0,0 +1,1386 @@
from __future__ import annotations

from pathlib import Path
from typing import Any, Union

import yaml

from pointblank.thresholds import Actions
from pointblank.validate import Validate, load_dataset


class YAMLValidationError(Exception):
    """Exception raised for YAML validation errors."""

    pass


def _safe_eval_python_code(code: str) -> Any:
    """Safely evaluate Python code with restricted namespace.

    This function provides a controlled environment for executing Python code embedded in YAML
    configurations. It includes common libraries and functions while restricting access to
    dangerous operations.

    Parameters
    ----------
    code
        The Python code to evaluate.

    Returns
    -------
    Any
        The result of evaluating the Python code.

    Raises
    ------
    YAMLValidationError
        If the code execution fails or contains unsafe operations.
    """
    import ast
    import re
    from pathlib import Path

    from pointblank._utils import _is_lib_present

    # Create a safe namespace with commonly needed imports
    safe_namespace = {
        "Path": Path,  # pathlib.Path
        "__builtins__": {
            # Allow basic built-in functions
            "len": len,
            "str": str,
            "int": int,
            "float": float,
            "bool": bool,
            "list": list,
            "dict": dict,
            "tuple": tuple,
            "set": set,
            "range": range,
            "enumerate": enumerate,
            "zip": zip,
            "sum": sum,
            "min": min,
            "max": max,
            "abs": abs,
            "round": round,
            "print": print,
        },
    }

    # Add pointblank itself to the namespace
    import pointblank as pb

    safe_namespace["pb"] = pb

    # Add polars if available
    if _is_lib_present("polars"):
        import polars as pl

        safe_namespace["pl"] = pl

    # Add pandas if available
    if _is_lib_present("pandas"):
        import pandas as pd

        safe_namespace["pd"] = pd

    # Check for dangerous patterns
    dangerous_patterns = [
        r"import\s+os",
        r"import\s+sys",
        r"import\s+subprocess",
        r"__import__",
        r"exec\s*\(",
        r"eval\s*\(",
        r"open\s*\(",
        r"file\s*\(",
        r"input\s*\(",
        r"raw_input\s*\(",
    ]

    for pattern in dangerous_patterns:
        if re.search(pattern, code, re.IGNORECASE):
            raise YAMLValidationError(
                f"Potentially unsafe Python code detected: '{code}'. "
                f"Pattern '{pattern}' is not allowed."
            )

    try:
        # First try to parse as an expression for simple cases
        try:
            parsed = ast.parse(code, mode="eval")
            return eval(compile(parsed, "<string>", "eval"), safe_namespace)
        except SyntaxError:
            # If that fails, try as a statement (for more complex code).
            # For multi-statement code, we need to capture the result of the last expression.
            parsed = ast.parse(code, mode="exec")

            # Check if the last node is an expression
            if parsed.body and isinstance(parsed.body[-1], ast.Expr):
                # Split the last expression from the statements
                statements = parsed.body[:-1]
                last_expr = parsed.body[-1].value

                # Execute the statements first
                if statements:
                    statements_module = ast.Module(body=statements, type_ignores=[])
                    exec(compile(statements_module, "<string>", "exec"), safe_namespace)

                # Then evaluate the last expression and return its value
                expr_module = ast.Expression(body=last_expr)
                return eval(compile(expr_module, "<string>", "eval"), safe_namespace)
            else:
                # No expression at the end, just execute statements
                exec(compile(parsed, "<string>", "exec"), safe_namespace)
                return None

    except Exception as e:
        raise YAMLValidationError(f"Error executing Python code '{code}': {e}")
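
# Editor's note (not part of the released file): a minimal sketch of how the
# safe evaluator behaves, assuming polars is installed so `pl` gets injected
# into the namespace.
#
#     _safe_eval_python_code("sum([1, 2, 3])")    # -> 6 (allowed builtin)
#     _safe_eval_python_code("x = 2\nx * 21")     # -> 42 (last expression wins)
#     _safe_eval_python_code("import os")         # raises YAMLValidationError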


def _process_python_expressions(value: Any) -> Any:
    """Process Python code snippets embedded in YAML values.

    This function supports the python: block syntax for embedding Python code:

    python: |
      import polars as pl
      pl.scan_csv("data.csv").head(10)

    Note: col_vals_expr() also supports a shortcut syntax where the expr parameter
    can be written directly without the python: wrapper:

    col_vals_expr:
      expr: |
        pl.col("column") > 0

    Parameters
    ----------
    value
        The value to process; this can be any YAML type.

    Returns
    -------
    Any
        The processed value with Python expressions evaluated.

    Examples
    --------
    >>> _process_python_expressions({"python": "pl.scan_csv('data.csv').head(10)"})
    # Returns the result of the Python expression

    >>> _process_python_expressions({"python": "import polars as pl\\npl.scan_csv('data.csv')"})
    # Returns the result of multiline Python code
    """
    if isinstance(value, dict):
        # Handle python: block syntax
        if "python" in value and len(value) == 1:
            code = value["python"]
            return _safe_eval_python_code(code)

        # Recursively process dictionary values
        return {k: _process_python_expressions(v) for k, v in value.items()}

    elif isinstance(value, list):
        # Recursively process list items
        return [_process_python_expressions(item) for item in value]

    else:
        # Return primitive types unchanged
        return value
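
# Editor's note (not part of the released file): only a single-key
# {"python": ...} dict is evaluated; other containers are recursed into and
# primitives pass through unchanged. For example:
#
#     _process_python_expressions({"value": {"python": "1 + 1"}, "note": "x"})
#     # -> {"value": 2, "note": "x"}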


class YAMLValidator:
    """Validates YAML configuration and converts to Validate objects."""

    # Map YAML method names to Python method names
    validation_method_map = {
        "col_exists": "col_exists",
        "col_vals_gt": "col_vals_gt",
        "col_vals_ge": "col_vals_ge",
        "col_vals_lt": "col_vals_lt",
        "col_vals_le": "col_vals_le",
        "col_vals_eq": "col_vals_eq",
        "col_vals_ne": "col_vals_ne",
        "col_vals_between": "col_vals_between",
        "col_vals_outside": "col_vals_outside",
        "col_vals_regex": "col_vals_regex",
        "col_vals_in_set": "col_vals_in_set",
        "col_vals_not_in_set": "col_vals_not_in_set",
        "col_vals_not_null": "col_vals_not_null",
        "col_vals_null": "col_vals_null",
        "col_vals_expr": "col_vals_expr",
        "rows_distinct": "rows_distinct",
        "rows_complete": "rows_complete",
        "col_count_match": "col_count_match",
        "row_count_match": "row_count_match",
        "col_schema_match": "col_schema_match",
    }

    def __init__(self):
        """Initialize the YAML validator."""
        pass

    def load_config(self, source: Union[str, Path]) -> dict:
        """Load and validate YAML configuration.

        Parameters
        ----------
        source
            YAML string or Path to YAML file.

        Returns
        -------
        dict
            Parsed and validated configuration dictionary.

        Raises
        ------
        YAMLValidationError
            If the YAML is invalid or malformed.
        """
        try:
            if isinstance(source, (str, Path)):
                if isinstance(source, Path):
                    # It's definitely a file path
                    with open(source, "r", encoding="utf-8") as f:
                        config = yaml.safe_load(f)
                elif isinstance(source, str):
                    # Check if it looks like YAML content
                    stripped = source.strip()
                    if (
                        stripped.startswith(("tbl:", "steps:"))
                        or "\n" in stripped
                        or ":" in stripped
                    ):
                        # Looks like YAML content
                        config = yaml.safe_load(source)
                    else:
                        # Assume it's a file path
                        with open(source, "r", encoding="utf-8") as f:
                            config = yaml.safe_load(f)
            else:
                raise YAMLValidationError(
                    f"Invalid source type: {type(source)}. Only YAML strings and file paths supported."
                )

            if not isinstance(config, dict):
                raise YAMLValidationError("YAML must contain a dictionary at the root level")

            self._validate_schema(config)
            return config

        except yaml.YAMLError as e:
            raise YAMLValidationError(f"Invalid YAML syntax: {e}")
        except Exception as e:
            raise YAMLValidationError(f"Error loading YAML configuration: {e}")
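
    # Editor's note (not part of the released file): the string heuristic above
    # means any string containing a newline or a colon is parsed as inline YAML;
    # everything else is treated as a file path. For example:
    #
    #     YAMLValidator().load_config("tbl: small_table\nsteps:\n- rows_distinct")
    #     # -> {"tbl": "small_table", "steps": ["rows_distinct"]}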

    def _validate_schema(self, config: dict) -> None:
        """Validate the YAML configuration schema.

        Parameters
        ----------
        config
            Configuration dictionary to validate.

        Raises
        ------
        YAMLValidationError
            If the schema is invalid.
        """
        # Check required fields
        if "tbl" not in config:
            raise YAMLValidationError("YAML must contain 'tbl' field")

        if "steps" not in config:
            raise YAMLValidationError("YAML must contain 'steps' field")

        if not isinstance(config["steps"], list):
            raise YAMLValidationError("'steps' must be a list")

        if len(config["steps"]) == 0:
            raise YAMLValidationError("'steps' cannot be empty")

        # Validate thresholds if present
        if "thresholds" in config:
            thresholds = config["thresholds"]
            if not isinstance(thresholds, dict):
                raise YAMLValidationError("'thresholds' must be a dictionary")

            for key, value in thresholds.items():
                if key not in ["warning", "error", "critical"]:
                    raise YAMLValidationError(
                        f"Invalid threshold key: {key}. Must be 'warning', 'error', or 'critical'"
                    )

                if not isinstance(value, (int, float)):
                    raise YAMLValidationError(f"Threshold '{key}' must be a number")

                if value < 0:
                    raise YAMLValidationError(f"Threshold '{key}' must be non-negative")

        # Validate actions if present
        if "actions" in config:
            actions = config["actions"]
            if not isinstance(actions, dict):
                raise YAMLValidationError("'actions' must be a dictionary")

            for key, value in actions.items():
                if key not in ["warning", "error", "critical", "default", "highest_only"]:
                    raise YAMLValidationError(
                        f"Invalid action key: {key}. Must be 'warning', 'error', 'critical', "
                        f"'default', or 'highest_only'"
                    )

                if key == "highest_only":
                    if not isinstance(value, bool):
                        raise YAMLValidationError(f"Action '{key}' must be a boolean")
                else:
                    # Action values can be strings or have python: block syntax for callables
                    if not isinstance(value, (str, dict, list)):
                        raise YAMLValidationError(
                            f"Action '{key}' must be a string, dictionary (for python: block), "
                            f"or list of strings/dictionaries"
                        )

    def _load_data_source(self, tbl_spec: str) -> Any:
        """Load data source based on table specification.

        Parameters
        ----------
        tbl_spec
            Data source specification. Can be (1) a dataset name for `load_dataset()`, (2) a CSV
            file path (relative or absolute), (3) a Parquet file path (relative or absolute), or
            (4) a Python code snippet to be executed for dynamic data loading.

        Returns
        -------
        Any
            The loaded data object.

        Raises
        ------
        YAMLValidationError
            If the data source cannot be loaded.
        """
        from pointblank.validate import _process_data

        try:
            # First, try to process as a Python expression
            processed_tbl_spec = _process_python_expressions(tbl_spec)

            # If processing returned a different object (not a string), use it directly
            if processed_tbl_spec is not tbl_spec or not isinstance(processed_tbl_spec, str):
                return processed_tbl_spec

            # Use the centralized data processing pipeline from validate.py.
            # This handles CSV files, Parquet files, and other data sources.
            processed_data = _process_data(processed_tbl_spec)

            # If _process_data returns the original string unchanged,
            # then it's not a file path, so try load_dataset
            if processed_data is processed_tbl_spec and isinstance(processed_tbl_spec, str):
                return load_dataset(processed_tbl_spec)
            else:
                return processed_data

        except Exception as e:
            raise YAMLValidationError(f"Failed to load data source '{tbl_spec}': {e}")
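
    # Editor's note (not part of the released file): the resolution order above
    # is python: blocks first, then file-like specs via `_process_data()`, then
    # built-in dataset names. The file path below is hypothetical:
    #
    #     self._load_data_source("small_table")     # falls through to load_dataset()
    #     self._load_data_source("data/sales.csv")  # handled by _process_data()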

    def _parse_column_spec(self, columns_expr: Any) -> list[str]:
        """Parse column specification from YAML.

        Handles standard YAML syntax for columns.

        Parameters
        ----------
        columns_expr
            Column specification (list or string).

        Returns
        -------
        list[str]
            List of column names.
        """
        if isinstance(columns_expr, list):
            return [str(col) for col in columns_expr]

        if isinstance(columns_expr, str):
            # Single column name
            return [columns_expr]

        # Fallback: convert to string
        return [str(columns_expr)]

    def _parse_schema_spec(self, schema_spec: Any) -> Any:
        """Parse schema specification from YAML.

        Converts dictionary-based schema definitions into Schema objects.

        Column specifications support multiple formats:

        - Scalar strings: "column_name" (name only, no type checking)
        - Lists with name and type: ["column_name", "data_type"]
        - Lists with name only: ["column_name"] (equivalent to scalar)

        Parameters
        ----------
        schema_spec
            Schema specification as a dictionary with a 'columns' field.

        Returns
        -------
        Schema
            A Schema object created from the specification.

        Raises
        ------
        YAMLValidationError
            If the schema specification is invalid.
        """
        from pointblank.schema import Schema

        # Handle dictionary specification only
        if isinstance(schema_spec, dict):
            if "columns" in schema_spec:
                # Convert the columns list to a `Schema` object
                columns_spec = schema_spec["columns"]

                if not isinstance(columns_spec, list):
                    raise YAMLValidationError(
                        "Schema 'columns' must be a list of column specifications"
                    )

                # Convert YAML column specs to `Schema` format
                schema_columns = []
                for col_spec in columns_spec:
                    if isinstance(col_spec, list):
                        if len(col_spec) == 1:
                            # Column name only: ["column_name"]
                            schema_columns.append((col_spec[0],))
                        elif len(col_spec) == 2:
                            # Column name and type: ["column_name", "type"]
                            schema_columns.append((col_spec[0], col_spec[1]))
                        else:
                            raise YAMLValidationError(
                                f"Column specification must have 1-2 elements, got: {col_spec}"
                            )
                    elif isinstance(col_spec, str):
                        # Just a column name as a string
                        schema_columns.append((col_spec,))
                    else:
                        raise YAMLValidationError(
                            f"Invalid column specification type: {type(col_spec)}"
                        )

                # Create the Schema object
                return Schema(columns=schema_columns)
            else:
                raise YAMLValidationError("Schema specification must contain 'columns' field")
        else:
            raise YAMLValidationError(
                f"Schema specification must be a dictionary, got: {type(schema_spec)}"
            )
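
    # Editor's note (not part of the released file): a YAML fragment this
    # method accepts, assuming the dtype strings are ones `Schema` understands:
    #
    #     schema:
    #       columns:
    #         - [date, Date]
    #         - [a, Int64]
    #         - b            # name-only entry, no type checking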

    def _parse_validation_step(self, step_config: Union[str, dict]) -> tuple[str, dict]:
        """Parse a single validation step from YAML configuration.

        Parameters
        ----------
        step_config
            Step configuration (string for parameterless steps, dict for others).

        Returns
        -------
        tuple[str, dict]
            Tuple of (method_name, parameters).

        Raises
        ------
        YAMLValidationError
            If the step configuration is invalid.
        """
        if isinstance(step_config, str):
            # Simple step with no parameters (e.g., "rows_distinct")
            method_name = step_config
            parameters = {}
        elif isinstance(step_config, dict):
            # Step with parameters
            if len(step_config) != 1:
                raise YAMLValidationError(
                    "Step configuration must contain exactly one validation method, "
                    f"got: {list(step_config.keys())}"
                )

            method_name = list(step_config.keys())[0]
            parameters = step_config[method_name] or {}

            if not isinstance(parameters, dict):
                raise YAMLValidationError(f"Parameters for '{method_name}' must be a dictionary")
        else:
            raise YAMLValidationError(f"Invalid step configuration type: {type(step_config)}")

        # Validate that we know this method
        if method_name not in self.validation_method_map:
            available_methods = list(self.validation_method_map.keys())
            raise YAMLValidationError(
                f"Unknown validation method '{method_name}'. Available methods: {available_methods}"
            )

        # Process Python expressions in all parameters
        processed_parameters = {}
        for key, value in parameters.items():
            # Special case: `col_vals_expr()`'s `expr=` parameter can use shortcut syntax
            if method_name == "col_vals_expr" and key == "expr" and isinstance(value, str):
                # Treat string directly as Python code (shortcut syntax)
                processed_parameters[key] = _safe_eval_python_code(value)
            # Special case: `pre=` parameter can use shortcut syntax (like `expr=`)
            elif key == "pre" and isinstance(value, str):
                # Treat string directly as Python code (shortcut syntax)
                processed_parameters[key] = _safe_eval_python_code(value)
            else:
                # Normal processing (requires python: block syntax)
                processed_parameters[key] = _process_python_expressions(value)
        parameters = processed_parameters

        # Convert `columns=` specification
        if "columns" in parameters:
            parameters["columns"] = self._parse_column_spec(parameters["columns"])

        #
        # Convert special parameter formats
        #

        # Convert `columns_subset=` if present (for `rows_[distinct|complete]()`)
        if "columns_subset" in parameters:
            parameters["columns_subset"] = self._parse_column_spec(parameters["columns_subset"])

        # Convert `schema=` if present (for `col_schema_match()`)
        if "schema" in parameters and method_name == "col_schema_match":
            parameters["schema"] = self._parse_schema_spec(parameters["schema"])

        # Convert `actions=` if present (ensure it's an Actions object)
        if "actions" in parameters:
            if isinstance(parameters["actions"], dict):
                parameters["actions"] = Actions(**parameters["actions"])

        # Handle `inclusive=` parameter for `col_vals_[between|outside]()` (convert list to tuple)
        if "inclusive" in parameters and isinstance(parameters["inclusive"], list):
            parameters["inclusive"] = tuple(parameters["inclusive"])

        return self.validation_method_map[method_name], parameters
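
    # Editor's note (not part of the released file): the two step shapes this
    # parser accepts, and what they normalize to:
    #
    #     self._parse_validation_step("rows_distinct")
    #     # -> ("rows_distinct", {})
    #     self._parse_validation_step({"col_vals_gt": {"columns": "d", "value": 100}})
    #     # -> ("col_vals_gt", {"columns": ["d"], "value": 100})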

    def build_validation(self, config: dict) -> Validate:
        """Convert YAML config to Validate object.

        Parameters
        ----------
        config
            Validated configuration dictionary.

        Returns
        -------
        Validate
            Validate object with configured validation steps.
        """
        # Load data source
        data = self._load_data_source(config["tbl"])

        # Create Validate object
        validate_kwargs = {}

        # Set table name if provided
        if "tbl_name" in config:
            validate_kwargs["tbl_name"] = config["tbl_name"]

        # Set label if provided
        if "label" in config:
            validate_kwargs["label"] = config["label"]

        # Set thresholds if provided
        if "thresholds" in config:
            validate_kwargs["thresholds"] = config["thresholds"]

        # Set actions if provided
        if "actions" in config:
            # Process actions: handle python: block syntax for callables
            processed_actions = _process_python_expressions(config["actions"])
            # Convert to Actions object
            validate_kwargs["actions"] = Actions(**processed_actions)

        # Set language if provided
        if "lang" in config:
            validate_kwargs["lang"] = config["lang"]

        # Set locale if provided
        if "locale" in config:
            validate_kwargs["locale"] = config["locale"]

        # Set global brief if provided
        if "brief" in config:
            validate_kwargs["brief"] = config["brief"]

        validation = Validate(data, **validate_kwargs)

        # Add validation steps
        for step_config in config["steps"]:
            method_name, parameters = self._parse_validation_step(step_config)

            # Get the method from the validation object
            method = getattr(validation, method_name)

            # Call the method with parameters
            validation = method(**parameters)

        return validation
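
    # Editor's note (not part of the released file): each step is dispatched by
    # name onto the fluent API, so the loop above is equivalent to, e.g.:
    #
    #     validation = getattr(validation, "col_vals_gt")(columns=["d"], value=100)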

    def execute_workflow(self, config: dict) -> Validate:
        """Execute a complete YAML validation workflow.

        Parameters
        ----------
        config
            Validated configuration dictionary.

        Returns
        -------
        Validate
            Interrogated Validate object with results.
        """
        # Build the validation plan
        validation = self.build_validation(config)

        # Execute interrogation to get results
        validation = validation.interrogate()

        return validation


def yaml_interrogate(yaml: Union[str, Path]) -> Validate:
    """Execute a YAML-based validation workflow.

    This is the main entry point for YAML-based validation workflows. It takes a YAML
    configuration (as a string or file path) and returns a `Validate` object with interrogation
    results.

    The YAML configuration defines the data source, validation steps, and optional settings like
    thresholds and labels. This function automatically loads the data, builds the validation plan,
    executes all validation steps, and returns the interrogated results.

    Parameters
    ----------
    yaml
        YAML configuration as string or file path. Can be: (1) a YAML string containing the
        validation configuration, or (2) a Path object or string path to a YAML file.

    Returns
    -------
    Validate
        An instance of the `Validate` class that has been configured based on the YAML input.
        This object contains the results of the validation steps defined in the YAML configuration.
        It includes metadata like table name, label, language, and thresholds if specified.

    Raises
    ------
    YAMLValidationError
        If the YAML is invalid, malformed, or execution fails. This includes syntax errors, missing
        required fields, unknown validation methods, or data loading failures.

    Examples
    --------
    ```{python}
    #| echo: false
    #| output: false
    import pointblank as pb
    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
    ```
    For the examples here, we'll use YAML configurations to define validation workflows. Let's start
    with a basic YAML workflow that validates the built-in `small_table` dataset.

    ```{python}
    import pointblank as pb

    # Define a basic YAML validation workflow
    yaml_config = '''
    tbl: small_table
    steps:
    - rows_distinct
    - col_exists:
        columns: [date, a, b]
    '''

    # Execute the validation workflow
    result = pb.yaml_interrogate(yaml_config)
    result
    ```

    The validation table shows the results of our YAML-defined workflow. We can see that the
    `rows_distinct()` validation failed (because there are duplicate rows in the table), while the
    column existence checks passed.

    Now let's create a more comprehensive validation workflow with thresholds and metadata:

    ```{python}
    # Advanced YAML configuration with thresholds and metadata
    yaml_config = '''
    tbl: small_table
    tbl_name: small_table_demo
    label: Comprehensive data validation
    thresholds:
      warning: 0.1
      error: 0.25
      critical: 0.35
    steps:
    - col_vals_gt:
        columns: [d]
        value: 100
    - col_vals_regex:
        columns: [b]
        pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
    - col_vals_not_null:
        columns: [date, a]
    '''

    # Execute the validation workflow
    result = pb.yaml_interrogate(yaml_config)
    print(f"Table name: {result.tbl_name}")
    print(f"Label: {result.label}")
    print(f"Total validation steps: {len(result.validation_info)}")
    ```

    The validation results now include our custom table name and label. The thresholds we defined
    will determine when validation steps are marked as warnings, errors, or critical failures.

    You can also load YAML configurations from files. Here's how you would work with a YAML file:

    ```{python}
    from pathlib import Path
    import tempfile

    # Create a temporary YAML file for demonstration
    yaml_content = '''
    tbl: small_table
    tbl_name: File-based Validation
    steps:
    - col_vals_between:
        columns: [c]
        left: 1
        right: 10
    - col_vals_in_set:
        columns: [f]
        set: [low, mid, high]
    '''

    with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f:
        f.write(yaml_content)
        yaml_file_path = Path(f.name)

    # Load and execute validation from file
    result = pb.yaml_interrogate(yaml_file_path)
    result
    ```

    This approach is particularly useful for storing validation configurations as part of your data
    pipeline or version control system, allowing you to maintain validation rules alongside your
    code.
    """
    validator = YAMLValidator()
    config = validator.load_config(yaml)
    return validator.execute_workflow(config)


def load_yaml_config(file_path: Union[str, Path]) -> dict:
    """Load YAML configuration from a file or string.

    Parameters
    ----------
    file_path
        Path to a YAML file, or a YAML content string.

    Returns
    -------
    dict
        Parsed configuration dictionary.

    Raises
    ------
    YAMLValidationError
        If the file cannot be loaded or is invalid.
    """
    validator = YAMLValidator()
    return validator.load_config(file_path)


def validate_yaml(yaml: Union[str, Path]) -> None:
    """Validate YAML configuration against the expected structure.

    This function validates that a YAML configuration conforms to the expected structure for
    validation workflows. It checks for required fields, proper data types, and valid
    validation method names. This is useful for validating configurations before execution or
    for building configuration editors and validators.

    The function performs comprehensive validation including:

    - required fields ('tbl' and 'steps')
    - proper data types for all fields
    - valid threshold configurations
    - known validation method names
    - proper step configuration structure

    Parameters
    ----------
    yaml
        YAML configuration as string or file path. Can be: (1) a YAML string containing the
        validation configuration, or (2) a Path object or string path to a YAML file.

    Raises
    ------
    YAMLValidationError
        If the YAML is invalid, malformed, or execution fails. This includes syntax errors,
        missing required fields, unknown validation methods, or data loading failures.

    Examples
    --------
    ```{python}
    #| echo: false
    #| output: false
    import pointblank as pb
    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
    ```
    For the examples here, we'll demonstrate how to validate YAML configurations before using them
    with validation workflows. This is particularly useful for building robust data validation
    systems where you want to catch configuration errors early.

    Let's start with validating a basic configuration:

    ```{python}
    import pointblank as pb

    # Define a basic YAML validation configuration
    yaml_config = '''
    tbl: small_table
    steps:
    - rows_distinct
    - col_exists:
        columns: [a, b]
    '''

    # Validate the configuration: no exception means it's valid
    pb.validate_yaml(yaml_config)
    print("Basic YAML configuration is valid")
    ```

    The function completed without raising an exception, which means our configuration is valid and
    follows the expected structure.

    Now let's validate a more complex configuration with thresholds and metadata:

    ```{python}
    # Complex YAML configuration with all optional fields
    yaml_config = '''
    tbl: small_table
    tbl_name: My Dataset
    label: Quality check
    lang: en
    locale: en
    thresholds:
      warning: 0.1
      error: 0.25
      critical: 0.35
    steps:
    - rows_distinct
    - col_vals_gt:
        columns: [d]
        value: 100
    - col_vals_regex:
        columns: [b]
        pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
    '''

    # Validate the configuration
    pb.validate_yaml(yaml_config)
    print("Complex YAML configuration is valid")

    # Count the validation steps
    import pointblank.yaml as pby
    config = pby.load_yaml_config(yaml_config)
    print(f"Configuration has {len(config['steps'])} validation steps")
    ```

    This configuration includes all the optional metadata fields and complex validation steps,
    demonstrating that the validation handles the full range of supported options.

    Let's see what happens when we try to validate an invalid configuration:

    ```{python}
    # Invalid YAML configuration: missing the required 'tbl' field
    invalid_yaml = '''
    steps:
    - rows_distinct
    '''

    try:
        pb.validate_yaml(invalid_yaml)
    except pb.yaml.YAMLValidationError as e:
        print(f"Validation failed: {e}")
    ```

    The validation correctly identifies that our configuration is missing the required `'tbl'`
    field.

    Here's a practical example of using validation in a workflow builder:

    ```{python}
    def safe_yaml_interrogate(yaml_config):
        \"\"\"Safely execute a YAML configuration after validation.\"\"\"
        try:
            # Validate the YAML configuration first
            pb.validate_yaml(yaml_config)
            print("✓ YAML configuration is valid")

            # Then execute the workflow
            result = pb.yaml_interrogate(yaml_config)
            print(f"Validation completed with {len(result.validation_info)} steps")
            return result

        except pb.yaml.YAMLValidationError as e:
            print(f"Configuration error: {e}")
            return None

    # Test with a valid YAML configuration
    test_yaml = '''
    tbl: small_table
    steps:
    - col_vals_between:
        columns: [c]
        left: 1
        right: 10
    '''

    result = safe_yaml_interrogate(test_yaml)
    ```

    This pattern of validating before executing helps build more reliable data validation pipelines
    by catching configuration errors early in the process.

    Note that this function only validates the structure and does not check whether the specified
    data source ('tbl') exists or is accessible. Data source validation occurs during execution
    with `yaml_interrogate()`.

    See Also
    --------
    yaml_interrogate : execute YAML-based validation workflows
    """
    validator = YAMLValidator()
    # Only validate (parsing plus schema checks); don't execute the workflow
    validator.load_config(yaml)
    return None


def yaml_to_python(yaml: Union[str, Path]) -> str:
    """Convert YAML validation configuration to equivalent Python code.

    This function takes a YAML validation configuration and generates the equivalent Python code
    that would produce the same validation workflow. This is useful for documentation, code
    generation, or learning how to translate YAML workflows into programmatic workflows.

    The generated Python code includes all necessary imports, data loading, validation steps,
    and interrogation execution, formatted as executable Python code.

    Parameters
    ----------
    yaml
        YAML configuration as string or file path. Can be: (1) a YAML string containing the
        validation configuration, or (2) a Path object or string path to a YAML file.

    Returns
    -------
    str
        A formatted Python code string enclosed in markdown code blocks that replicates the YAML
        workflow. The code includes import statements, data loading, validation method calls, and
        interrogation execution.

    Raises
    ------
    YAMLValidationError
        If the YAML is invalid, malformed, or contains unknown validation methods.

    Examples
    --------
    ```{python}
    #| echo: false
    #| output: false
    import pointblank as pb
    pb.config(report_incl_header=False, report_incl_footer=False, preview_incl_header=False)
    ```

    Convert a basic YAML configuration to Python code:

    ```{python}
    import pointblank as pb

    # Define a YAML validation workflow
    yaml_config = '''
    tbl: small_table
    tbl_name: Data Quality Check
    steps:
    - col_vals_not_null:
        columns: [a, b]
    - col_vals_gt:
        columns: [c]
        value: 0
    '''

    # Generate equivalent Python code
    python_code = pb.yaml_to_python(yaml_config)
    print(python_code)
    ```

    The generated Python code shows exactly how to replicate the YAML workflow programmatically.
    This is particularly useful when transitioning from YAML-based workflows to code-based
    workflows, or when generating documentation that shows both YAML and Python approaches.

    For more complex workflows with thresholds and metadata:

    ```{python}
    # Advanced YAML configuration
    yaml_config = '''
    tbl: small_table
    tbl_name: Advanced Validation
    label: Production data check
    thresholds:
      warning: 0.1
      error: 0.2
    steps:
    - col_vals_between:
        columns: [c]
        left: 1
        right: 10
    - col_vals_regex:
        columns: [b]
        pattern: '[0-9]-[a-z]{3}-[0-9]{3}'
    '''

    # Generate the equivalent Python code
    python_code = pb.yaml_to_python(yaml_config)
    print(python_code)
    ```

    The generated code includes all configuration parameters and thresholds, and maintains exactly
    the same validation logic as the original YAML workflow.

    This function is also useful for educational purposes, helping users understand how YAML
    configurations map to the underlying Python API calls.
    """
    # First, parse the raw YAML to detect Polars/Pandas expressions in the source code
    if isinstance(yaml, Path):
        yaml_content = yaml.read_text()
    elif isinstance(yaml, str):
        # Check if it's a file path (reasonable length, no newlines, and the file exists)
        if len(yaml) < 260 and "\n" not in yaml and Path(yaml).exists():
            yaml_content = Path(yaml).read_text()
        else:
            yaml_content = yaml
    else:
        yaml_content = str(yaml)

    # Track whether we need to import Polars and Pandas by analyzing the raw YAML content
    needs_polars_import = False
    needs_pandas_import = False

    # Check for polars/pandas patterns in the raw YAML content
    if "pd." in yaml_content or "pandas" in yaml_content:
        needs_pandas_import = True
    if "pl." in yaml_content or "polars" in yaml_content:
        needs_polars_import = True

    # Parse the raw YAML to extract original Python expressions before they get processed
    import yaml as yaml_module

    raw_config = yaml_module.safe_load(yaml_content)

    # Extract the original tbl python expression if it exists
    original_tbl_expression = None
    if isinstance(raw_config.get("tbl"), dict) and "python" in raw_config["tbl"]:
        original_tbl_expression = raw_config["tbl"]["python"].strip()

    # Extract original Actions expressions if they exist
    original_actions_expressions = {}
    if "actions" in raw_config:
        for key, value in raw_config["actions"].items():
            if isinstance(value, dict) and "python" in value:
                original_actions_expressions[key] = value["python"].strip()

    # Define a function to recursively extract original Python expressions from step parameters
    def extract_python_expressions(obj, path=""):
        expressions = {}
        if isinstance(obj, dict):
            if "python" in obj and len(obj) == 1:
                expressions[path] = obj["python"].strip()
            else:
                for key, value in obj.items():
                    new_path = f"{path}.{key}" if path else key
                    # Special handling for `expr=` and `pre=` parameters that
                    # can use shortcut syntax
                    if key in ["expr", "pre"] and isinstance(value, str):
                        expressions[new_path] = value.strip()
                    # Special handling for actions that might contain python: expressions
                    elif key == "actions" and isinstance(value, dict):
                        for action_key, action_value in value.items():
                            if isinstance(action_value, dict) and "python" in action_value:
                                expressions[f"{new_path}.{action_key}"] = action_value[
                                    "python"
                                ].strip()
                    else:
                        expressions.update(extract_python_expressions(value, new_path))
        elif isinstance(obj, list):
            for i, item in enumerate(obj):
                new_path = f"{path}[{i}]"
                expressions.update(extract_python_expressions(item, new_path))
        return expressions

    step_expressions = {}
    if "steps" in raw_config:
        for i, step in enumerate(raw_config["steps"]):
            if isinstance(step, dict):
                step_expressions.update(extract_python_expressions(step, f"steps[{i}]"))

    # Load and validate the YAML configuration
    validator = YAMLValidator()
    config = validator.load_config(yaml)

    # Start building the Python code
    code_lines = []

    # Add imports (we'll determine the Polars/Pandas import need during processing)
    imports = ["import pointblank as pb"]

    # Build the chained validation call
    code_lines.append("(")

    # Build validation initialization arguments
    validate_args = []

    # Add data loading as the first argument
    tbl_spec = config["tbl"]
    if isinstance(tbl_spec, str):
        # Dataset name or file path (both emit a `load_dataset()` call, so the
        # previously duplicated branches are collapsed here)
        validate_args.append(f'data=pb.load_dataset("{tbl_spec}")')
    else:
        # Use the original Python expression if we extracted it
        if original_tbl_expression:
            validate_args.append(f"data={original_tbl_expression}")
        else:
            # Fall back to a placeholder if we couldn't extract the original expression
            validate_args.append("data=<python_expression_result>")

    # Add table name if present
    if "tbl_name" in config:
        validate_args.append(f'tbl_name="{config["tbl_name"]}"')

    # Add `label=` if present
    if "label" in config:
        validate_args.append(f'label="{config["label"]}"')

    # Add `thresholds=` if present: format as `pb.Thresholds()` for an idiomatic style
    if "thresholds" in config:
        thresholds_dict = config["thresholds"]
        threshold_params = []
        for key, value in thresholds_dict.items():
            threshold_params.append(f"{key}={value}")
        thresholds_str = "pb.Thresholds(" + ", ".join(threshold_params) + ")"
        validate_args.append(f"thresholds={thresholds_str}")

    # Add `actions=` if present: format as `pb.Actions()` for an idiomatic style
    if "actions" in config:
        actions_dict = config["actions"]
        action_params = []
        for key, value in actions_dict.items():
            if key == "highest_only":
                action_params.append(f"{key}={value}")
            elif key in original_actions_expressions:
                # Use the original Python expression for callables
                action_params.append(f"{key}={original_actions_expressions[key]}")
            elif isinstance(value, str):
                action_params.append(f'{key}="{value}"')
            else:
                # For callables or complex expressions, use a placeholder
                action_params.append(f"{key}={value}")
        actions_str = "pb.Actions(" + ", ".join(action_params) + ")"
        validate_args.append(f"actions={actions_str}")

    # Add language if present
    if "lang" in config:
        validate_args.append(f'lang="{config["lang"]}"')

    # Add locale if present
    if "locale" in config:
        validate_args.append(f'locale="{config["locale"]}"')

    # Add global brief if present
    if "brief" in config:
        if isinstance(config["brief"], bool):
            validate_args.append(f"brief={str(config['brief'])}")
        else:
            validate_args.append(f'brief="{config["brief"]}"')

    # Create the `pb.Validate()` call
    if len(validate_args) == 1:
        # A single argument fits on one line
        code_lines.append(f"    pb.Validate({validate_args[0]})")
    else:
        # Multiple arguments: format each on its own line (every argument gets a
        # trailing comma, so the previously duplicated loop branches are collapsed)
        code_lines.append("    pb.Validate(")
        for arg in validate_args:
            code_lines.append(f"        {arg},")
        code_lines.append("    )")

    # Add validation steps as chained method calls
    for step_index, step_config in enumerate(config["steps"]):
        method_name, parameters = validator._parse_validation_step(step_config)

        # Format parameters
        param_parts = []
        for key, value in parameters.items():
            # Check if we have an original expression for this parameter
            expression_path = f"steps[{step_index}].{list(step_config.keys())[0]}.{key}"
            if expression_path in step_expressions:
                # Use the original Python expression
                param_parts.append(f"{key}={step_expressions[expression_path]}")
            elif key in ["columns", "columns_subset"]:
                if isinstance(value, list):
                    if len(value) == 1:
                        # Single column as string
                        param_parts.append(f'{key}="{value[0]}"')
                    else:
                        # Multiple columns as list
                        columns_str = "[" + ", ".join([f'"{col}"' for col in value]) + "]"
                        param_parts.append(f"{key}={columns_str}")
                else:
                    param_parts.append(f'{key}="{value}"')
            elif key == "brief":
                # Handle the `brief=` parameter: can be a boolean or a string
                if isinstance(value, bool):
                    param_parts.append(f"brief={str(value)}")
                else:
                    param_parts.append(f'brief="{value}"')
            elif key == "actions":
                # Handle the actions parameter: format as `pb.Actions()`
                if isinstance(value, Actions):
                    # Already an `Actions` object, format its attributes
                    action_params = []

                    # Check for original expressions for each action level
                    step_action_base = f"steps[{step_index}].{list(step_config.keys())[0]}.actions"

                    if value.warning is not None:
                        warning_expr_path = f"{step_action_base}.warning"
                        if warning_expr_path in step_expressions:
                            action_params.append(f"warning={step_expressions[warning_expr_path]}")
                        elif isinstance(value.warning, list) and len(value.warning) == 1:
                            action_params.append(f'warning="{value.warning[0]}"')
                        else:
                            action_params.append(f"warning={value.warning}")

                    if value.error is not None:
                        error_expr_path = f"{step_action_base}.error"
                        if error_expr_path in step_expressions:
                            action_params.append(f"error={step_expressions[error_expr_path]}")
                        elif isinstance(value.error, list) and len(value.error) == 1:
                            action_params.append(f'error="{value.error[0]}"')
                        else:
                            action_params.append(f"error={value.error}")

                    if value.critical is not None:
                        critical_expr_path = f"{step_action_base}.critical"
                        if critical_expr_path in step_expressions:
                            action_params.append(f"critical={step_expressions[critical_expr_path]}")
                        elif isinstance(value.critical, list) and len(value.critical) == 1:
                            action_params.append(f'critical="{value.critical[0]}"')
                        else:
                            action_params.append(f"critical={value.critical}")

                    if hasattr(value, "highest_only") and value.highest_only is not True:
                        action_params.append(f"highest_only={value.highest_only}")
                    actions_str = "pb.Actions(" + ", ".join(action_params) + ")"
                    param_parts.append(f"actions={actions_str}")
                elif isinstance(value, dict):
                    action_params = []
                    step_action_base = f"steps[{step_index}].{list(step_config.keys())[0]}.actions"
                    for action_key, action_value in value.items():
                        if action_key == "highest_only":
                            action_params.append(f"{action_key}={action_value}")
                        else:
                            # Check if we have an original expression for this action
                            action_expr_path = f"{step_action_base}.{action_key}"
                            if action_expr_path in step_expressions:
                                action_params.append(
                                    f"{action_key}={step_expressions[action_expr_path]}"
                                )
                            elif isinstance(action_value, str):
                                action_params.append(f'{action_key}="{action_value}"')
                            else:
                                # For callables or complex expressions
                                action_params.append(f"{action_key}={action_value}")
                    actions_str = "pb.Actions(" + ", ".join(action_params) + ")"
                    param_parts.append(f"actions={actions_str}")
                else:
                    param_parts.append(f"actions={value}")
            elif key == "thresholds":
                # Handle the thresholds parameter: format as `pb.Thresholds()`
                if isinstance(value, dict):
                    threshold_params = []
                    for threshold_key, threshold_value in value.items():
                        threshold_params.append(f"{threshold_key}={threshold_value}")
                    thresholds_str = "pb.Thresholds(" + ", ".join(threshold_params) + ")"
                    param_parts.append(f"thresholds={thresholds_str}")
                else:
                    param_parts.append(f"thresholds={value}")
            elif isinstance(value, str):
                param_parts.append(f'{key}="{value}"')
            elif isinstance(value, bool):
                param_parts.append(f"{key}={str(value)}")
            elif isinstance(value, tuple):
                # Handle tuples like `inclusive=(False, True)`
                tuple_str = "(" + ", ".join([str(item) for item in value]) + ")"
                param_parts.append(f"{key}={tuple_str}")
            elif isinstance(value, list):
                # Handle lists (like the `set=` parameter)
                if all(isinstance(item, str) for item in value):
                    list_str = "[" + ", ".join([f'"{item}"' for item in value]) + "]"
                else:
                    list_str = str(list(value))
                param_parts.append(f"{key}={list_str}")
            else:
                # Handle complex objects (like polars/pandas expressions from python: blocks).
                # For these, we'll use a placeholder since they can't be easily converted back.
                param_parts.append(f"{key}={value}")

        if param_parts:
            params_str = ", ".join(param_parts)
            code_lines.append(f"    .{method_name}({params_str})")
        else:
            code_lines.append(f"    .{method_name}()")

    # Add the interrogation method call
    code_lines.append("    .interrogate()")
    code_lines.append(")")

    # Add imports at the beginning
    if needs_polars_import:
        imports.append("import polars as pl")
    if needs_pandas_import:
        imports.append("import pandas as pd")

    # Build the final code with imports
    final_code_lines = imports + [""] + code_lines

    # Join all code lines and wrap in a single markdown code block
    python_code = "\n".join(final_code_lines)
    return f"```python\n{python_code}\n```"
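
Editor's note: for a minimal configuration such as `tbl: small_table` with a single
`rows_distinct` step, the markdown string returned by `yaml_to_python()` should look roughly
like the following (reconstructed from the generation logic above; the exact whitespace of the
generated call chain is an assumption):

```python
import pointblank as pb

(
    pb.Validate(data=pb.load_dataset("small_table"))
    .rows_distinct()
    .interrogate()
)
```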