pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +117 -0
- pointblank/_constants_translations.py +487 -2
- pointblank/_interrogation.py +1065 -12
- pointblank/_spec_utils.py +1015 -0
- pointblank/_utils.py +17 -7
- pointblank/_utils_ai.py +875 -0
- pointblank/assistant.py +1 -1
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +1838 -130
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/draft.py +52 -3
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +3069 -437
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
- pointblank-0.15.0.dist-info/RECORD +56 -0
- pointblank-0.13.4.dist-info/RECORD +0 -39
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
pointblank/data/validations/README.md ADDED
@@ -0,0 +1,108 @@
# Validation Serialization Test Infrastructure

This directory contains test files and utilities for ensuring serialization compatibility of pointblank validation objects across versions.

## Overview

The serialization functionality in pointblank allows validation objects to be saved to disk and reloaded later. To ensure this works correctly across different versions and with various types of preprocessing functions, we maintain a collection of reference validation files for regression testing.

## Files

### Preprocessing Functions (`preprocessing_functions.py`)

Contains the preprocessing functions used in the validation examples (the simplest one is reproduced after this list):

- `double_column_a()` - Simple column transformation
- `add_computed_column()` - Creates computed columns
- `filter_by_d_gt_100()` - Filtering operations
- `narwhals_median_transform()` - Cross-backend compatible function using narwhals
- `complex_preprocessing()` - Complex multi-step transformations
- `pandas_compatible_transform()` - Function that works with both pandas and polars
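
For reference, `double_column_a()` (reproduced from `preprocessing_functions.py`) shows the shape these functions take: a plain module-level function that accepts a DataFrame and returns a transformed one.

```python
import polars as pl

def double_column_a(df):
    """Double the values in column 'a'."""
    return df.with_columns(pl.col("a") * 2)
```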

### Test File Generator (`generate_test_files.py`)

Script that creates reference validation objects with various preprocessing functions:

- Creates test datasets
- Defines validation objects with different preprocessing scenarios
- Saves both pickle (`.pkl`) and JSON (`.json`) files
- Interrogates each validation object to populate results

### Test Cases (`tests/test_serialization_compat.py`)

Comprehensive tests for serialization functionality, located in the main tests directory (a sketch of the roundtrip pattern follows this list):

- **Roundtrip testing**: Pickle and unpickle validation objects
- **Preprocessing preservation**: Verify functions are correctly serialized
- **Cross-backend compatibility**: Test that narwhals functions work after deserialization
- **Complex workflows**: Multi-step validation with different preprocessing functions
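
A minimal sketch of the roundtrip pattern (the test name here is illustrative, not the literal contents of `tests/test_serialization_compat.py`; `get_json_report()` is the same reporting method `generate_test_files.py` uses):

```python
import pickle
from pathlib import Path

def test_simple_preprocessing_roundtrip():
    # Load a reference validation object produced by generate_test_files.py
    ref = Path("pointblank/data/validations/simple_preprocessing.pkl")
    with open(ref, "rb") as f:
        validation = pickle.load(f)

    # Serialize and deserialize again; the reloaded object should report
    # exactly the same stored interrogation results as the reference object
    restored = pickle.loads(pickle.dumps(validation))
    assert restored.get_json_report() == validation.get_json_report()
```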

### Generated Files

The following validation files are generated for regression testing:

#### Basic Validation Examples

- `no_preprocessing.pkl/.json` - Control case without preprocessing
- `simple_preprocessing.pkl/.json` - Basic single-function preprocessing

#### Advanced Validation Examples

- `complex_preprocessing.pkl/.json` - Multi-step transformations
- `multiple_steps.pkl/.json` - Different preprocessing per validation step
- `narwhals_function.pkl/.json` - Cross-backend compatible function
- `pandas_compatible.pkl/.json` - Function that works with multiple backends

## Usage

### Running Tests

```bash
# Run all serialization compatibility tests
python -m pytest tests/test_serialization_compat.py -v

# Generate new test files (if functions change)
cd pointblank/data/validations
python generate_test_files.py
```

### Adding New Test Cases

Follow these steps (a sketch with hypothetical names follows this list):

1. Add new preprocessing functions to `preprocessing_functions.py`
2. Update `generate_test_files.py` to create validations using the new functions
3. Add corresponding test cases in `tests/test_serialization_compat.py`
4. Regenerate the test files: `python generate_test_files.py`
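
For example, steps 1 and 2 might look like this (`halve_column_b` and the `"halved_b"` key are hypothetical names, shown only to illustrate the pattern already used in `generate_test_files.py`):

```python
# In preprocessing_functions.py
def halve_column_b(df):
    """Halve the values in column 'b'."""
    return df.with_columns(pl.col("b") / 2)

# In create_validation_examples() within generate_test_files.py
validations["halved_b"] = pb.Validate(data, tbl_name="test_data").col_vals_gt(
    "b", value=0, pre=halve_column_b
)
```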

## Version Compatibility

These reference files serve as regression tests to ensure:

- New versions can load validation files created with previous versions
- Preprocessing functions are correctly preserved across serialization
- Cross-backend compatibility is maintained
- Complex workflows continue to work after deserialization

The pickle files are the authoritative test cases, while the JSON files provide human-readable versions for debugging (loading one of the reference pickles is shown below).
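
Loading a reference file requires nothing beyond the standard library plus pointblank itself (the path below assumes you are at the repository root):

```python
import pickle

with open("pointblank/data/validations/no_preprocessing.pkl", "rb") as f:
    validation = pickle.load(f)

# The stored interrogation results are available immediately
print(validation.get_json_report())
```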

## Best Practices

### For Preprocessing Functions

- Always use proper function definitions (not lambdas), since module-level functions serialize reliably
- Import required libraries inside functions for self-contained serialization
- Use narwhals for cross-backend compatibility when possible (see the sketch after this list)
- Test that functions work with both Polars and pandas DataFrames
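
A minimal sketch that follows these practices (`scale_column_d` is a hypothetical name; note that the actual functions in `preprocessing_functions.py` import narwhals at module level instead):

```python
def scale_column_d(df):
    """Scale column 'd' on any backend supported by narwhals."""
    import narwhals as nw  # imported inside the function for self-contained serialization

    return nw.from_native(df).with_columns(nw.col("d") / 10).to_native()
```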

### For Test Coverage

- Include examples of each type of preprocessing function
- Test both simple and complex multi-step workflows
- Verify roundtrip serialization (pickle → unpickle → pickle again), as sketched below
- Check that deserialized functions produce expected results
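
The last two points can be checked together (a sketch; it assumes it is run from this directory so that `preprocessing_functions` and `generate_test_files` are importable):

```python
import pickle

from generate_test_files import create_test_data
from preprocessing_functions import double_column_a

data = create_test_data()

# pickle -> unpickle -> pickle again: the two payloads should agree
payload = pickle.dumps(double_column_a)
restored = pickle.loads(payload)
assert pickle.dumps(restored) == payload

# the restored function should transform fresh data exactly like the original
assert restored(data).equals(double_column_a(data))
```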

### For Maintenance

- Regenerate test files when adding new preprocessing function types
- Keep test functions focused and well-documented
- Update tests when validation object structure changes
- Document any breaking changes that affect serialization compatibility
pointblank/data/validations/complex_preprocessing.json ADDED
@@ -0,0 +1,54 @@
[
    {
        "i": 1,
        "i_o": 1,
        "assertion_type": "col_vals_gt",
        "column": "a_doubled",
        "values": 0,
        "inclusive": null,
        "na_pass": false,
        "pre": "def complex_preprocessing(df):\n    \"\"\"Complex preprocessing combining multiple operations.\"\"\"\n    return (\n        df.filter(pl.col(\"a\") > 1)\n        .with_columns((pl.col(\"a\") * 2).alias(\"a_doubled\"), (pl.col(\"d\") / 10).alias(\"d_scaled\"))\n        .filter(pl.col(\"d_scaled\") > 10)\n    )",
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 7,
        "n_passed": 7,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.706+00:00",
        "proc_duration_s": 0.00161
    },
    {
        "i": 2,
        "i_o": 2,
        "assertion_type": "col_vals_gt",
        "column": "d_scaled",
        "values": 15,
        "inclusive": null,
        "na_pass": false,
        "pre": "def complex_preprocessing(df):\n    \"\"\"Complex preprocessing combining multiple operations.\"\"\"\n    return (\n        df.filter(pl.col(\"a\") > 1)\n        .with_columns((pl.col(\"a\") * 2).alias(\"a_doubled\"), (pl.col(\"d\") / 10).alias(\"d_scaled\"))\n        .filter(pl.col(\"d_scaled\") > 10)\n    )",
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": false,
        "n": 7,
        "n_passed": 5,
        "n_failed": 2,
        "f_passed": 0.7142857142857143,
        "f_failed": 0.2857142857142857,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.708+00:00",
        "proc_duration_s": 0.001607
    }
]
pointblank/data/validations/complex_preprocessing.pkl ADDED
Binary file
pointblank/data/validations/generate_test_files.py ADDED
@@ -0,0 +1,127 @@
"""
Generate reference validation files for serialization regression testing.

This script creates validation objects with various preprocessing functions
and stores them as pickled files in the validations directory. These files
serve as regression tests to ensure serialization compatibility across versions.
"""

import pickle

# Add the parent directory to Python path to import pointblank
import sys
from pathlib import Path

import polars as pl

sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from preprocessing_functions import (
    add_computed_column,
    complex_preprocessing,
    double_column_a,
    filter_by_d_gt_100,
    narwhals_median_transform,
    pandas_compatible_transform,
)

import pointblank as pb


def create_test_data():
    """Create a test dataset for validation examples."""
    return pl.DataFrame(
        {
            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "b": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
            "c": ["x", "y", "x", "y", "x", "y", "x", "y", "x", "y"],
            "d": [50, 75, 100, 125, 150, 175, 200, 225, 250, 275],
        }
    )


def create_validation_examples():
    """Create various validation objects for testing serialization."""
    data = create_test_data()
    validations = {}

    # Basic validation with simple preprocessing
    validations["simple_preprocessing"] = (
        pb.Validate(data, tbl_name="test_data")
        .col_vals_gt("a", value=0, pre=double_column_a)
        .col_vals_in_set("c", set=["x", "y"])
    )

    # Validation with complex preprocessing
    validations["complex_preprocessing"] = (
        pb.Validate(data, tbl_name="test_data")
        .col_vals_gt("a_doubled", value=0, pre=complex_preprocessing)
        .col_vals_gt("d_scaled", value=15, pre=complex_preprocessing)
    )

    # Validation with narwhals function
    validations["narwhals_function"] = pb.Validate(data, tbl_name="test_data").col_vals_gt(
        "a", value=5, pre=narwhals_median_transform
    )

    # Validation with multiple preprocessing steps
    validations["multiple_steps"] = (
        pb.Validate(data, tbl_name="test_data")
        .col_vals_gt("a", value=2, pre=double_column_a)
        .col_vals_in_set("c", set=["x", "y"], pre=filter_by_d_gt_100)
        .col_vals_gt("sum_ab", value=100, pre=add_computed_column)
    )

    # Validation with pandas-compatible function
    validations["pandas_compatible"] = pb.Validate(data, tbl_name="test_data").col_vals_gt(
        "a_plus_b", value=10, pre=pandas_compatible_transform
    )

    # Basic validation without preprocessing (control case)
    validations["no_preprocessing"] = (
        pb.Validate(data, tbl_name="test_data")
        .col_vals_gt("a", value=0)
        .col_vals_lt("d", value=300)
        .col_vals_in_set("c", set=["x", "y"])
    )

    return validations


def save_validation_files(validations, output_dir):
    """Save validation objects as pickled files."""
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for name, validation in validations.items():
        # Interrogate to populate results
        validation.interrogate()

        # Save the validation object
        file_path = output_path / f"{name}.pkl"
        with open(file_path, "wb") as f:
            pickle.dump(validation, f)

        print(f"Saved {name} validation to {file_path}")

        # Also save as JSON for human readability
        json_path = output_path / f"{name}.json"
        try:
            json_report = validation.get_json_report()
            with open(json_path, "w") as f:
                f.write(json_report)
            print(f"Saved {name} validation JSON to {json_path}")
        except Exception as e:
            print(f"Could not save JSON for {name}: {e}")


if __name__ == "__main__":
    # Create validation examples
    validations = create_validation_examples()

    # Save to the validations directory
    output_dir = Path(__file__).parent
    save_validation_files(validations, output_dir)

    print(f"\nCreated {len(validations)} test validation files in {output_dir}")
    print("These files can be used for regression testing serialization compatibility.")
pointblank/data/validations/multiple_steps.json ADDED
@@ -0,0 +1,83 @@
[
    {
        "i": 1,
        "i_o": 1,
        "assertion_type": "col_vals_gt",
        "column": "a",
        "values": 2,
        "inclusive": null,
        "na_pass": false,
        "pre": "def double_column_a(df):\n    \"\"\"Double the values in column 'a'.\"\"\"\n    return df.with_columns(pl.col(\"a\") * 2)",
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": false,
        "n": 10,
        "n_passed": 9,
        "n_failed": 1,
        "f_passed": 0.9,
        "f_failed": 0.1,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.712+00:00",
        "proc_duration_s": 0.00152
    },
    {
        "i": 2,
        "i_o": 2,
        "assertion_type": "col_vals_in_set",
        "column": "c",
        "values": [
            "x",
            "y"
        ],
        "inclusive": null,
        "na_pass": null,
        "pre": "def filter_by_d_gt_100(df):\n    \"\"\"Filter rows where column 'd' is greater than 100.\"\"\"\n    return df.filter(pl.col(\"d\") > 100)",
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 7,
        "n_passed": 7,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.713+00:00",
        "proc_duration_s": 0.001
    },
    {
        "i": 3,
        "i_o": 3,
        "assertion_type": "col_vals_gt",
        "column": "sum_ab",
        "values": 100,
        "inclusive": null,
        "na_pass": false,
        "pre": "def add_computed_column(df):\n    \"\"\"Add a computed column based on existing columns.\"\"\"\n    return df.with_columns((pl.col(\"a\") + pl.col(\"b\")).alias(\"sum_ab\"))",
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": false,
        "n": 10,
        "n_passed": 1,
        "n_failed": 9,
        "f_passed": 0.1,
        "f_failed": 0.9,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.714+00:00",
        "proc_duration_s": 0.001464
    }
]
pointblank/data/validations/multiple_steps.pkl ADDED
Binary file
pointblank/data/validations/narwhals_function.json ADDED
@@ -0,0 +1,28 @@
[
    {
        "i": 1,
        "i_o": 1,
        "assertion_type": "col_vals_gt",
        "column": "a",
        "values": 5,
        "inclusive": null,
        "na_pass": false,
        "pre": "def narwhals_median_transform(df):\n    \"\"\"Use narwhals to compute median - cross-backend compatible.\"\"\"\n    return nw.from_native(df).select(nw.median(\"a\"), nw.median(\"d\"))",
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 1,
        "n_passed": 1,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.710+00:00",
        "proc_duration_s": 0.001455
    }
]
pointblank/data/validations/narwhals_function.pkl ADDED
Binary file
pointblank/data/validations/no_preprocessing.json ADDED
@@ -0,0 +1,83 @@
[
    {
        "i": 1,
        "i_o": 1,
        "assertion_type": "col_vals_gt",
        "column": "a",
        "values": 0,
        "inclusive": null,
        "na_pass": false,
        "pre": null,
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 10,
        "n_passed": 10,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.718+00:00",
        "proc_duration_s": 0.001148
    },
    {
        "i": 2,
        "i_o": 2,
        "assertion_type": "col_vals_lt",
        "column": "d",
        "values": 300,
        "inclusive": null,
        "na_pass": false,
        "pre": null,
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 10,
        "n_passed": 10,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.719+00:00",
        "proc_duration_s": 0.001181
    },
    {
        "i": 3,
        "i_o": 3,
        "assertion_type": "col_vals_in_set",
        "column": "c",
        "values": [
            "x",
            "y"
        ],
        "inclusive": null,
        "na_pass": null,
        "pre": null,
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 10,
        "n_passed": 10,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.720+00:00",
        "proc_duration_s": 0.000892
    }
]
pointblank/data/validations/no_preprocessing.pkl ADDED
Binary file
pointblank/data/validations/pandas_compatible.json ADDED
@@ -0,0 +1,28 @@
[
    {
        "i": 1,
        "i_o": 1,
        "assertion_type": "col_vals_gt",
        "column": "a_plus_b",
        "values": 10,
        "inclusive": null,
        "na_pass": false,
        "pre": "def pandas_compatible_transform(df):\n    \"\"\"Transform that works with pandas DataFrames.\"\"\"\n    if hasattr(df, \"assign\"):  # pandas\n        return df.assign(a_plus_b=df[\"a\"] + df.get(\"b\", 0))\n    else:  # polars or other\n        return df.with_columns((pl.col(\"a\") + pl.col(\"b\")).alias(\"a_plus_b\"))",
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 10,
        "n_passed": 10,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.717+00:00",
        "proc_duration_s": 0.001428
    }
]
pointblank/data/validations/pandas_compatible.pkl ADDED
Binary file
pointblank/data/validations/preprocessing_functions.py ADDED
@@ -0,0 +1,46 @@
"""
Test preprocessing functions for validation serialization examples.

These functions are used to create validation objects that can be serialized
and stored as reference files for regression testing.
"""

import narwhals as nw
import polars as pl


def double_column_a(df):
    """Double the values in column 'a'."""
    return df.with_columns(pl.col("a") * 2)


def add_computed_column(df):
    """Add a computed column based on existing columns."""
    return df.with_columns((pl.col("a") + pl.col("b")).alias("sum_ab"))


def filter_by_d_gt_100(df):
    """Filter rows where column 'd' is greater than 100."""
    return df.filter(pl.col("d") > 100)


def narwhals_median_transform(df):
    """Use narwhals to compute median - cross-backend compatible."""
    return nw.from_native(df).select(nw.median("a"), nw.median("d"))


def complex_preprocessing(df):
    """Complex preprocessing combining multiple operations."""
    return (
        df.filter(pl.col("a") > 1)
        .with_columns((pl.col("a") * 2).alias("a_doubled"), (pl.col("d") / 10).alias("d_scaled"))
        .filter(pl.col("d_scaled") > 10)
    )


def pandas_compatible_transform(df):
    """Transform that works with pandas DataFrames."""
    if hasattr(df, "assign"):  # pandas
        return df.assign(a_plus_b=df["a"] + df.get("b", 0))
    else:  # polars or other
        return df.with_columns((pl.col("a") + pl.col("b")).alias("a_plus_b"))
pointblank/data/validations/simple_preprocessing.json ADDED
@@ -0,0 +1,57 @@
[
    {
        "i": 1,
        "i_o": 1,
        "assertion_type": "col_vals_gt",
        "column": "a",
        "values": 0,
        "inclusive": null,
        "na_pass": false,
        "pre": "def double_column_a(df):\n    \"\"\"Double the values in column 'a'.\"\"\"\n    return df.with_columns(pl.col(\"a\") * 2)",
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 10,
        "n_passed": 10,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.702+00:00",
        "proc_duration_s": 0.00387
    },
    {
        "i": 2,
        "i_o": 2,
        "assertion_type": "col_vals_in_set",
        "column": "c",
        "values": [
            "x",
            "y"
        ],
        "inclusive": null,
        "na_pass": null,
        "pre": null,
        "segments": null,
        "thresholds": "Thresholds(warning=None, error=None, critical=None)",
        "label": null,
        "brief": null,
        "active": true,
        "all_passed": true,
        "n": 10,
        "n_passed": 10,
        "n_failed": 0,
        "f_passed": 1.0,
        "f_failed": 0.0,
        "warning": null,
        "error": null,
        "critical": null,
        "time_processed": "2025-10-02T04:16:44.703+00:00",
        "proc_duration_s": 0.000983
    }
]
pointblank/data/validations/simple_preprocessing.pkl ADDED
Binary file
pointblank/datascan.py CHANGED
@@ -143,17 +143,17 @@ class DataScan:
         for conv_method in valid_conversion_methods:
             try:
                 valid_native = getattr(ibis_native, conv_method)()
-            except (NotImplementedError, ImportError, ModuleNotFoundError):
-                continue
+            except (NotImplementedError, ImportError, ModuleNotFoundError):  # pragma: no cover
+                continue  # pragma: no cover
             break
-        else:
+        else:  # pragma: no cover
             msg = (
                 "To use `ibis` as input, you must have one of arrow, pandas, polars or numpy "
                 "available in the process. Until `ibis` is fully supported by Narwhals, this is "
                 "necessary. Additionally, the data must be collected in order to calculate some "
                 "structural statistics, which may be performance detrimental."
             )
-            raise ImportError(msg)
+            raise ImportError(msg)  # pragma: no cover
         as_native = nw.from_native(valid_native)

         self.nw_data: Frame = nw.from_native(as_native)
|