pointblank 0.9.5__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +6 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +128 -0
- pointblank/_utils_html.py +40 -0
- pointblank/actions.py +3 -3
- pointblank/assistant.py +1 -3
- pointblank/column.py +4 -4
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +769 -138
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/thresholds.py +2 -2
- pointblank/validate.py +1594 -207
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/METADATA +6 -3
- pointblank-0.10.0.dist-info/RECORD +37 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/WHEEL +1 -1
- pointblank-0.9.5.dist-info/RECORD +0 -33
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.5.dist-info → pointblank-0.10.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
@@ -42,8 +42,14 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 Parameters
 ----------
 data
-The table to validate, which could be a DataFrame object
-
+The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
+file path, a Parquet file path, or a database connection string. When providing a CSV or
+Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
+loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
+glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
+Connection strings enable direct database access via Ibis with optional table specification
+using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
+on the supported table types.
 tbl_name
 An optional name to assign to the input table object. If no value is provided, a name will
 be generated based on whatever information is available. This table name will be displayed
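The hunk above documents the expanded `data=` argument. As a minimal sketch of what those new input forms look like in practice (the file paths and table name here are hypothetical placeholders, not files shipped with the package):

```python
import pointblank as pb

# Each form below is a valid `data=` input in 0.10.0; paths are illustrative only
v_csv = pb.Validate(data="data/orders.csv")                      # CSV file path
v_parquet = pb.Validate(data="data/orders_*.parquet")            # Parquet glob pattern
v_db = pb.Validate(data="duckdb:///data/warehouse.ddb::orders")  # connection string with ::table_name
```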
@@ -107,13 +113,40 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
+- CSV files (string path or `pathlib.Path` object with `.csv` extension)
+- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+  extension, or partitioned dataset)
+- Database connection strings (URI format with optional table specification)

 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
 `ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
 the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas
 DataFrame, the Ibis library is not required.

+To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+provided. The file will be automatically detected and loaded using the best available DataFrame
+library. The loading preference is Polars first, then Pandas as a fallback.
+
+Connection strings follow database URL formats and must also specify a table using the
+`::table_name` suffix. Examples include:
+
+```
+"duckdb:///path/to/database.ddb::table_name"
+"sqlite:///path/to/database.db::table_name"
+"postgresql://user:password@localhost:5432/database::table_name"
+"mysql://user:password@localhost:3306/database::table_name"
+"bigquery://project/dataset::table_name"
+"snowflake://user:password@account/database/schema::table_name"
+```
+
+When using connection strings, the Ibis library with the appropriate backend driver is required.
+
 Thresholds
 ----------
 The `thresholds=` parameter is used to set the failure-condition levels for all validation
@@ -270,8 +303,8 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 ```python
 import pointblank as pb

-# Load the small_table dataset
-small_table = pb.load_dataset()
+# Load the `small_table` dataset
+small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")

 # Preview the table
 pb.preview(small_table)
@@ -337,7 +370,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 brief). Here's an example of a global setting for briefs:

 ```python
-
+validation_2 = (
 pb.Validate(
 data=pb.load_dataset(),
 tbl_name="small_table",
@@ -354,7 +387,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 .interrogate()
 )

-
+validation_2
 ```

 We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
@@ -372,7 +405,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 the data extracts for each validation step.

 ```python
-
+validation_2.get_data_extracts()
 ```

 We can also view step reports for each validation step using the
@@ -380,7 +413,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 type of validation step and shows the relevant information for a step's validation.

 ```python
-
+validation_2.get_step_report(i=2)
 ```

 The `Validate` class also has a method for getting the sundered data, which is the data that
@@ -388,11 +421,141 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.

 ```python
-pb.preview(
+pb.preview(validation_2.get_sundered_data())
 ```

 The sundered data is a DataFrame that contains the rows that passed or failed the validation.
 The default behavior is to return the rows that failed the validation, as shown above.
+
+### Working with CSV Files
+
+The `Validate` class can directly accept CSV file paths, making it easy to validate data stored
+in CSV files without manual loading:
+
+```python
+# Get a path to a CSV file from the package data
+csv_path = pb.get_data_path("global_sales", "csv")
+
+validation_3 = (
+    pb.Validate(
+        data=csv_path,
+        label="CSV validation example"
+    )
+    .col_exists(["customer_id", "product_id", "revenue"])
+    .col_vals_not_null(["customer_id", "product_id"])
+    .col_vals_gt(columns="revenue", value=0)
+    .interrogate()
+)
+
+validation_3
+```
+
+You can also use a Path object to specify the CSV file. Here's an example of how to do that:
+
+```python
+from pathlib import Path
+
+csv_file = Path(pb.get_data_path("game_revenue", "csv"))
+
+validation_4 = (
+    pb.Validate(data=csv_file, label="Game Revenue Validation")
+    .col_exists(["player_id", "session_id", "item_name"])
+    .col_vals_regex(
+        columns="session_id",
+        pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}"
+    )
+    .col_vals_gt(columns="item_revenue", value=0, na_pass=True)
+    .interrogate()
+)
+
+validation_4
+```
+
+The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided,
+Pointblank will automatically load the file using the best available DataFrame library (Polars
+preferred, Pandas as fallback). The loaded data can then be used with all validation methods
+just like any other supported table type.
+
+### Working with Parquet Files
+
+The `Validate` class can directly accept Parquet files and datasets in various formats. The
+following examples illustrate how to validate Parquet files:
+
+```python
+# Single Parquet file from package data
+parquet_path = pb.get_data_path("nycflights", "parquet")
+
+validation_5 = (
+    pb.Validate(
+        data=parquet_path,
+        tbl_name="NYC Flights Data"
+    )
+    .col_vals_not_null(["carrier", "origin", "dest"])
+    .col_vals_gt(columns="distance", value=0)
+    .interrogate()
+)
+
+validation_5
+```
+
+You can also use glob patterns and directories. Here are some examples for how to:
+
+1. load multiple Parquet files
+2. load a Parquet-containing directory
+3. load a partitioned Parquet dataset
+
+```python
+# Multiple Parquet files with glob patterns
+validation_6 = pb.Validate(data="data/sales_*.parquet")
+
+# Directory containing Parquet files
+validation_7 = pb.Validate(data="parquet_data/")
+
+# Partitioned Parquet dataset
+validation_8 = (
+    pb.Validate(data="sales_data/")  # Contains year=2023/quarter=Q1/region=US/sales.parquet
+    .col_exists(["transaction_id", "amount", "year", "quarter", "region"])
+    .interrogate()
+)
+```
+
+When you point to a directory that contains a partitioned Parquet dataset (with subdirectories
+like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically:
+
+- discover all Parquet files recursively
+- extract partition column values from directory paths
+- add partition columns to the final DataFrame
+- combine all partitions into a single table for validation
+
+Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
+either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
+
+### Working with Database Connection Strings
+
+The `Validate` class supports database connection strings for direct validation of database
+tables. Connection strings must specify a table using the `::table_name` suffix:
+
+```python
+# Get path to a DuckDB database file from package data
+duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+validation_9 = (
+    pb.Validate(
+        data=f"duckdb:///{duckdb_path}::game_revenue",
+        label="DuckDB Game Revenue Validation"
+    )
+    .col_exists(["player_id", "session_id", "item_revenue"])
+    .col_vals_gt(columns="item_revenue", value=0)
+    .interrogate()
+)
+
+validation_9
+```
+
+For comprehensive documentation on supported connection string formats, error handling, and
+installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
+function. This function handles all the connection logic and provides helpful error messages
+when table specifications are missing or backend dependencies are not installed.


 Thresholds(warning: 'int | float | bool | None' = None, error: 'int | float | bool | None' = None, critical: 'int | float | bool | None' = None) -> None
@@ -580,7 +743,7 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: '
 thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
 actions=pb.Actions(critical="Major data quality issue found in step {step}."),
 )
-.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
 .col_vals_gt(columns="item_revenue", value=0.05)
 .col_vals_gt(columns="session_duration", value=15)
 .interrogate()
@@ -610,7 +773,7 @@ Actions(warning: 'str | Callable | list[str | Callable] | None' = None, error: '
 data=pb.load_dataset(dataset="game_revenue", tbl_type="duckdb"),
 thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
 )
-.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+.col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
 .col_vals_gt(columns="item_revenue", value=0.05)
 .col_vals_gt(
 columns="session_duration",
@@ -1282,12 +1445,16 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
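To make the new `segments=` examples above concrete, here is a minimal sketch of a segmented validation step; the table, column names, and values are invented purely for illustration:

```python
import polars as pl
import pointblank as pb

# Hypothetical table with `region` and `date` columns to segment on
tbl = pl.DataFrame({
    "region": ["US", "US", "EU", "EU"],
    "date": ["2023-01-01", "2023-01-02", "2023-01-01", "2023-01-02"],
    "revenue": [12.5, 8.0, 15.2, 3.1],
})

validation = (
    pb.Validate(data=tbl)
    # One validation step is generated per segment during interrogation
    .col_vals_gt(
        columns="revenue",
        value=5,
        segments=["region", ("date", ["2023-01-01", "2023-01-02"])],
    )
    .interrogate()
)
```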
@@ -1503,12 +1670,16 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -1724,12 +1895,16 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -1945,12 +2120,16 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2166,12 +2345,16 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2385,12 +2568,16 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2614,12 +2801,16 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2855,12 +3046,16 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3055,12 +3250,16 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3241,12 +3440,16 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3342,9 +3545,9 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |

 col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'

-Validate whether values in a column are
+Validate whether values in a column are Null.

-The `col_vals_null()` validation method checks whether column values in a table are
+The `col_vals_null()` validation method checks whether column values in a table are Null.
 This validation will operate over the number of test units that is equal to the number
 of rows in the table.

@@ -3425,12 +3628,16 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3524,10 +3731,10 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column

 col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'

-Validate whether values in a column are not
+Validate whether values in a column are not Null.

 The `col_vals_not_null()` validation method checks whether column values in a table are not
-
+Null. This validation will operate over the number of test units that is equal to the number
 of rows in the table.

 Parameters
@@ -3607,12 +3814,16 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3795,12 +4006,16 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3977,12 +4192,16 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4267,12 +4486,16 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4453,12 +4676,16 @@ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
 (i.e., no validation steps will be created for them).

 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:

-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```

 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -6231,7 +6458,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
 `[rev_01, rev_02, profit_01, profit_02, age]`

 and you want to validate columns that have two digits at the end of the name, you can use
-`columns=matches(r"
+`columns=matches(r"[0-9]{2}$")`. This will select the `rev_01`, `rev_02`, `profit_01`, and
 `profit_02` columns.

 There will be a validation step created for every resolved column. Note that if there aren't any
@@ -6285,7 +6512,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
 [`col()`](`pointblank.col`) function, like this:

 ```python
-col(matches(r"
+col(matches(r"^[0-9]{5}") & ends_with("_id"))
 ```

 There are four operators that can be used to compose column selectors:
@@ -6324,7 +6551,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'

 validation = (
 pb.Validate(data=tbl)
-.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID
+.col_vals_regex(columns=pb.matches("id|identifier"), pattern=r"ID[0-9]{4}")
 .interrogate()
 )

@@ -6332,7 +6559,7 @@ matches(pattern: 'str', case_sensitive: 'bool' = False) -> 'Matches'
 ```

 From the results of the validation table we get two validation steps, one for `id_old` and one
-for `new_identifier`. The values in both columns all match the pattern `"ID
+for `new_identifier`. The values in both columns all match the pattern `"ID[0-9]{4}"`.

 We can also use the `matches()` function in combination with other column selectors (within
 [`col()`](`pointblank.col`)) to create more complex column selection criteria (i.e., to select
@@ -6875,7 +7102,7 @@ interrogate(self, collect_extracts: 'bool' = True, collect_tbl_checked: 'bool' =

 After interrogation is complete, the `Validate` object will have gathered information, and
 we can use methods like [`n_passed()`](`pointblank.Validate.n_passed`),
-[`f_failed()`](`pointblank.Validate.f_failed`)
+[`f_failed()`](`pointblank.Validate.f_failed`), etc., to understand how the table performed
 against the validation plan. A visual representation of the validation results can be viewed
 by printing the `Validate` object; this will display the validation table in an HTML viewing
 environment.
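To ground the interrogation workflow referenced in the hunk above, here is a minimal usage sketch; the dataset and column name are illustrative assumptions:

```python
import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars"))
    .col_vals_not_null(columns="date")  # column name assumed for illustration
    .interrogate()
)

# Summaries gathered during interrogation
validation.n_passed()   # passing test units per step
validation.f_failed()   # fraction of failing test units per step
```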
@@ -7085,24 +7312,25 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
 Types of Step Reports
 ---------------------
 The `get_step_report()` method produces a report based on the *type* of validation step.
-The following row-based validation methods will produce a
-
+The following column-value or row-based validation step validation methods will produce a
+report that shows the rows of the data that failed:

 - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
+- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
 - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
+- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
 - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
 - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
-- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
-- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
 - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
 - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
 - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
 - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
-- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
-- [`
+- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
 - [`conjointly()`](`pointblank.Validate.conjointly`)
+- [`rows_complete()`](`pointblank.Validate.rows_complete`)

 The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
 report that shows duplicate rows (or duplicate values in one or a set of columns as defined
@@ -7320,10 +7548,10 @@ get_sundered_data(self, type='pass') -> 'FrameT'
 Get the data that passed or failed the validation steps.

 Validation of the data is one thing but, sometimes, you want to use the best part of the
-input dataset for something else. The `get_sundered_data()` method works with a Validate
+input dataset for something else. The `get_sundered_data()` method works with a `Validate`
 object that has been interrogated (i.e., the
 [`interrogate()`](`pointblank.Validate.interrogate`) method was used). We can get either the
-'pass' data piece (rows with no failing test units across all
+'pass' data piece (rows with no failing test units across all column-value based validation
 functions), or, the 'fail' data piece (rows with at least one failing test unit across the
 same series of validations).

@@ -7332,7 +7560,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
 There are some caveats to sundering. The validation steps considered for this splitting will
 only involve steps where:

-- of certain check types, where test units are cells checked
+- of certain check types, where test units are cells checked down a column (e.g., the
 `col_vals_*()` methods)
 - `active=` is not set to `False`
 - `pre=` has not been given an expression for modifying the input table
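A small sketch of the sundering behaviour described above, assuming `validation` is an interrogated `Validate` object:

```python
# Rows with no failing test units across the column-value validation steps
passed_rows = validation.get_sundered_data(type="pass")

# Rows with at least one failing test unit across those same steps
failed_rows = validation.get_sundered_data(type="fail")
```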
@@ -7401,11 +7629,13 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 Get the rows that failed for each validation step.

 After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the
-`get_data_extracts()` method can be used to extract the rows that failed in each
-validation step (e.g.,
-
-
-
+`get_data_extracts()` method can be used to extract the rows that failed in each
+column-value or row-based validation step (e.g.,
+[`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
+[`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a
+dictionary of tables containing the rows that failed in every validation step. If
+`frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing
+the dictionary structure).

 Parameters
 ----------
@@ -7418,13 +7648,13 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 Returns
 -------
 dict[int, FrameT | None] | FrameT | None
-A dictionary of tables containing the rows that failed in every
-step
+A dictionary of tables containing the rows that failed in every compatible validation
+step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.

-Validation Methods
-
-The following validation methods
-failing test units.
+Compatible Validation Methods for Yielding Extracted Rows
+---------------------------------------------------------
+The following validation methods operate on column values and will have rows extracted when
+there are failing test units.

 - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
 - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
@@ -7439,11 +7669,20 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
 - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
+- [`conjointly()`](`pointblank.Validate.conjointly`)
+
+An extracted row for these validation methods means that a test unit failed for that row in
+the validation step.
+
+These row-based validation methods will also have rows extracted should there be failing
+rows:
+
 - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
+- [`rows_complete()`](`pointblank.Validate.rows_complete`)

-
-
-understanding the nature of the failing test units.
+The extracted rows are a subset of the original table and are useful for further analysis
+or for understanding the nature of the failing test units.

 Examples
 --------
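A brief usage sketch of the extract behaviour documented above, assuming `validation` is an interrogated `Validate` object with at least two steps:

```python
# All extracts as a dictionary keyed by step number
extracts = validation.get_data_extracts()

# A single step's failing rows returned directly as a table (no dictionary)
failed_step_2 = validation.get_data_extracts(i=2, frame=True)
```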
@@ -7578,6 +7817,10 @@ assert_passing(self) -> 'None'
 assertion made is printed in the `AssertionError` message if a failure occurs, ensuring
 some details are preserved.

+If the validation has not yet been interrogated, this method will automatically call
+[`interrogate()`](`pointblank.Validate.interrogate`) with default parameters before checking
+for passing tests.
+
 Raises
 -------
 AssertionError
@@ -7587,8 +7830,9 @@ assert_passing(self) -> 'None'
 --------
 In the example below, we'll use a simple Polars DataFrame with three columns (`a`, `b`, and
 `c`). There will be three validation steps, and the second step will have a failing test
-unit (the value `10` isn't less than `9`).
-
+unit (the value `10` isn't less than `9`). The `assert_passing()` method is used to assert
+that all validation steps passed perfectly, automatically performing the interrogation if
+needed.

 ```python
 #| error: True
@@ -7609,13 +7853,221 @@ assert_passing(self) -> 'None'
 .col_vals_gt(columns="a", value=0)
 .col_vals_lt(columns="b", value=9) # this assertion is false
 .col_vals_in_set(columns="c", set=["a", "b"])
-.interrogate()
 )

+# No need to call [`interrogate()`](`pointblank.Validate.interrogate`) explicitly
 validation.assert_passing()
 ```


+assert_below_threshold(self, level: 'str' = 'warning', i: 'int | None' = None, message: 'str | None' = None) -> 'None'
+
+Raise an `AssertionError` if validation steps exceed a specified threshold level.
+
+The `assert_below_threshold()` method checks whether validation steps' failure rates are
+below a given threshold level (`"warning"`, `"error"`, or `"critical"`). This is
+particularly useful in automated testing environments where you want to ensure your data
+quality meets minimum standards before proceeding.
+
+If any validation step exceeds the specified threshold level, an `AssertionError` will be
+raised with details about which steps failed. If the validation has not yet been
+interrogated, this method will automatically call
+[`interrogate()`](`pointblank.Validate.interrogate`) with default parameters.
+
+Parameters
+----------
+level
+The threshold level to check against, which could be any of `"warning"` (the default),
+`"error"`, or `"critical"`. An `AssertionError` will be raised if any validation step
+exceeds this level.
+i
+Specific validation step number(s) to check. Can be provided as a single integer or a
+list of integers. If `None` (the default), all steps are checked.
+message
+Custom error message to use if assertion fails. If `None`, a default message will be
+generated that lists the specific steps that exceeded the threshold.
+
+Returns
+-------
+None
+
+Raises
+------
+AssertionError
+If any specified validation step exceeds the given threshold level.
+ValueError
+If an invalid threshold level is provided.
+
+Examples
+--------
+Below are some examples of how to use the `assert_below_threshold()` method. First, we'll
+create a simple Polars DataFrame with two columns (`a` and `b`).
+
+```python
+import polars as pl
+
+tbl = pl.DataFrame({
+    "a": [7, 4, 9, 7, 12],
+    "b": [9, 8, 10, 5, 10]
+})
+```
+
+Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
+`critical=0.3`). After interrogating, we display the validation report table:
+
+```python
+import pointblank as pb
+
+validation = (
+    pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
+    .col_vals_gt(columns="a", value=5)   # 1 failing test unit
+    .col_vals_lt(columns="b", value=10)  # 2 failing test units
+    .interrogate()
+)
+
+validation
+```
+
+Using `assert_below_threshold(level="warning")` will raise an `AssertionError` if any step
+exceeds the 'warning' threshold:
+
+Check a specific step against the 'critical' threshold using the `i=` parameter:
+
+```python
+validation.assert_below_threshold(level="critical", i=1)  # Won't raise an error
+```
+
+As the first step is below the 'critical' threshold (it exceeds the 'warning' and 'error'
+thresholds), no error is raised and nothing is printed.
+
+We can also provide a custom error message with the `message=` parameter. Let's try that
+here:
+
+```python
+try:
+    validation.assert_below_threshold(
+        level="error",
+        message="Data quality too low for processing!"
+    )
+except AssertionError as e:
+    print(f"Custom error: {e}")
+```
+
+See Also
+--------
+- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
+step
+- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
+- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
+validation step
+- [`assert_passing()`](`pointblank.Validate.assert_passing`): assert all validations pass
+completely
+
+
+above_threshold(self, level: 'str' = 'warning', i: 'int | None' = None) -> 'bool'
+
+Check if any validation steps exceed a specified threshold level.
+
+The `above_threshold()` method checks whether validation steps exceed a given threshold
+level. This provides a non-exception-based alternative to
+[`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`) for conditional
+workflow control based on validation results.
+
+This method is useful in scenarios where you want to check if any validation steps failed
+beyond a certain threshold without raising an exception, allowing for more flexible
+programmatic responses to validation issues.
+
+Parameters
+----------
+level
+The threshold level to check against. Valid options are: `"warning"` (the least severe
+threshold level), `"error"` (the middle severity threshold level), and `"critical"` (the
+most severe threshold level). The default is `"warning"`.
+i
+Specific validation step number(s) to check. If a single integer, checks only that step.
+If a list of integers, checks all specified steps. If `None` (the default), checks all
+validation steps. Step numbers are 1-based (first step is `1`, not `0`).
+
+Returns
+-------
+bool
+`True` if any of the specified validation steps exceed the given threshold level,
+`False` otherwise.
+
+Raises
+------
+ValueError
+If an invalid threshold level is provided.
+
+Examples
+--------
+Below are some examples of how to use the `above_threshold()` method. First, we'll create a
+simple Polars DataFrame with a single column (`values`).
+
+Then a validation plan will be created with thresholds (`warning=0.1`, `error=0.2`,
+`critical=0.3`). After interrogating, we display the validation report table:
+
+```python
+import pointblank as pb
+
+validation = (
+    pb.Validate(data=tbl, thresholds=(0.1, 0.2, 0.3))
+    .col_vals_gt(columns="values", value=0)
+    .col_vals_lt(columns="values", value=10)
+    .col_vals_between(columns="values", left=0, right=5)
+    .interrogate()
+)
+
+validation
+```
+
+Let's check if any steps exceed the 'warning' threshold with the `above_threshold()` method.
+A message will be printed if that's the case:
+
+```python
+if validation.above_threshold(level="warning"):
+    print("Some steps have exceeded the warning threshold")
+```
+
+Check if only steps 2 and 3 exceed the 'error' threshold through use of the `i=` argument:
+
+```python
+if validation.above_threshold(level="error", i=[2, 3]):
+    print("Steps 2 and/or 3 have exceeded the error threshold")
+```
+
+You can use this in a workflow to conditionally trigger processes. Here's a snippet of how
+you might use this in a function:
+
+```python
+def process_data(validation_obj):
+    # Only continue processing if validation passes critical thresholds
+    if not validation_obj.above_threshold(level="critical"):
+        # Continue with processing
+        print("Data meets critical quality thresholds, proceeding...")
+        return True
+    else:
+        # Log failure and stop processing
+        print("Data fails critical quality checks, aborting...")
+        return False
+```
+
+Note that this is just a suggestion for how to implement conditional workflow processes. You
+should adapt this pattern to your specific requirements, which might include different
+threshold levels, custom logging mechanisms, or integration with your organization's data
+pipelines and notification systems.
+
+See Also
+--------
+- [`assert_below_threshold()`](`pointblank.Validate.assert_below_threshold`): a similar
+method that raises an exception if thresholds are exceeded
+- [`warning()`](`pointblank.Validate.warning`): get the 'warning' status for each validation
+step
+- [`error()`](`pointblank.Validate.error`): get the 'error' status for each validation step
+- [`critical()`](`pointblank.Validate.critical`): get the 'critical' status for each
+validation step
+
+
 n(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'dict[int, int] | int'

 Provides a dictionary of the number of test units for each validation step.
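A minimal sketch of how the new `assert_below_threshold()` method might be wired into a test suite (assuming a pytest setup; the dataset columns used here are illustrative assumptions):

```python
import pointblank as pb

def test_small_table_quality():
    validation = (
        pb.Validate(
            data=pb.load_dataset(dataset="small_table", tbl_type="polars"),
            thresholds=(0.1, 0.2, 0.3),
        )
        .col_vals_not_null(columns="date")  # column names assumed for illustration
        .col_vals_gt(columns="d", value=0)
    )
    # Interrogation runs automatically if it hasn't been performed yet
    validation.assert_below_threshold(level="error")
```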
@@ -8237,7 +8689,7 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
 Get the 'critical' level status for each validation step.
 
 The 'critical' status for a validation step is `True` if the fraction of failing test units
-meets or exceeds the threshold for the
+meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`.
 
 The ascribed name of 'critical' is semantic and is thus simply a status indicator that could
 be used to trigger some action to be take. Here's how it fits in with other status
@@ -8249,14 +8701,14 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
   severity
 - 'critical': the status obtained by calling `critical()`, most severe
 
-This method provides a dictionary of the
-
-
+This method provides a dictionary of the 'critical' status for each validation step. If the
+`scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar
+instead of a dictionary.
 
 Parameters
 ----------
 i
-    The validation step number(s) from which the
+    The validation step number(s) from which the 'critical' status is obtained. Can be
     provided as a list of integers or a single integer. If `None`, all steps are included.
 scalar
     If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary.
@@ -8264,7 +8716,7 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
 Returns
 -------
 dict[int, bool] | bool
-    A dictionary of the
+    A dictionary of the 'critical' status for each validation step or a scalar value.
 
 Examples
 --------
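
The hunk stops at the `Examples` heading, so the examples themselves aren't visible here. As a hedged sketch of how the `critical()` accessor described above is called (the dataset and steps are illustrative; only the `i=` and `scalar=` parameters come from the hunk):

```python
import pointblank as pb

# Illustrative validation whose thresholds include a 'critical' level
validation = (
    pb.Validate(data=pb.load_dataset("small_table"), thresholds=(0.1, 0.2, 0.3))
    .col_vals_gt(columns="d", value=0)
    .col_vals_lt(columns="c", value=10)
    .interrogate()
)

validation.critical()                  # e.g. {1: False, 2: False} -- one bool per step
validation.critical(i=2)               # restrict the dictionary to step 2
validation.critical(i=2, scalar=True)  # a single bool for step 2
```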
@@ -8344,7 +8796,7 @@ datasets included in the package can be accessed via the `load_dataset()` functi
 `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
 the `assistant()` function to get help with Pointblank.
 
-DataScan(data: '
+DataScan(data: 'IntoFrameT', tbl_name: 'str | None' = None) -> 'None'
 
 Get a summary of a dataset.
 
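
The hunk above only shows the updated `DataScan` signature, so the sketch below is limited to construction; how the resulting scan is rendered or exported isn't visible in this hunk and is therefore omitted. The `game_revenue` dataset and the `tbl_name` value are illustrative.

```python
import pointblank as pb

# Build a data scan from any supported table input; `tbl_name` is an optional label
scan = pb.DataScan(data=pb.load_dataset("game_revenue"), tbl_name="game_revenue")
```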
@@ -8458,8 +8910,14 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
 Parameters
 ----------
 data
-    The table to preview, which could be a DataFrame object
-
+    The table to preview, which could be a DataFrame object, an Ibis table object, a CSV
+    file path, a Parquet file path, or a database connection string. When providing a CSV or
+    Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
+    loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
+    glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
+    Connection strings enable direct database access via Ibis with optional table specification
+    using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
+    on the supported table types.
 columns_subset
     The columns to display in the table, by default `None` (all columns are shown). This can
     be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter
@@ -8504,13 +8962,40 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
+- CSV files (string path or `pathlib.Path` object with `.csv` extension)
+- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+  extension, or partitioned dataset)
+- Database connection strings (URI format with optional table specification)
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
 `ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables
 requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
 Pandas DataFrame, the availability of Ibis is not needed.
 
+To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+provided. The file will be automatically detected and loaded using the best available DataFrame
+library. The loading preference is Polars first, then Pandas as a fallback.
+
+Connection strings follow database URL formats and must also specify a table using the
+`::table_name` suffix. Examples include:
+
+```
+"duckdb:///path/to/database.ddb::table_name"
+"sqlite:///path/to/database.db::table_name"
+"postgresql://user:password@localhost:5432/database::table_name"
+"mysql://user:password@localhost:3306/database::table_name"
+"bigquery://project/dataset::table_name"
+"snowflake://user:password@account/database/schema::table_name"
+```
+
+When using connection strings, the Ibis library with the appropriate backend driver is required.
+
 Examples
 --------
 It's easy to preview a table using the `preview()` function. Here's an example using the
@@ -8569,6 +9054,39 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
     columns_subset=pb.col(pb.starts_with("item") | pb.matches("player"))
 )
 ```
+
+### Working with CSV Files
+
+The `preview()` function can directly accept CSV file paths, making it easy to preview data
+stored in CSV files without manual loading:
+
+You can also use a Path object to specify the CSV file:
+
+### Working with Parquet Files
+
+The `preview()` function can directly accept Parquet files and datasets in various formats:
+
+You can also use glob patterns and directories:
+
+```python
+# Multiple Parquet files with glob patterns
+pb.preview("data/sales_*.parquet")
+
+# Directory containing Parquet files
+pb.preview("parquet_data/")
+
+# Partitioned Parquet dataset
+pb.preview("sales_data/") # Auto-discovers partition columns
+```
+
+### Working with Database Connection Strings
+
+The `preview()` function supports database connection strings for direct preview of database
+tables. Connection strings must specify a table using the `::table_name` suffix:
+
+For comprehensive documentation on supported connection string formats, error handling, and
+installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
+function.
 
 
 col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
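
The new 'Working with CSV Files' and 'Working with Database Connection Strings' subsections above end with colons, but their snippets fall outside this hunk. A hedged sketch of the three input styles they describe, using the bundled dataset paths from `get_data_path()` (added later in this diff); the DuckDB URI is a placeholder, not a real file:

```python
import pointblank as pb
from pathlib import Path

# CSV file given as a string path (taken here from the package's bundled data)
csv_path = pb.get_data_path("small_table", "csv")
pb.preview(csv_path)

# The same CSV supplied as a pathlib.Path object
pb.preview(Path(csv_path))

# A database table via a connection string with the `::table_name` suffix;
# substitute your own database file and table name.
pb.preview("duckdb:///path/to/database.ddb::table_name")
```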
@@ -8672,6 +9190,11 @@ missing_vals_tbl(data: 'FrameT | Any') -> 'GT'
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -8932,6 +9455,104 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'glo
 regions: North America, Europe, or Asia.
 
 
+get_data_path(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', file_type: "Literal['csv', 'parquet', 'duckdb']" = 'csv') -> 'str'
+
+Get the file path to a dataset included with the Pointblank package.
+
+This function provides direct access to the file paths of datasets included with Pointblank.
+These paths can be used in examples and documentation to demonstrate file-based data loading
+without requiring the actual data files. The returned paths can be used with
+`Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities.
+
+Parameters
+----------
+dataset
+    The name of the dataset to get the path for. Current options are `"small_table"`,
+    `"game_revenue"`, `"nycflights"`, and `"global_sales"`.
+file_type
+    The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`.
+
+Returns
+-------
+str
+    The file path to the requested dataset file.
+
+Included Datasets
+-----------------
+The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`):
+
+- `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples.
+- `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company.
+- `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports.
+- `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions.
+
+File Types
+----------
+Each dataset is available in multiple formats:
+
+- `"csv"`: Comma-separated values file (`.csv`)
+- `"parquet"`: Parquet file (`.parquet`)
+- `"duckdb"`: DuckDB database file (`.ddb`)
+
+Examples
+--------
+Get the path to a CSV file and use it with `Validate`:
+
+```python
+import pointblank as pb
+
+# Get path to the small_table CSV file
+csv_path = pb.get_data_path("small_table", "csv")
+print(csv_path)
+
+# Use the path directly with Validate
+validation = (
+    pb.Validate(data=csv_path)
+    .col_exists(["a", "b", "c"])
+    .col_vals_gt(columns="d", value=0)
+    .interrogate()
+)
+
+validation
+```
+
+Get a Parquet file path for validation examples:
+
+```python
+# Get path to the game_revenue Parquet file
+parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet")
+
+# Validate the Parquet file directly
+validation = (
+    pb.Validate(data=parquet_path, label="Game Revenue Data Validation")
+    .col_vals_not_null(columns=["player_id", "session_id"])
+    .col_vals_gt(columns="item_revenue", value=0)
+    .interrogate()
+)
+
+validation
+```
+
+This is particularly useful for documentation examples where you want to demonstrate
+file-based workflows without requiring users to have specific data files:
+
+```python
+# Example showing CSV file validation
+sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv")
+
+validation = (
+    pb.Validate(data=sales_csv, label="Sales Data Validation")
+    .col_exists(["customer_id", "product_id", "amount"])
+    .col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}")
+    .interrogate()
+)
+```
+
+See Also
+--------
+[`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects.
+
+
 
 ## The Utility Functions family
 
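
One more hedged sketch for the `get_data_path()` entry above, combining the `"duckdb"` file type with the connection-string input described earlier in this diff. The table name inside the bundled `.ddb` file is assumed to be `small_table` here, and the exact slash count in the URI follows the backend's URL rules; verify both before relying on this.

```python
import pointblank as pb

# Path to the bundled DuckDB database file for the small_table dataset
ddb_path = pb.get_data_path(dataset="small_table", file_type="duckdb")
print(ddb_path)  # e.g. .../data/small_table.ddb

# Assumed table name inside the .ddb file: "small_table"
validation = (
    pb.Validate(data=f"duckdb://{ddb_path}::small_table")
    .col_exists(["a", "b", "c"])
    .interrogate()
)
```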
@@ -8971,6 +9592,11 @@ get_column_count(data: 'FrameT | Any') -> 'int'
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -9028,6 +9654,11 @@ get_row_count(data: 'FrameT | Any') -> 'int'
 - MySQL table (`"mysql"`)*
 - PostgreSQL table (`"postgresql"`)*
 - SQLite table (`"sqlite"`)*
+- Microsoft SQL Server table (`"mssql"`)*
+- Snowflake table (`"snowflake"`)*
+- Databricks table (`"databricks"`)*
+- PySpark table (`"pyspark"`)*
+- BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
@@ -9467,7 +10098,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
         thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
         actions=pb.Actions(critical=notify_slack),
     )
-    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
     .col_vals_gt(columns="item_revenue", value=0.05)
     .col_vals_gt(columns="session_duration", value=15)
     .interrogate()
@@ -9499,7 +10130,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
         thresholds=pb.Thresholds(warning=0.05, error=0.10, critical=0.15),
         final_actions=pb.FinalActions(notify_slack),
     )
-    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
     .col_vals_gt(columns="item_revenue", value=0.05)
     .col_vals_gt(columns="session_duration", value=15)
     .interrogate()
@@ -9567,7 +10198,7 @@ send_slack_notification(webhook_url: 'str | None' = None, step_msg: 'str | None'
         actions=pb.Actions(default=notify_slack),
         final_actions=pb.FinalActions(notify_slack),
     )
-    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}
+    .col_vals_regex(columns="player_id", pattern=r"[A-Z]{12}[0-9]{3}")
     .col_vals_gt(columns="item_revenue", value=0.05)
     .col_vals_gt(columns="session_duration", value=15)
     .interrogate()
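
The three hunks above change the `col_vals_regex()` pattern in this example to `[A-Z]{12}[0-9]{3}` (the removed lines are cut off in this rendering, so the previous pattern isn't fully visible). A standalone sketch of what the completed pattern matches; the sample IDs are made up:

```python
import re

# Twelve uppercase letters followed by three digits
pattern = r"[A-Z]{12}[0-9]{3}"

print(bool(re.fullmatch(pattern, "ABCDEFGHIJKL123")))  # True
print(bool(re.fullmatch(pattern, "ABCDEF123")))        # False: only six letters
```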