pointblank 0.9.6__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +4 -0
- pointblank/_datascan_utils.py +65 -0
- pointblank/_utils.py +126 -0
- pointblank/_utils_html.py +40 -0
- pointblank/assistant.py +1 -3
- pointblank/cli.py +2737 -0
- pointblank/compare.py +27 -0
- pointblank/data/api-docs.txt +518 -125
- pointblank/datascan.py +318 -959
- pointblank/scan_profile.py +321 -0
- pointblank/scan_profile_stats.py +180 -0
- pointblank/schema.py +14 -3
- pointblank/validate.py +1425 -202
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/METADATA +49 -3
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/RECORD +20 -14
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/WHEEL +1 -1
- pointblank-0.11.0.dist-info/entry_points.txt +2 -0
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.9.6.dist-info → pointblank-0.11.0.dist-info}/top_level.txt +0 -0
pointblank/data/api-docs.txt
CHANGED
@@ -42,8 +42,14 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 Parameters
 ----------
 data
-    The table to validate, which could be a DataFrame object
-
+    The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
+    file path, a Parquet file path, or a database connection string. When providing a CSV or
+    Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
+    loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
+    glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
+    Connection strings enable direct database access via Ibis with optional table specification
+    using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
+    on the supported table types.
 tbl_name
     An optional name to assign to the input table object. If no value is provided, a name will
     be generated based on whatever information is available. This table name will be displayed
@@ -113,12 +119,34 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 - PySpark table (`"pyspark"`)*
 - BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
+- CSV files (string path or `pathlib.Path` object with `.csv` extension)
+- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+  extension, or partitioned dataset)
+- Database connection strings (URI format with optional table specification)
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
 `ibis.expr.types.relations.Table`). Furthermore, the use of `Validate` with such tables requires
 the Ibis library v9.5.0 and above to be installed. If the input table is a Polars or Pandas
 DataFrame, the Ibis library is not required.
 
+To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+provided. The file will be automatically detected and loaded using the best available DataFrame
+library. The loading preference is Polars first, then Pandas as a fallback.
+
+Connection strings follow database URL formats and must also specify a table using the
+`::table_name` suffix. Examples include:
+
+```
+"duckdb:///path/to/database.ddb::table_name"
+"sqlite:///path/to/database.db::table_name"
+"postgresql://user:password@localhost:5432/database::table_name"
+"mysql://user:password@localhost:3306/database::table_name"
+"bigquery://project/dataset::table_name"
+"snowflake://user:password@account/database/schema::table_name"
+```
+
+When using connection strings, the Ibis library with the appropriate backend driver is required.
+
 Thresholds
 ----------
 The `thresholds=` parameter is used to set the failure-condition levels for all validation
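
For orientation, a minimal sketch of the connection-string input documented in the hunk above (the database path and the `id` column are hypothetical placeholders):

```python
import pointblank as pb

# Hypothetical DuckDB database; the `::table_name` suffix selects the table
validation = (
    pb.Validate(data="duckdb:///path/to/database.ddb::table_name")
    .col_vals_not_null(columns="id")  # `id` is a placeholder column name
    .interrogate()
)
```
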
@@ -275,8 +303,8 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 ```python
 import pointblank as pb
 
-# Load the small_table dataset
-small_table = pb.load_dataset()
+# Load the `small_table` dataset
+small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
 
 # Preview the table
 pb.preview(small_table)
@@ -342,7 +370,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 brief). Here's an example of a global setting for briefs:
 
 ```python
-
+validation_2 = (
     pb.Validate(
         data=pb.load_dataset(),
         tbl_name="small_table",
@@ -359,7 +387,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
     .interrogate()
 )
 
-
+validation_2
 ```
 
 We see the text of the briefs appear in the `STEP` column of the reporting table. Furthermore,
@@ -377,7 +405,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 the data extracts for each validation step.
 
 ```python
-
+validation_2.get_data_extracts()
 ```
 
 We can also view step reports for each validation step using the
@@ -385,7 +413,7 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 type of validation step and shows the relevant information for a step's validation.
 
 ```python
-
+validation_2.get_step_report(i=2)
 ```
 
 The `Validate` class also has a method for getting the sundered data, which is the data that
@@ -393,11 +421,141 @@ Validate(data: 'FrameT | Any', tbl_name: 'str | None' = None, label: 'str | None
 [`get_sundered_data()`](`pointblank.Validate.get_sundered_data`) method.
 
 ```python
-pb.preview(
+pb.preview(validation_2.get_sundered_data())
 ```
 
 The sundered data is a DataFrame that contains the rows that passed or failed the validation.
 The default behavior is to return the rows that failed the validation, as shown above.
+
+### Working with CSV Files
+
+The `Validate` class can directly accept CSV file paths, making it easy to validate data stored
+in CSV files without manual loading:
+
+```python
+# Get a path to a CSV file from the package data
+csv_path = pb.get_data_path("global_sales", "csv")
+
+validation_3 = (
+    pb.Validate(
+        data=csv_path,
+        label="CSV validation example"
+    )
+    .col_exists(["customer_id", "product_id", "revenue"])
+    .col_vals_not_null(["customer_id", "product_id"])
+    .col_vals_gt(columns="revenue", value=0)
+    .interrogate()
+)
+
+validation_3
+```
+
+You can also use a Path object to specify the CSV file. Here's an example of how to do that:
+
+```python
+from pathlib import Path
+
+csv_file = Path(pb.get_data_path("game_revenue", "csv"))
+
+validation_4 = (
+    pb.Validate(data=csv_file, label="Game Revenue Validation")
+    .col_exists(["player_id", "session_id", "item_name"])
+    .col_vals_regex(
+        columns="session_id",
+        pattern=r"[A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{12}"
+    )
+    .col_vals_gt(columns="item_revenue", value=0, na_pass=True)
+    .interrogate()
+)
+
+validation_4
+```
+
+The CSV loading is automatic, so when a string or Path with a `.csv` extension is provided,
+Pointblank will automatically load the file using the best available DataFrame library (Polars
+preferred, Pandas as fallback). The loaded data can then be used with all validation methods
+just like any other supported table type.
+
+### Working with Parquet Files
+
+The `Validate` class can directly accept Parquet files and datasets in various formats. The
+following examples illustrate how to validate Parquet files:
+
+```python
+# Single Parquet file from package data
+parquet_path = pb.get_data_path("nycflights", "parquet")
+
+validation_5 = (
+    pb.Validate(
+        data=parquet_path,
+        tbl_name="NYC Flights Data"
+    )
+    .col_vals_not_null(["carrier", "origin", "dest"])
+    .col_vals_gt(columns="distance", value=0)
+    .interrogate()
+)
+
+validation_5
+```
+
+You can also use glob patterns and directories. Here are some examples for how to:
+
+1. load multiple Parquet files
+2. load a Parquet-containing directory
+3. load a partitioned Parquet dataset
+
+```python
+# Multiple Parquet files with glob patterns
+validation_6 = pb.Validate(data="data/sales_*.parquet")
+
+# Directory containing Parquet files
+validation_7 = pb.Validate(data="parquet_data/")
+
+# Partitioned Parquet dataset
+validation_8 = (
+    pb.Validate(data="sales_data/")  # Contains year=2023/quarter=Q1/region=US/sales.parquet
+    .col_exists(["transaction_id", "amount", "year", "quarter", "region"])
+    .interrogate()
+)
+```
+
+When you point to a directory that contains a partitioned Parquet dataset (with subdirectories
+like `year=2023/quarter=Q1/region=US/`), Pointblank will automatically:
+
+- discover all Parquet files recursively
+- extract partition column values from directory paths
+- add partition columns to the final DataFrame
+- combine all partitions into a single table for validation
+
+Both Polars and Pandas handle partitioned datasets natively, so this works seamlessly with
+either DataFrame library. The loading preference is Polars first, then Pandas as a fallback.
+
+### Working with Database Connection Strings
+
+The `Validate` class supports database connection strings for direct validation of database
+tables. Connection strings must specify a table using the `::table_name` suffix:
+
+```python
+# Get path to a DuckDB database file from package data
+duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+validation_9 = (
+    pb.Validate(
+        data=f"duckdb:///{duckdb_path}::game_revenue",
+        label="DuckDB Game Revenue Validation"
+    )
+    .col_exists(["player_id", "session_id", "item_revenue"])
+    .col_vals_gt(columns="item_revenue", value=0)
+    .interrogate()
+)
+
+validation_9
+```
+
+For comprehensive documentation on supported connection string formats, error handling, and
+installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
+function. This function handles all the connection logic and provides helpful error messages
+when table specifications are missing or backend dependencies are not installed.
 
 
 Thresholds(warning: 'int | float | bool | None' = None, error: 'int | float | bool | None' = None, critical: 'int | float | bool | None' = None) -> None
@@ -1287,12 +1445,16 @@ col_vals_gt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
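
To make the `segments=` option concrete, a minimal sketch of a segmented validation step (this assumes a single column name is accepted as the simplest segment spec, and that `f` is a categorical column in the packaged `small_table` dataset):

```python
import pointblank as pb

validation = (
    pb.Validate(data=pb.load_dataset(dataset="small_table", tbl_type="polars"))
    .col_vals_gt(columns="d", value=0, segments="f")  # one step per unique value of `f`
    .interrogate()
)
```
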
@@ -1508,12 +1670,16 @@ col_vals_lt(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -1729,12 +1895,16 @@ col_vals_ge(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -1950,12 +2120,16 @@ col_vals_le(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2171,12 +2345,16 @@ col_vals_eq(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2390,12 +2568,16 @@ col_vals_ne(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSe
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2619,12 +2801,16 @@ col_vals_between(self, columns: 'str | list[str] | Column | ColumnSelector | Col
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -2860,12 +3046,16 @@ col_vals_outside(self, columns: 'str | list[str] | Column | ColumnSelector | Col
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3060,12 +3250,16 @@ col_vals_in_set(self, columns: 'str | list[str] | Column | ColumnSelector | Colu
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3246,12 +3440,16 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3347,9 +3545,9 @@ col_vals_not_in_set(self, columns: 'str | list[str] | Column | ColumnSelector |
 
 col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
-Validate whether values in a column are
+Validate whether values in a column are Null.
 
-The `col_vals_null()` validation method checks whether column values in a table are
+The `col_vals_null()` validation method checks whether column values in a table are Null.
 This validation will operate over the number of test units that is equal to the number
 of rows in the table.
 
@@ -3430,12 +3628,16 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3529,10 +3731,10 @@ col_vals_null(self, columns: 'str | list[str] | Column | ColumnSelector | Column
 
 col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | ColumnSelectorNarwhals', pre: 'Callable | None' = None, segments: 'SegmentSpec | None' = None, thresholds: 'int | float | bool | tuple | dict | Thresholds' = None, actions: 'Actions | None' = None, brief: 'str | bool | None' = None, active: 'bool' = True) -> 'Validate'
 
-Validate whether values in a column are not
+Validate whether values in a column are not Null.
 
 The `col_vals_not_null()` validation method checks whether column values in a table are not
-
+Null. This validation will operate over the number of test units that is equal to the number
 of rows in the table.
 
 Parameters
@@ -3612,12 +3814,16 @@ col_vals_not_null(self, columns: 'str | list[str] | Column | ColumnSelector | Co
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3800,12 +4006,16 @@ col_vals_regex(self, columns: 'str | list[str] | Column | ColumnSelector | Colum
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -3982,12 +4192,16 @@ col_vals_expr(self, expr: 'any', pre: 'Callable | None' = None, segments: 'Segme
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4272,12 +4486,16 @@ rows_distinct(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -4458,12 +4676,16 @@ rows_complete(self, columns_subset: 'str | list[str] | None' = None, pre: 'Calla
 (i.e., no validation steps will be created for them).
 
 A list with a combination of column names and tuples can be provided as well. This allows
-for more complex segmentation scenarios. The following inputs are
+for more complex segmentation scenarios. The following inputs are both valid:
 
-
-
-
-
+```
+# Segments from all unique values in the `region` column
+# and specific dates in the `date` column
+segments=["region", ("date", ["2023-01-01", "2023-01-02"])]
+
+# Segments from all unique values in the `region` and `date` columns
+segments=["region", "date"]
+```
 
 The segmentation is performed during interrogation, and the resulting validation steps will
 be numbered sequentially. Each segment will have its own validation step, and the results
@@ -7090,24 +7312,25 @@ get_step_report(self, i: 'int', columns_subset: 'str | list[str] | Column | None
 Types of Step Reports
 ---------------------
 The `get_step_report()` method produces a report based on the *type* of validation step.
-The following row-based validation methods will produce a
-
+The following column-value or row-based validation step validation methods will produce a
+report that shows the rows of the data that failed:
 
 - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
+- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
 - [`col_vals_lt()`](`pointblank.Validate.col_vals_lt`)
+- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
 - [`col_vals_eq()`](`pointblank.Validate.col_vals_eq`)
 - [`col_vals_ne()`](`pointblank.Validate.col_vals_ne`)
-- [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
-- [`col_vals_le()`](`pointblank.Validate.col_vals_le`)
 - [`col_vals_between()`](`pointblank.Validate.col_vals_between`)
 - [`col_vals_outside()`](`pointblank.Validate.col_vals_outside`)
 - [`col_vals_in_set()`](`pointblank.Validate.col_vals_in_set`)
 - [`col_vals_not_in_set()`](`pointblank.Validate.col_vals_not_in_set`)
-- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
-- [`
+- [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
 - [`conjointly()`](`pointblank.Validate.conjointly`)
+- [`rows_complete()`](`pointblank.Validate.rows_complete`)
 
 The [`rows_distinct()`](`pointblank.Validate.rows_distinct`) validation step will produce a
 report that shows duplicate rows (or duplicate values in one or a set of columns as defined
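
A minimal sketch of pulling a step report narrowed by `columns_subset=`, per the signature in the hunk header above (this assumes an interrogated `validation` object; the column names are placeholders):

```python
# Step report for step 1, showing only a couple of columns
validation.get_step_report(i=1, columns_subset=["player_id", "item_revenue"])
```
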
@@ -7325,10 +7548,10 @@ get_sundered_data(self, type='pass') -> 'FrameT'
 Get the data that passed or failed the validation steps.
 
 Validation of the data is one thing but, sometimes, you want to use the best part of the
-input dataset for something else. The `get_sundered_data()` method works with a Validate
+input dataset for something else. The `get_sundered_data()` method works with a `Validate`
 object that has been interrogated (i.e., the
 [`interrogate()`](`pointblank.Validate.interrogate`) method was used). We can get either the
-'pass' data piece (rows with no failing test units across all
+'pass' data piece (rows with no failing test units across all column-value based validation
 functions), or, the 'fail' data piece (rows with at least one failing test unit across the
 same series of validations).
 
@@ -7337,7 +7560,7 @@ get_sundered_data(self, type='pass') -> 'FrameT'
 There are some caveats to sundering. The validation steps considered for this splitting will
 only involve steps where:
 
-- of certain check types, where test units are cells checked
+- of certain check types, where test units are cells checked down a column (e.g., the
   `col_vals_*()` methods)
 - `active=` is not set to `False`
 - `pre=` has not been given an expression for modifying the input table
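
A small usage sketch of the `type=` argument shown in the `get_sundered_data(self, type='pass')` signature above (assuming an interrogated `validation` object):

```python
# Rows that passed across all sunderable validation steps (the default)
passed_rows = validation.get_sundered_data(type="pass")

# Rows with at least one failing test unit across those steps
failed_rows = validation.get_sundered_data(type="fail")
```
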
@@ -7406,11 +7629,13 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 Get the rows that failed for each validation step.
 
 After the [`interrogate()`](`pointblank.Validate.interrogate`) method has been called, the
-`get_data_extracts()` method can be used to extract the rows that failed in each
-validation step (e.g.,
-
-
-
+`get_data_extracts()` method can be used to extract the rows that failed in each
+column-value or row-based validation step (e.g.,
+[`col_vals_gt()`](`pointblank.Validate.col_vals_gt`),
+[`rows_distinct()`](`pointblank.Validate.rows_distinct`), etc.). The method returns a
+dictionary of tables containing the rows that failed in every validation step. If
+`frame=True` and `i=` is a scalar, the value is conveniently returned as a table (forgoing
+the dictionary structure).
 
 Parameters
 ----------
@@ -7423,13 +7648,13 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 Returns
 -------
 dict[int, FrameT | None] | FrameT | None
-    A dictionary of tables containing the rows that failed in every
-    step
+    A dictionary of tables containing the rows that failed in every compatible validation
+    step. Alternatively, it can be a DataFrame if `frame=True` and `i=` is a scalar.
 
-Validation Methods
-
-The following validation methods
-failing test units.
+Compatible Validation Methods for Yielding Extracted Rows
+---------------------------------------------------------
+The following validation methods operate on column values and will have rows extracted when
+there are failing test units.
 
 - [`col_vals_gt()`](`pointblank.Validate.col_vals_gt`)
 - [`col_vals_ge()`](`pointblank.Validate.col_vals_ge`)
@@ -7444,11 +7669,20 @@ get_data_extracts(self, i: 'int | list[int] | None' = None, frame: 'bool' = Fals
 - [`col_vals_null()`](`pointblank.Validate.col_vals_null`)
 - [`col_vals_not_null()`](`pointblank.Validate.col_vals_not_null`)
 - [`col_vals_regex()`](`pointblank.Validate.col_vals_regex`)
+- [`col_vals_expr()`](`pointblank.Validate.col_vals_expr`)
+- [`conjointly()`](`pointblank.Validate.conjointly`)
+
+An extracted row for these validation methods means that a test unit failed for that row in
+the validation step.
+
+These row-based validation methods will also have rows extracted should there be failing
+rows:
+
 - [`rows_distinct()`](`pointblank.Validate.rows_distinct`)
+- [`rows_complete()`](`pointblank.Validate.rows_complete`)
 
-
-
-understanding the nature of the failing test units.
+The extracted rows are a subset of the original table and are useful for further analysis
+or for understanding the nature of the failing test units.
 
 Examples
 --------
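
A minimal sketch of the two return shapes described above (assuming an interrogated `validation` object):

```python
# A dictionary of extracts keyed by validation step number
extracts = validation.get_data_extracts()

# A single step's extract returned directly as a table
step_2_failures = validation.get_data_extracts(i=2, frame=True)
```
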
@@ -8455,7 +8689,7 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
 Get the 'critical' level status for each validation step.
 
 The 'critical' status for a validation step is `True` if the fraction of failing test units
-meets or exceeds the threshold for the
+meets or exceeds the threshold for the 'critical' level. Otherwise, the status is `False`.
 
 The ascribed name of 'critical' is semantic and is thus simply a status indicator that could
 be used to trigger some action to be take. Here's how it fits in with other status
@@ -8467,14 +8701,14 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
   severity
 - 'critical': the status obtained by calling `critical()`, most severe
 
-This method provides a dictionary of the
-
-
+This method provides a dictionary of the 'critical' status for each validation step. If the
+`scalar=True` argument is provided and `i=` is a scalar, the value is returned as a scalar
+instead of a dictionary.
 
 Parameters
 ----------
 i
-    The validation step number(s) from which the
+    The validation step number(s) from which the 'critical' status is obtained. Can be
     provided as a list of integers or a single integer. If `None`, all steps are included.
 scalar
     If `True` and `i=` is a scalar, return the value as a scalar instead of a dictionary.
@@ -8482,7 +8716,7 @@ critical(self, i: 'int | list[int] | None' = None, scalar: 'bool' = False) -> 'd
 Returns
 -------
 dict[int, bool] | bool
-    A dictionary of the
+    A dictionary of the 'critical' status for each validation step or a scalar value.
 
 Examples
 --------
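
A short sketch of the two calling patterns for `critical()` documented above (assuming an interrogated `validation` object):

```python
# 'critical' status of every validation step, as a dictionary
statuses = validation.critical()

# Status of step 1 alone, returned as a scalar boolean
is_critical = validation.critical(i=1, scalar=True)
```
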
@@ -8562,7 +8796,7 @@ datasets included in the package can be accessed via the `load_dataset()` functi
 `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
 the `assistant()` function to get help with Pointblank.
 
-DataScan(data: '
+DataScan(data: 'IntoFrameT', tbl_name: 'str | None' = None) -> 'None'
 
 Get a summary of a dataset.
 
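
To illustrate the `DataScan` signature restored above, a minimal sketch using the packaged `small_table` dataset (a Polars DataFrame satisfies the `IntoFrameT` input type):

```python
import pointblank as pb

# Produce a summary scan of the dataset
scan = pb.DataScan(data=pb.load_dataset(dataset="small_table", tbl_type="polars"))
```
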
@@ -8676,8 +8910,14 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
 Parameters
 ----------
 data
-    The table to preview, which could be a DataFrame object
-
+    The table to preview, which could be a DataFrame object, an Ibis table object, a CSV
+    file path, a Parquet file path, or a database connection string. When providing a CSV or
+    Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
+    loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
+    glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
+    Connection strings enable direct database access via Ibis with optional table specification
+    using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
+    on the supported table types.
 columns_subset
     The columns to display in the table, by default `None` (all columns are shown). This can
     be a string, a list of strings, a `Column` object, or a `ColumnSelector` object. The latter
@@ -8728,12 +8968,34 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
 - PySpark table (`"pyspark"`)*
 - BigQuery table (`"bigquery"`)*
 - Parquet table (`"parquet"`)*
+- CSV files (string path or `pathlib.Path` object with `.csv` extension)
+- Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+  extension, or partitioned dataset)
+- Database connection strings (URI format with optional table specification)
 
 The table types marked with an asterisk need to be prepared as Ibis tables (with type of
 `ibis.expr.types.relations.Table`). Furthermore, using `preview()` with these types of tables
 requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
 Pandas DataFrame, the availability of Ibis is not needed.
 
+To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+provided. The file will be automatically detected and loaded using the best available DataFrame
+library. The loading preference is Polars first, then Pandas as a fallback.
+
+Connection strings follow database URL formats and must also specify a table using the
+`::table_name` suffix. Examples include:
+
+```
+"duckdb:///path/to/database.ddb::table_name"
+"sqlite:///path/to/database.db::table_name"
+"postgresql://user:password@localhost:5432/database::table_name"
+"mysql://user:password@localhost:3306/database::table_name"
+"bigquery://project/dataset::table_name"
+"snowflake://user:password@account/database/schema::table_name"
+```
+
+When using connection strings, the Ibis library with the appropriate backend driver is required.
+
 Examples
 --------
 It's easy to preview a table using the `preview()` function. Here's an example using the
@@ -8792,6 +9054,39 @@ preview(data: 'FrameT | Any', columns_subset: 'str | list[str] | Column | None'
     columns_subset=pb.col(pb.starts_with("item") | pb.matches("player"))
 )
 ```
+
+### Working with CSV Files
+
+The `preview()` function can directly accept CSV file paths, making it easy to preview data
+stored in CSV files without manual loading:
+
+You can also use a Path object to specify the CSV file:
+
+### Working with Parquet Files
+
+The `preview()` function can directly accept Parquet files and datasets in various formats:
+
+You can also use glob patterns and directories:
+
+```python
+# Multiple Parquet files with glob patterns
+pb.preview("data/sales_*.parquet")
+
+# Directory containing Parquet files
+pb.preview("parquet_data/")
+
+# Partitioned Parquet dataset
+pb.preview("sales_data/")  # Auto-discovers partition columns
+```
+
+### Working with Database Connection Strings
+
+The `preview()` function supports database connection strings for direct preview of database
+tables. Connection strings must specify a table using the `::table_name` suffix:
+
+For comprehensive documentation on supported connection string formats, error handling, and
+installation requirements, see the [`connect_to_table()`](`pointblank.connect_to_table`)
+function.
 
 
 col_summary_tbl(data: 'FrameT | Any', tbl_name: 'str | None' = None) -> 'GT'
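
The CSV prose added above leads into examples that this docs snapshot does not carry; a minimal sketch of what such `preview()` calls look like, using the packaged datasets via `get_data_path()`:

```python
from pathlib import Path

import pointblank as pb

# Preview a packaged CSV file directly from its string path
pb.preview(pb.get_data_path("game_revenue", "csv"))

# The same file via a `pathlib.Path` object
pb.preview(Path(pb.get_data_path("game_revenue", "csv")))
```
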
@@ -9160,6 +9455,104 @@ load_dataset(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'glo
 regions: North America, Europe, or Asia.
 
 
+get_data_path(dataset: "Literal['small_table', 'game_revenue', 'nycflights', 'global_sales']" = 'small_table', file_type: "Literal['csv', 'parquet', 'duckdb']" = 'csv') -> 'str'
+
+Get the file path to a dataset included with the Pointblank package.
+
+This function provides direct access to the file paths of datasets included with Pointblank.
+These paths can be used in examples and documentation to demonstrate file-based data loading
+without requiring the actual data files. The returned paths can be used with
+`Validate(data=path)` to demonstrate CSV and Parquet file loading capabilities.
+
+Parameters
+----------
+dataset
+    The name of the dataset to get the path for. Current options are `"small_table"`,
+    `"game_revenue"`, `"nycflights"`, and `"global_sales"`.
+file_type
+    The file format to get the path for. Options are `"csv"`, `"parquet"`, or `"duckdb"`.
+
+Returns
+-------
+str
+    The file path to the requested dataset file.
+
+Included Datasets
+-----------------
+The available datasets are the same as those in [`load_dataset()`](`pointblank.load_dataset`):
+
+- `"small_table"`: A small dataset with 13 rows and 8 columns. Ideal for testing and examples.
+- `"game_revenue"`: A dataset with 2000 rows and 11 columns. Revenue data for a game company.
+- `"nycflights"`: A dataset with 336,776 rows and 18 columns. Flight data from NYC airports.
+- `"global_sales"`: A dataset with 50,000 rows and 20 columns. Global sales data across regions.
+
+File Types
+----------
+Each dataset is available in multiple formats:
+
+- `"csv"`: Comma-separated values file (`.csv`)
+- `"parquet"`: Parquet file (`.parquet`)
+- `"duckdb"`: DuckDB database file (`.ddb`)
+
+Examples
+--------
+Get the path to a CSV file and use it with `Validate`:
+
+```python
+import pointblank as pb
+
+# Get path to the small_table CSV file
+csv_path = pb.get_data_path("small_table", "csv")
+print(csv_path)
+
+# Use the path directly with Validate
+validation = (
+    pb.Validate(data=csv_path)
+    .col_exists(["a", "b", "c"])
+    .col_vals_gt(columns="d", value=0)
+    .interrogate()
+)
+
+validation
+```
+
+Get a Parquet file path for validation examples:
+
+```python
+# Get path to the game_revenue Parquet file
+parquet_path = pb.get_data_path(dataset="game_revenue", file_type="parquet")
+
+# Validate the Parquet file directly
+validation = (
+    pb.Validate(data=parquet_path, label="Game Revenue Data Validation")
+    .col_vals_not_null(columns=["player_id", "session_id"])
+    .col_vals_gt(columns="item_revenue", value=0)
+    .interrogate()
+)
+
+validation
+```
+
+This is particularly useful for documentation examples where you want to demonstrate
+file-based workflows without requiring users to have specific data files:
+
+```python
+# Example showing CSV file validation
+sales_csv = pb.get_data_path(dataset="global_sales", file_type="csv")
+
+validation = (
+    pb.Validate(data=sales_csv, label="Sales Data Validation")
+    .col_exists(["customer_id", "product_id", "amount"])
+    .col_vals_regex(columns="customer_id", pattern=r"CUST_[0-9]{6}")
+    .interrogate()
+)
+```
+
+See Also
+--------
+[`load_dataset()`](`pointblank.load_dataset`) for loading datasets directly as table objects.
+
+
 
 ## The Utility Functions family
 