pointblank 0.11.0__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/assistant.py +14 -3
- pointblank/cli.py +2418 -1511
- pointblank/compare.py +9 -0
- pointblank/datascan.py +25 -3
- pointblank/validate.py +346 -37
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/METADATA +16 -10
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/RECORD +11 -11
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/WHEEL +0 -0
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/top_level.txt +0 -0
pointblank/compare.py
CHANGED
@@ -10,6 +10,15 @@ if TYPE_CHECKING:
 
 class Compare:
     def __init__(self, a: IntoFrame, b: IntoFrame) -> None:
+        # Import processing functions from validate module
+        from pointblank.validate import _process_data
+
+        # Process input data for table a
+        a = _process_data(a)
+
+        # Process input data for table b
+        b = _process_data(b)
+
         self.a: IntoFrame = a
         self.b: IntoFrame = b
 
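The net effect is that `Compare` now accepts the same kinds of inputs as the rest of the library. A minimal usage sketch, assuming `Compare` is importable from `pointblank.compare` as shown above; the CSV file names are hypothetical:

```python
# Both inputs are routed through _process_data(), so file paths are loaded
# into DataFrames before the comparison object stores them.
from pointblank.compare import Compare

# Hypothetical CSV files; any supported source (DataFrame, Parquet path,
# connection string, GitHub URL) could be used in their place.
cmp = Compare(a="baseline.csv", b="candidate.csv")
```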
pointblank/datascan.py
CHANGED
@@ -56,7 +56,9 @@ class DataScan:
     Parameters
     ----------
     data
-        The data to scan and summarize.
+        The data to scan and summarize. This could be a DataFrame object, an Ibis table object,
+        a CSV file path, a Parquet file path, a GitHub URL pointing to a CSV or Parquet file,
+        or a database connection string.
     tbl_name
         Optionally, the name of the table could be provided as `tbl_name`.
 
@@ -122,6 +124,14 @@ class DataScan:
 
     # TODO: This needs to be generically typed at the class level, ie. DataScan[T]
     def __init__(self, data: IntoFrameT, tbl_name: str | None = None) -> None:
+        # Import processing functions from validate module
+        from pointblank.validate import (
+            _process_data,
+        )
+
+        # Process input data to handle different data source types
+        data = _process_data(data)
+
         as_native = nw.from_native(data)
 
         if as_native.implementation.name == "IBIS" and as_native._level == "lazy":
@@ -514,8 +524,9 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     Parameters
     ----------
     data
-        The table to summarize, which could be a DataFrame object
-
+        The table to summarize, which could be a DataFrame object, an Ibis table object, a CSV
+        file path, a Parquet file path, or a database connection string. Read the *Supported Input
+        Table Types* section for details on the supported table types.
     tbl_name
         Optionally, the name of the table could be provided as `tbl_name=`.
 
@@ -535,6 +546,11 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     - PostgreSQL table (`"postgresql"`)*
     - SQLite table (`"sqlite"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - GitHub URLs (direct links to CSV or Parquet files on GitHub)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `col_summary_tbl()` with these types of
@@ -566,5 +582,11 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     ```
     """
 
+    # Import processing functions from validate module
+    from pointblank.validate import _process_data
+
+    # Process input data to handle different data source types
+    data = _process_data(data)
+
     scanner = DataScan(data=data, tbl_name=tbl_name)
     return scanner.get_tabular_report()
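Since `DataScan` and `col_summary_tbl()` now route their input through `_process_data()`, a file path or connection string can be passed in directly. A minimal sketch, assuming `col_summary_tbl()` and `get_data_path()` are exported at the package level in the same way as in the docstring examples elsewhere in this diff:

```python
import pointblank as pb

# CSV file bundled with the package (get_data_path() appears in this diff)
csv_path = pb.get_data_path("global_sales", "csv")
pb.col_summary_tbl(csv_path)

# The same call against a DuckDB table, using the ::table_name suffix
duckdb_path = pb.get_data_path("game_revenue", "duckdb")
pb.col_summary_tbl(f"duckdb:///{duckdb_path}::game_revenue")
```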
pointblank/validate.py
CHANGED
@@ -735,9 +735,157 @@ def get_data_path(
     return tmp_file.name
 
 
-
-
-
+def _process_data(data: FrameT | Any) -> FrameT | Any:
+    """
+    Centralized data processing pipeline that handles all supported input types.
+
+    This function consolidates the data processing pipeline used across multiple
+    classes and functions in Pointblank. It processes data through a consistent
+    sequence of transformations to handle different data source types.
+
+    The processing order is important:
+
+    1. GitHub URLs (must come before connection string processing)
+    2. Database connection strings
+    3. CSV file paths
+    4. Parquet file paths
+
+    Parameters
+    ----------
+    data : FrameT | Any
+        The input data which could be:
+        - a DataFrame object (Polars, Pandas, Ibis, etc.)
+        - a GitHub URL pointing to a CSV or Parquet file
+        - a database connection string (e.g., "duckdb:///path/to/file.ddb::table_name")
+        - a CSV file path (string or Path object with .csv extension)
+        - a Parquet file path, glob pattern, directory, or partitioned dataset
+        - any other data type (returned unchanged)
+
+    Returns
+    -------
+    FrameT | Any
+        Processed data as a DataFrame if input was a supported data source type,
+        otherwise the original data unchanged.
+    """
+    # Handle GitHub URL input (e.g., "https://github.com/user/repo/blob/main/data.csv")
+    data = _process_github_url(data)
+
+    # Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
+    data = _process_connection_string(data)
+
+    # Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
+    data = _process_csv_input(data)
+
+    # Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
+    data = _process_parquet_input(data)
+
+    return data
+
+
+def _process_github_url(data: FrameT | Any) -> FrameT | Any:
+    """
+    Process data parameter to handle GitHub URLs pointing to CSV or Parquet files.
+
+    Handles both standard GitHub URLs and raw GitHub content URLs, downloading the content
+    and processing it as a local file.
+
+    Supports:
+    - Standard github.com URLs pointing to CSV or Parquet files (automatically transformed to raw URLs)
+    - Raw raw.githubusercontent.com URLs pointing to CSV or Parquet files (processed directly)
+    - Both CSV and Parquet file formats
+    - Automatic temporary file management and cleanup
+
+    Parameters
+    ----------
+    data : FrameT | Any
+        The data parameter which may be a GitHub URL string or any other data type.
+
+    Returns
+    -------
+    FrameT | Any
+        If the input is a supported GitHub URL, returns a DataFrame loaded from the downloaded file.
+        Otherwise, returns the original data unchanged.
+
+    Examples
+    --------
+    Standard GitHub URL (automatically transformed):
+    >>> url = "https://github.com/user/repo/blob/main/data.csv"
+    >>> df = _process_github_url(url)
+
+    Raw GitHub URL (used directly):
+    >>> raw_url = "https://raw.githubusercontent.com/user/repo/main/data.csv"
+    >>> df = _process_github_url(raw_url)
+    """
+    import re
+    import tempfile
+    from urllib.parse import urlparse
+    from urllib.request import urlopen
+
+    # Check if data is a string that looks like a GitHub URL
+    if not isinstance(data, str):
+        return data
+
+    # Parse the URL to check if it's a GitHub URL
+    try:
+        parsed = urlparse(data)
+    except Exception:
+        return data
+
+    # Check if it's a GitHub URL (standard or raw)
+    is_standard_github = parsed.netloc in ["github.com", "www.github.com"]
+    is_raw_github = parsed.netloc == "raw.githubusercontent.com"
+
+    if not (is_standard_github or is_raw_github):
+        return data
+
+    # Check if it points to a CSV or Parquet file
+    path_lower = parsed.path.lower()
+    if not (path_lower.endswith(".csv") or path_lower.endswith(".parquet")):
+        return data
+
+    # Determine the raw URL to download from
+    if is_raw_github:
+        # Already a raw GitHub URL, use it directly
+        raw_url = data
+    else:
+        # Transform GitHub URL to raw content URL
+        # Pattern: https://github.com/user/repo/blob/branch/path/file.ext
+        # Becomes: https://raw.githubusercontent.com/user/repo/branch/path/file.ext
+        github_pattern = r"github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)"
+        match = re.search(github_pattern, data)
+
+        if not match:
+            # If URL doesn't match expected GitHub blob pattern, return original data
+            return data
+
+        user, repo, branch, file_path = match.groups()
+        raw_url = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{file_path}"
+
+    # Download the file content to a temporary file
+    try:
+        with urlopen(raw_url) as response:
+            content = response.read()
+
+        # Determine file extension
+        file_ext = ".csv" if path_lower.endswith(".csv") else ".parquet"
+
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_ext, delete=False) as tmp_file:
+            tmp_file.write(content)
+            tmp_file_path = tmp_file.name
+
+        # Process the temporary file using existing CSV or Parquet processing functions
+        if file_ext == ".csv":
+            return _process_csv_input(tmp_file_path)
+        else:  # .parquet
+            return _process_parquet_input(tmp_file_path)
+
+    except Exception:
+        # If download or processing fails, return original data
+        return data
+
+    except Exception as e:
+        raise RuntimeError(f"Failed to download or process GitHub file from {raw_url}: {e}") from e
 
 
 def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
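To make the URL handling concrete, here is a standalone sketch of the blob-to-raw rewrite that `_process_github_url()` performs, using the same regex shown above; the repository URL is hypothetical:

```python
import re

# A standard GitHub "blob" URL (hypothetical repository)
url = "https://github.com/user/repo/blob/main/data/sales.csv"

# Same pattern as in _process_github_url() above
github_pattern = r"github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)"
match = re.search(github_pattern, url)

if match:
    user, repo, branch, file_path = match.groups()
    raw_url = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{file_path}"
    # Prints: https://raw.githubusercontent.com/user/repo/main/data/sales.csv
    print(raw_url)
```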
@@ -1215,14 +1363,7 @@ def preview(
     """
 
     # Process input data to handle different data source types
-
-    data = _process_connection_string(data)
-
-    # Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
-    data = _process_csv_input(data)
-
-    # Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
-    data = _process_parquet_input(data)
+    data = _process_data(data)
 
     if incl_header is None:
         incl_header = global_config.preview_incl_header
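With the pipeline collapsed into a single `_process_data()` call, `preview()` accepts any of the supported source types directly. A minimal sketch, assuming `pb.preview()` as used elsewhere in the package docs; the GitHub URL is hypothetical:

```python
import pointblank as pb

# A GitHub blob URL is downloaded and loaded before the preview is rendered
pb.preview("https://github.com/user/repo/blob/main/data.csv")

# A local Parquet glob works the same way
pb.preview("data/*.parquet")
```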
@@ -1635,9 +1776,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     Parameters
     ----------
     data
-        The table for which to display the missing values. This could be a DataFrame object
-        Ibis table object
-        supported table types.
+        The table for which to display the missing values. This could be a DataFrame object, an
+        Ibis table object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -1660,6 +1801,10 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `missing_vals_tbl()` with these types of
@@ -1702,6 +1847,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     sector. Many columns have no missing values at all, and those sectors are colored light blue.
     """
 
+    # Process input data to handle different data source types
+    data = _process_data(data)
+
     # Make a copy of the data to avoid modifying the original
     data = copy.deepcopy(data)
 
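A minimal sketch of the new behavior, assuming `missing_vals_tbl()` and `get_data_path()` are exported at the package level: a CSV path can now be handed straight to the function.

```python
import pointblank as pb

# Path to a CSV file bundled with the package
csv_path = pb.get_data_path("global_sales", "csv")

# The path is loaded via _process_data() before the missing-values report is built
pb.missing_vals_tbl(csv_path)
```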
@@ -2164,14 +2312,15 @@ def get_column_count(data: FrameT | Any) -> int:
 
     The `get_column_count()` function returns the number of columns in a table. The function works
     with any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis
-    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.).
+    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.). It also supports
+    direct input of CSV files, Parquet files, and database connection strings.
 
     Parameters
     ----------
     data
-        The table for which to get the column count, which could be a DataFrame object
-        table object
-        table types.
+        The table for which to get the column count, which could be a DataFrame object, an Ibis
+        table object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -2194,12 +2343,39 @@ def get_column_count(data: FrameT | Any) -> int:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `get_column_count()` with these types of
     tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a
     Polars or Pandas DataFrame, the availability of Ibis is not needed.
 
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
+
+    GitHub URLs pointing to CSV or Parquet files are automatically detected and converted to raw
+    content URLs for downloading. The URL format should be:
+    `https://github.com/user/repo/blob/branch/path/file.csv` or
+    `https://github.com/user/repo/blob/branch/path/file.parquet`
+
+    Connection strings follow database URL formats and must also specify a table using the
+    `::table_name` suffix. Examples include:
+
+    ```
+    "duckdb:///path/to/database.ddb::table_name"
+    "sqlite:///path/to/database.db::table_name"
+    "postgresql://user:password@localhost:5432/database::table_name"
+    "mysql://user:password@localhost:3306/database::table_name"
+    "bigquery://project/dataset::table_name"
+    "snowflake://user:password@account/database/schema::table_name"
+    ```
+
+    When using connection strings, the Ibis library with the appropriate backend driver is required.
+
     Examples
     --------
     To get the number of columns in a table, we can use the `get_column_count()` function. Here's an
@@ -2224,9 +2400,63 @@ def get_column_count(data: FrameT | Any) -> int:
     pb.get_column_count(small_table_duckdb)
     ```
 
+    #### Working with CSV Files
+
+    The `get_column_count()` function can directly accept CSV file paths:
+
+    ```{python}
+    # Get a path to a CSV file from the package data
+    csv_path = pb.get_data_path("global_sales", "csv")
+
+    pb.get_column_count(csv_path)
+    ```
+
+    #### Working with Parquet Files
+
+    The function supports various Parquet input formats:
+
+    ```{python}
+    # Single Parquet file from package data
+    parquet_path = pb.get_data_path("nycflights", "parquet")
+
+    pb.get_column_count(parquet_path)
+    ```
+
+    You can also use glob patterns and directories:
+
+    ```python
+    # Multiple Parquet files with glob patterns
+    pb.get_column_count("data/sales_*.parquet")
+
+    # Directory containing Parquet files
+    pb.get_column_count("parquet_data/")
+
+    # Partitioned Parquet dataset
+    pb.get_column_count("sales_data/")  # Auto-discovers partition columns
+    ```
+
+    #### Working with Database Connection Strings
+
+    The function supports database connection strings for direct access to database tables:
+
+    ```{python}
+    # Get path to a DuckDB database file from package data
+    duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+    pb.get_column_count(f"duckdb:///{duckdb_path}::game_revenue")
+    ```
+
     The function always returns the number of columns in the table as an integer value, which is
     `8` for the `small_table` dataset.
     """
+    from pathlib import Path
+
+    # Process different input types
+    if isinstance(data, str) or isinstance(data, Path):
+        data = _process_data(data)
+    elif isinstance(data, list):
+        # Handle list of file paths (likely Parquet files)
+        data = _process_parquet_input(data)
 
     if "ibis.expr.types.relations.Table" in str(type(data)):
         return len(data.columns)
@@ -2250,14 +2480,15 @@ def get_row_count(data: FrameT | Any) -> int:
 
     The `get_row_count()` function returns the number of rows in a table. The function works with
     any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis
-    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.).
+    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.). It also supports
+    direct input of CSV files, Parquet files, and database connection strings.
 
     Parameters
     ----------
     data
-        The table for which to get the row count, which could be a DataFrame object
-        object
-        types.
+        The table for which to get the row count, which could be a DataFrame object, an Ibis table
+        object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -2280,12 +2511,40 @@ def get_row_count(data: FrameT | Any) -> int:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - GitHub URLs (direct links to CSV or Parquet files on GitHub)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `get_row_count()` with these types of
     tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a
     Polars or Pandas DataFrame, the availability of Ibis is not needed.
 
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
+
+    GitHub URLs pointing to CSV or Parquet files are automatically detected and converted to raw
+    content URLs for downloading. The URL format should be:
+    `https://github.com/user/repo/blob/branch/path/file.csv` or
+    `https://github.com/user/repo/blob/branch/path/file.parquet`
+
+    Connection strings follow database URL formats and must also specify a table using the
+    `::table_name` suffix. Examples include:
+
+    ```
+    "duckdb:///path/to/database.ddb::table_name"
+    "sqlite:///path/to/database.db::table_name"
+    "postgresql://user:password@localhost:5432/database::table_name"
+    "mysql://user:password@localhost:3306/database::table_name"
+    "bigquery://project/dataset::table_name"
+    "snowflake://user:password@account/database/schema::table_name"
+    ```
+
+    When using connection strings, the Ibis library with the appropriate backend driver is required.
+
     Examples
     --------
     Getting the number of rows in a table is easily done by using the `get_row_count()` function.
@@ -2310,9 +2569,63 @@ def get_row_count(data: FrameT | Any) -> int:
     pb.get_row_count(game_revenue_duckdb)
     ```
 
+    #### Working with CSV Files
+
+    The `get_row_count()` function can directly accept CSV file paths:
+
+    ```{python}
+    # Get a path to a CSV file from the package data
+    csv_path = pb.get_data_path("global_sales", "csv")
+
+    pb.get_row_count(csv_path)
+    ```
+
+    #### Working with Parquet Files
+
+    The function supports various Parquet input formats:
+
+    ```{python}
+    # Single Parquet file from package data
+    parquet_path = pb.get_data_path("nycflights", "parquet")
+
+    pb.get_row_count(parquet_path)
+    ```
+
+    You can also use glob patterns and directories:
+
+    ```python
+    # Multiple Parquet files with glob patterns
+    pb.get_row_count("data/sales_*.parquet")
+
+    # Directory containing Parquet files
+    pb.get_row_count("parquet_data/")
+
+    # Partitioned Parquet dataset
+    pb.get_row_count("sales_data/")  # Auto-discovers partition columns
+    ```
+
+    #### Working with Database Connection Strings
+
+    The function supports database connection strings for direct access to database tables:
+
+    ```{python}
+    # Get path to a DuckDB database file from package data
+    duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+    pb.get_row_count(f"duckdb:///{duckdb_path}::game_revenue")
+    ```
+
     The function always returns the number of rows in the table as an integer value, which is `2000`
     for the `game_revenue` dataset.
     """
+    from pathlib import Path
+
+    # Process different input types
+    if isinstance(data, str) or isinstance(data, Path):
+        data = _process_data(data)
+    elif isinstance(data, list):
+        # Handle list of file paths (likely Parquet files)
+        data = _process_parquet_input(data)
 
     if "ibis.expr.types.relations.Table" in str(type(data)):
         # Determine whether Pandas or Polars is available to get the row count
@@ -2717,13 +3030,15 @@ class Validate:
     ----------
     data
         The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
-        file path, a Parquet file path,
-
-
-
-
-
-
+        file path, a Parquet file path, a GitHub URL pointing to a CSV or Parquet file, or a
+        database connection string. When providing a CSV or Parquet file path (as a string or
+        `pathlib.Path` object), the file will be automatically loaded using an available DataFrame
+        library (Polars or Pandas). Parquet input also supports glob patterns, directories
+        containing .parquet files, and Spark-style partitioned datasets. GitHub URLs are
+        automatically transformed to raw content URLs and downloaded. Connection strings enable
+        direct database access via Ibis with optional table specification using the `::table_name`
+        suffix. Read the *Supported Input Table Types* section for details on the supported table
+        types.
     tbl_name
         An optional name to assign to the input table object. If no value is provided, a name will
         be generated based on whatever information is available. This table name will be displayed
@@ -3243,14 +3558,8 @@
     locale: str | None = None
 
     def __post_init__(self):
-        #
-        self.data =
-
-        # Handle CSV file input for the data parameter
-        self.data = _process_csv_input(self.data)
-
-        # Handle Parquet file input for the data parameter
-        self.data = _process_parquet_input(self.data)
+        # Process data through the centralized data processing pipeline
+        self.data = _process_data(self.data)
 
         # Check input of the `thresholds=` argument
         _check_thresholds(thresholds=self.thresholds)
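Taken together with the docstring update above, `Validate` can now be pointed at a Parquet glob, a GitHub URL, or a connection string without a manual loading step. A minimal sketch, assuming the `Validate` API as documented by the package (the glob pattern and column name are hypothetical):

```python
import pointblank as pb

validation = (
    pb.Validate(data="data/sales_*.parquet", tbl_name="sales")  # loaded via _process_data()
    .col_vals_not_null(columns="customer_id")                   # hypothetical column
    .interrogate()
)
```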
{pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pointblank
-Version: 0.11.0
+Version: 0.11.1
 Summary: Find out if your data is what you think it is.
 Author-email: Richard Iannone <riannone@me.com>
 License: MIT License
@@ -249,31 +249,37 @@ Pointblank includes a powerful CLI utility called `pb` that lets you run data va
 # Get a quick preview of your data
 pb preview small_table
 
-#
-pb
+# Preview data from GitHub URLs
+pb preview "https://github.com/user/repo/blob/main/data.csv"
 
-#
-pb
+# Check for missing values in Parquet files
+pb missing data.parquet
+
+# Generate column summaries from database connections
+pb scan "duckdb:///data/sales.ddb::customers"
 ```
 
 **Run Essential Validations**
 
 ```bash
 # Check for duplicate rows
-pb validate
+pb validate small_table --check rows-distinct
+
+# Validate data directly from GitHub
+pb validate "https://github.com/user/repo/blob/main/sales.csv" --check col-vals-not-null --column customer_id
 
-# Verify no null values
-pb validate
+# Verify no null values in Parquet datasets
+pb validate "data/*.parquet" --check col-vals-not-null --column a
 
 # Extract failing data for debugging
-pb validate
+pb validate small_table --check col-vals-gt --column a --value 5 --show-extract
 ```
 
 **Integrate with CI/CD**
 
 ```bash
 # Use exit codes for automation (0 = pass, 1 = fail)
-pb validate
+pb validate small_table --check rows-distinct && echo "✅ Quality checks passed"
 ```
 
 Learn more in our [CLI documentation](https://posit-dev.github.io/pointblank/user-guide/cli.html).
{pointblank-0.11.0.dist-info → pointblank-0.11.1.dist-info}/RECORD
CHANGED
@@ -9,18 +9,18 @@ pointblank/_utils.py,sha256=ttgYKKfufsUAiEBFfmWcejLz8hm6ff88DK_rDzk7VtE,28430
 pointblank/_utils_check_args.py,sha256=rFEc1nbCN8ftsQQWVjCNWmQ2QmUDxkfgmoJclrZeTLs,5489
 pointblank/_utils_html.py,sha256=uJWvS9JwQVEZgwsGmScA_u_EBRND75rzUvnJPalbRVs,3731
 pointblank/actions.py,sha256=D6o9B2_ES9PNQg9HZwREacrrt-3A5bhdrBkL1UXz__s,18281
-pointblank/assistant.py,sha256=
-pointblank/cli.py,sha256=
+pointblank/assistant.py,sha256=YsQ9U1wacVIuYFRIJ4maBbBDTzEQPzirhUUPgySosM4,15428
+pointblank/cli.py,sha256=aS1auedTJFk7SMPy5hfItkV5_9olUEE1CCRkAel4Thk,156930
 pointblank/column.py,sha256=_FJjpjv760D1p6YGgqbwmKYktouG7AJ2A9uIMYQBTYA,76560
-pointblank/compare.py,sha256=
-pointblank/datascan.py,sha256=
+pointblank/compare.py,sha256=kFd18CehHz7g-2MF1kSmJSdOoAP80q_9PaF6QzHC1ds,866
+pointblank/datascan.py,sha256=nmTcRLW8nAZfvRS_Nf00Wgx4oUX-o6WFOZqLDbedbu8,24563
 pointblank/draft.py,sha256=cusr4fBiNncCKIOU8UwvJcvkBeBuUnqH_UfYp9dtNss,15777
 pointblank/scan_profile.py,sha256=lZU5hlnzznDATNn9W3gNdyuFm05WDP8y1RjDJEcE5zg,10426
 pointblank/scan_profile_stats.py,sha256=qdzoGXB-zi2hmpA4mTz6LLTqMnb-NRG9ndxU9cxS72w,4461
 pointblank/schema.py,sha256=d93omncsV2lVbatM_QUFeCfCFA42WPZcgO_kE-ktjfU,45107
 pointblank/tf.py,sha256=8o_8m4i01teulEe3-YYMotSNf3tImjBMInsvdjSAO5Q,8844
 pointblank/thresholds.py,sha256=mybeLzTVdmN04NLKoV-jiSBXsWknwHO0Gox0ttVN_MU,25766
-pointblank/validate.py,sha256=
+pointblank/validate.py,sha256=AHy0WfNYyHV8fM3D8XHnuNPP1A1VGwrt6R9fWpwwY5Q,680283
 pointblank/data/api-docs.txt,sha256=_mKEb3zuI6TR0bPNkpr5Y-GUtbB3Qv5WESR7MFuL06I,506515
 pointblank/data/game_revenue-duckdb.zip,sha256=tKIVx48OGLYGsQPS3h5AjA2Nyq_rfEpLCjBiFUWhagU,35880
 pointblank/data/game_revenue.zip,sha256=7c9EvHLyi93CHUd4p3dM4CZ-GucFCtXKSPxgLojL32U,33749
@@ -31,9 +31,9 @@ pointblank/data/nycflights.zip,sha256=yVjbUaKUz2LydSdF9cABuir0VReHBBgV7shiNWSd0m
 pointblank/data/polars-api-docs.txt,sha256=KGcS-BOtUs9zgpkWfXD-GFdFh4O_zjdkpX7msHjztLg,198045
 pointblank/data/small_table-duckdb.zip,sha256=BhTaZ2CRS4-9Z1uVhOU6HggvW3XCar7etMznfENIcOc,2028
 pointblank/data/small_table.zip,sha256=lmFb90Nb-v5X559Ikjg31YLAXuRyMkD9yLRElkXPMzQ,472
-pointblank-0.11.
-pointblank-0.11.
-pointblank-0.11.
-pointblank-0.11.
-pointblank-0.11.
-pointblank-0.11.
+pointblank-0.11.1.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
+pointblank-0.11.1.dist-info/METADATA,sha256=ii-Rawr1JWm6WLqTm5zGNpxtbQbwpRSFzaAbuUa6PFQ,16609
+pointblank-0.11.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pointblank-0.11.1.dist-info/entry_points.txt,sha256=GqqqOTOH8uZe22wLcvYjzpizqk_j4MNcUo2YM14ryCw,42
+pointblank-0.11.1.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
+pointblank-0.11.1.dist-info/RECORD,,