pointblank 0.11.0__py3-none-any.whl → 0.11.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/compare.py CHANGED
@@ -10,6 +10,15 @@ if TYPE_CHECKING:
 
 class Compare:
     def __init__(self, a: IntoFrame, b: IntoFrame) -> None:
+        # Import processing functions from validate module
+        from pointblank.validate import _process_data
+
+        # Process input data for table a
+        a = _process_data(a)
+
+        # Process input data for table b
+        b = _process_data(b)
+
         self.a: IntoFrame = a
         self.b: IntoFrame = b
 
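The practical effect of this change is that both comparison inputs now accept the same source types as `Validate`. A minimal sketch, assuming `Compare` is imported from `pointblank.compare`; the two CSV paths are hypothetical placeholders:

```python
# Sketch only: with _process_data() applied in __init__, file paths (and the other
# supported source types) can be passed directly instead of pre-loaded DataFrames.
from pointblank.compare import Compare

comparison = Compare(a="data/orders_v1.csv", b="data/orders_v2.csv")
```
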
pointblank/datascan.py CHANGED
@@ -56,7 +56,9 @@ class DataScan:
     Parameters
     ----------
     data
-        The data to scan and summarize.
+        The data to scan and summarize. This could be a DataFrame object, an Ibis table object,
+        a CSV file path, a Parquet file path, a GitHub URL pointing to a CSV or Parquet file,
+        or a database connection string.
     tbl_name
         Optionally, the name of the table could be provided as `tbl_name`.
 
@@ -122,6 +124,14 @@ class DataScan:
 
     # TODO: This needs to be generically typed at the class level, ie. DataScan[T]
     def __init__(self, data: IntoFrameT, tbl_name: str | None = None) -> None:
+        # Import processing functions from validate module
+        from pointblank.validate import (
+            _process_data,
+        )
+
+        # Process input data to handle different data source types
+        data = _process_data(data)
+
         as_native = nw.from_native(data)
 
         if as_native.implementation.name == "IBIS" and as_native._level == "lazy":
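A minimal usage sketch of the updated constructor, assuming `DataScan` is re-exported at the package root; the DuckDB connection string mirrors the CLI example later in this diff and points at a hypothetical database file:

```python
import pointblank as pb

# The constructor now routes its input through _process_data(), so a database
# connection string (or a CSV/Parquet path, or a GitHub URL) is accepted directly.
scan = pb.DataScan("duckdb:///data/sales.ddb::customers", tbl_name="customers")
```
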
@@ -514,8 +524,9 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     Parameters
     ----------
     data
-        The table to summarize, which could be a DataFrame object or an Ibis table object. Read the
-        *Supported Input Table Types* section for details on the supported table types.
+        The table to summarize, which could be a DataFrame object, an Ibis table object, a CSV
+        file path, a Parquet file path, or a database connection string. Read the *Supported Input
+        Table Types* section for details on the supported table types.
     tbl_name
         Optionally, the name of the table could be provided as `tbl_name=`.
 
@@ -535,6 +546,11 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     - PostgreSQL table (`"postgresql"`)*
     - SQLite table (`"sqlite"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - GitHub URLs (direct links to CSV or Parquet files on GitHub)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `col_summary_tbl()` with these types of
@@ -566,5 +582,11 @@ def col_summary_tbl(data: FrameT | Any, tbl_name: str | None = None) -> GT:
     ```
     """
 
+    # Import processing functions from validate module
+    from pointblank.validate import _process_data
+
+    # Process input data to handle different data source types
+    data = _process_data(data)
+
     scanner = DataScan(data=data, tbl_name=tbl_name)
     return scanner.get_tabular_report()
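Since `col_summary_tbl()` now calls `_process_data()` before constructing the `DataScan`, file paths work end to end. A small sketch using the package's sample-data helper shown in other docstrings in this diff, and assuming `col_summary_tbl` is re-exported at the package root:

```python
import pointblank as pb

# Resolve a bundled CSV file, then summarize it directly from its path
csv_path = pb.get_data_path("global_sales", "csv")
pb.col_summary_tbl(csv_path, tbl_name="global_sales")
```
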
pointblank/validate.py CHANGED
@@ -735,9 +735,157 @@ def get_data_path(
         return tmp_file.name
 
 
-# =============================================================================
-# Utility functions for processing input data (shared by preview() and Validate class)
-# =============================================================================
+def _process_data(data: FrameT | Any) -> FrameT | Any:
+    """
+    Centralized data processing pipeline that handles all supported input types.
+
+    This function consolidates the data processing pipeline used across multiple
+    classes and functions in Pointblank. It processes data through a consistent
+    sequence of transformations to handle different data source types.
+
+    The processing order is important:
+
+    1. GitHub URLs (must come before connection string processing)
+    2. Database connection strings
+    3. CSV file paths
+    4. Parquet file paths
+
+    Parameters
+    ----------
+    data : FrameT | Any
+        The input data which could be:
+        - a DataFrame object (Polars, Pandas, Ibis, etc.)
+        - a GitHub URL pointing to a CSV or Parquet file
+        - a database connection string (e.g., "duckdb:///path/to/file.ddb::table_name")
+        - a CSV file path (string or Path object with .csv extension)
+        - a Parquet file path, glob pattern, directory, or partitioned dataset
+        - any other data type (returned unchanged)
+
+    Returns
+    -------
+    FrameT | Any
+        Processed data as a DataFrame if input was a supported data source type,
+        otherwise the original data unchanged.
+    """
+    # Handle GitHub URL input (e.g., "https://github.com/user/repo/blob/main/data.csv")
+    data = _process_github_url(data)
+
+    # Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
+    data = _process_connection_string(data)
+
+    # Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
+    data = _process_csv_input(data)
+
+    # Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
+    data = _process_parquet_input(data)
+
+    return data
+
+
+def _process_github_url(data: FrameT | Any) -> FrameT | Any:
+    """
+    Process data parameter to handle GitHub URLs pointing to CSV or Parquet files.
+
+    Handles both standard GitHub URLs and raw GitHub content URLs, downloading the content
+    and processing it as a local file.
+
+    Supports:
+    - Standard github.com URLs pointing to CSV or Parquet files (automatically transformed to raw URLs)
+    - Raw raw.githubusercontent.com URLs pointing to CSV or Parquet files (processed directly)
+    - Both CSV and Parquet file formats
+    - Automatic temporary file management and cleanup
+
+    Parameters
+    ----------
+    data : FrameT | Any
+        The data parameter which may be a GitHub URL string or any other data type.
+
+    Returns
+    -------
+    FrameT | Any
+        If the input is a supported GitHub URL, returns a DataFrame loaded from the downloaded file.
+        Otherwise, returns the original data unchanged.
+
+    Examples
+    --------
+    Standard GitHub URL (automatically transformed):
+    >>> url = "https://github.com/user/repo/blob/main/data.csv"
+    >>> df = _process_github_url(url)
+
+    Raw GitHub URL (used directly):
+    >>> raw_url = "https://raw.githubusercontent.com/user/repo/main/data.csv"
+    >>> df = _process_github_url(raw_url)
+    """
+    import re
+    import tempfile
+    from urllib.parse import urlparse
+    from urllib.request import urlopen
+
+    # Check if data is a string that looks like a GitHub URL
+    if not isinstance(data, str):
+        return data
+
+    # Parse the URL to check if it's a GitHub URL
+    try:
+        parsed = urlparse(data)
+    except Exception:
+        return data
+
+    # Check if it's a GitHub URL (standard or raw)
+    is_standard_github = parsed.netloc in ["github.com", "www.github.com"]
+    is_raw_github = parsed.netloc == "raw.githubusercontent.com"
+
+    if not (is_standard_github or is_raw_github):
+        return data
+
+    # Check if it points to a CSV or Parquet file
+    path_lower = parsed.path.lower()
+    if not (path_lower.endswith(".csv") or path_lower.endswith(".parquet")):
+        return data
+
+    # Determine the raw URL to download from
+    if is_raw_github:
+        # Already a raw GitHub URL, use it directly
+        raw_url = data
+    else:
+        # Transform GitHub URL to raw content URL
+        # Pattern: https://github.com/user/repo/blob/branch/path/file.ext
+        # Becomes: https://raw.githubusercontent.com/user/repo/branch/path/file.ext
+        github_pattern = r"github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)"
+        match = re.search(github_pattern, data)
+
+        if not match:
+            # If URL doesn't match expected GitHub blob pattern, return original data
+            return data
+
+        user, repo, branch, file_path = match.groups()
+        raw_url = f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{file_path}"
+
+    # Download the file content to a temporary file
+    try:
+        with urlopen(raw_url) as response:
+            content = response.read()
+
+        # Determine file extension
+        file_ext = ".csv" if path_lower.endswith(".csv") else ".parquet"
+
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile(mode="wb", suffix=file_ext, delete=False) as tmp_file:
+            tmp_file.write(content)
+            tmp_file_path = tmp_file.name
+
+        # Process the temporary file using existing CSV or Parquet processing functions
+        if file_ext == ".csv":
+            return _process_csv_input(tmp_file_path)
+        else:  # .parquet
+            return _process_parquet_input(tmp_file_path)
+
+    except Exception:
+        # If download or processing fails, return original data
+        return data
+
+    except Exception as e:
+        raise RuntimeError(f"Failed to download or process GitHub file from {raw_url}: {e}") from e
 
 
 def _process_connection_string(data: FrameT | Any) -> FrameT | Any:
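The heart of `_process_github_url()` is the blob-to-raw URL rewrite. This standalone sketch isolates that step using the same regex as the hunk above (the example URL is a placeholder), which can help when reasoning about which URLs will be accepted:

```python
import re


def to_raw_github_url(url: str) -> str:
    # Rewrite https://github.com/user/repo/blob/branch/path/file.ext
    # to      https://raw.githubusercontent.com/user/repo/branch/path/file.ext
    pattern = r"github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)"
    match = re.search(pattern, url)
    if not match:
        return url  # not a recognizable blob URL; leave unchanged
    user, repo, branch, path = match.groups()
    return f"https://raw.githubusercontent.com/{user}/{repo}/{branch}/{path}"


print(to_raw_github_url("https://github.com/user/repo/blob/main/data.csv"))
# https://raw.githubusercontent.com/user/repo/main/data.csv
```
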
@@ -1215,14 +1363,7 @@ def preview(
     """
 
     # Process input data to handle different data source types
-    # Handle connection string input (e.g., "duckdb:///path/to/file.ddb::table_name")
-    data = _process_connection_string(data)
-
-    # Handle CSV file input (e.g., "data.csv" or Path("data.csv"))
-    data = _process_csv_input(data)
-
-    # Handle Parquet file input (e.g., "data.parquet", "data/*.parquet", "data/")
-    data = _process_parquet_input(data)
+    data = _process_data(data)
 
     if incl_header is None:
         incl_header = global_config.preview_incl_header
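With `preview()` delegating to `_process_data()`, a GitHub-hosted file can be previewed in one call; a minimal sketch (the URL is a placeholder and the call requires network access):

```python
import pointblank as pb

# The URL is rewritten to its raw.githubusercontent.com form, downloaded to a
# temporary file, and loaded with Polars or Pandas before previewing.
pb.preview("https://github.com/user/repo/blob/main/data.csv")
```
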
@@ -1635,9 +1776,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     Parameters
     ----------
     data
-        The table for which to display the missing values. This could be a DataFrame object or an
-        Ibis table object. Read the *Supported Input Table Types* section for details on the
-        supported table types.
+        The table for which to display the missing values. This could be a DataFrame object, an
+        Ibis table object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -1660,6 +1801,10 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `missing_vals_tbl()` with these types of
@@ -1702,6 +1847,9 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     sector. Many columns have no missing values at all, and those sectors are colored light blue.
     """
 
+    # Process input data to handle different data source types
+    data = _process_data(data)
+
     # Make a copy of the data to avoid modifying the original
     data = copy.deepcopy(data)
 
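A corresponding usage sketch for `missing_vals_tbl()`; the Parquet path is a hypothetical local file:

```python
import pointblank as pb

# File paths now flow through _process_data() before the missing-values report is built
pb.missing_vals_tbl("data/orders.parquet")
```
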
@@ -2164,14 +2312,15 @@ def get_column_count(data: FrameT | Any) -> int:
 
     The `get_column_count()` function returns the number of columns in a table. The function works
     with any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis
-    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.).
+    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.). It also supports
+    direct input of CSV files, Parquet files, and database connection strings.
 
     Parameters
     ----------
     data
-        The table for which to get the column count, which could be a DataFrame object or an Ibis
-        table object. Read the *Supported Input Table Types* section for details on the supported
-        table types.
+        The table for which to get the column count, which could be a DataFrame object, an Ibis
+        table object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -2194,12 +2343,39 @@ def get_column_count(data: FrameT | Any) -> int:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `get_column_count()` with these types of
     tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a
     Polars or Pandas DataFrame, the availability of Ibis is not needed.
 
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
+
+    GitHub URLs pointing to CSV or Parquet files are automatically detected and converted to raw
+    content URLs for downloading. The URL format should be:
+    `https://github.com/user/repo/blob/branch/path/file.csv` or
+    `https://github.com/user/repo/blob/branch/path/file.parquet`
+
+    Connection strings follow database URL formats and must also specify a table using the
+    `::table_name` suffix. Examples include:
+
+    ```
+    "duckdb:///path/to/database.ddb::table_name"
+    "sqlite:///path/to/database.db::table_name"
+    "postgresql://user:password@localhost:5432/database::table_name"
+    "mysql://user:password@localhost:3306/database::table_name"
+    "bigquery://project/dataset::table_name"
+    "snowflake://user:password@account/database/schema::table_name"
+    ```
+
+    When using connection strings, the Ibis library with the appropriate backend driver is required.
+
     Examples
     --------
     To get the number of columns in a table, we can use the `get_column_count()` function. Here's an
@@ -2224,9 +2400,63 @@ def get_column_count(data: FrameT | Any) -> int:
     pb.get_column_count(small_table_duckdb)
     ```
 
+    #### Working with CSV Files
+
+    The `get_column_count()` function can directly accept CSV file paths:
+
+    ```{python}
+    # Get a path to a CSV file from the package data
+    csv_path = pb.get_data_path("global_sales", "csv")
+
+    pb.get_column_count(csv_path)
+    ```
+
+    #### Working with Parquet Files
+
+    The function supports various Parquet input formats:
+
+    ```{python}
+    # Single Parquet file from package data
+    parquet_path = pb.get_data_path("nycflights", "parquet")
+
+    pb.get_column_count(parquet_path)
+    ```
+
+    You can also use glob patterns and directories:
+
+    ```python
+    # Multiple Parquet files with glob patterns
+    pb.get_column_count("data/sales_*.parquet")
+
+    # Directory containing Parquet files
+    pb.get_column_count("parquet_data/")
+
+    # Partitioned Parquet dataset
+    pb.get_column_count("sales_data/")  # Auto-discovers partition columns
+    ```
+
+    #### Working with Database Connection Strings
+
+    The function supports database connection strings for direct access to database tables:
+
+    ```{python}
+    # Get path to a DuckDB database file from package data
+    duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+    pb.get_column_count(f"duckdb:///{duckdb_path}::game_revenue")
+    ```
+
     The function always returns the number of columns in the table as an integer value, which is
     `8` for the `small_table` dataset.
     """
+    from pathlib import Path
+
+    # Process different input types
+    if isinstance(data, str) or isinstance(data, Path):
+        data = _process_data(data)
+    elif isinstance(data, list):
+        # Handle list of file paths (likely Parquet files)
+        data = _process_parquet_input(data)
 
     if "ibis.expr.types.relations.Table" in str(type(data)):
         return len(data.columns)
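Note that the new function body also accepts a plain Python list, which is handed to `_process_parquet_input()`; a small sketch with hypothetical file names:

```python
import pointblank as pb

# A list input is treated as a collection of Parquet paths
n_cols = pb.get_column_count(["sales/2024-01.parquet", "sales/2024-02.parquet"])
```
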
@@ -2250,14 +2480,15 @@ def get_row_count(data: FrameT | Any) -> int:
 
     The `get_row_count()` function returns the number of rows in a table. The function works with
     any table that is supported by the `pointblank` library, including Pandas, Polars, and Ibis
-    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.).
+    backend tables (e.g., DuckDB, MySQL, PostgreSQL, SQLite, Parquet, etc.). It also supports
+    direct input of CSV files, Parquet files, and database connection strings.
 
     Parameters
     ----------
     data
-        The table for which to get the row count, which could be a DataFrame object or an Ibis table
-        object. Read the *Supported Input Table Types* section for details on the supported table
-        types.
+        The table for which to get the row count, which could be a DataFrame object, an Ibis table
+        object, a CSV file path, a Parquet file path, or a database connection string.
+        Read the *Supported Input Table Types* section for details on the supported table types.
 
     Returns
     -------
@@ -2280,12 +2511,40 @@ def get_row_count(data: FrameT | Any) -> int:
     - PySpark table (`"pyspark"`)*
     - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
+    - CSV files (string path or `pathlib.Path` object with `.csv` extension)
+    - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
+      extension, or partitioned dataset)
+    - GitHub URLs (direct links to CSV or Parquet files on GitHub)
+    - Database connection strings (URI format with optional table specification)
 
     The table types marked with an asterisk need to be prepared as Ibis tables (with type of
     `ibis.expr.types.relations.Table`). Furthermore, using `get_row_count()` with these types of
     tables requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a
     Polars or Pandas DataFrame, the availability of Ibis is not needed.
 
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
+
+    GitHub URLs pointing to CSV or Parquet files are automatically detected and converted to raw
+    content URLs for downloading. The URL format should be:
+    `https://github.com/user/repo/blob/branch/path/file.csv` or
+    `https://github.com/user/repo/blob/branch/path/file.parquet`
+
+    Connection strings follow database URL formats and must also specify a table using the
+    `::table_name` suffix. Examples include:
+
+    ```
+    "duckdb:///path/to/database.ddb::table_name"
+    "sqlite:///path/to/database.db::table_name"
+    "postgresql://user:password@localhost:5432/database::table_name"
+    "mysql://user:password@localhost:3306/database::table_name"
+    "bigquery://project/dataset::table_name"
+    "snowflake://user:password@account/database/schema::table_name"
+    ```
+
+    When using connection strings, the Ibis library with the appropriate backend driver is required.
+
     Examples
     --------
     Getting the number of rows in a table is easily done by using the `get_row_count()` function.
@@ -2310,9 +2569,63 @@ def get_row_count(data: FrameT | Any) -> int:
     pb.get_row_count(game_revenue_duckdb)
     ```
 
+    #### Working with CSV Files
+
+    The `get_row_count()` function can directly accept CSV file paths:
+
+    ```{python}
+    # Get a path to a CSV file from the package data
+    csv_path = pb.get_data_path("global_sales", "csv")
+
+    pb.get_row_count(csv_path)
+    ```
+
+    #### Working with Parquet Files
+
+    The function supports various Parquet input formats:
+
+    ```{python}
+    # Single Parquet file from package data
+    parquet_path = pb.get_data_path("nycflights", "parquet")
+
+    pb.get_row_count(parquet_path)
+    ```
+
+    You can also use glob patterns and directories:
+
+    ```python
+    # Multiple Parquet files with glob patterns
+    pb.get_row_count("data/sales_*.parquet")
+
+    # Directory containing Parquet files
+    pb.get_row_count("parquet_data/")
+
+    # Partitioned Parquet dataset
+    pb.get_row_count("sales_data/")  # Auto-discovers partition columns
+    ```
+
+    #### Working with Database Connection Strings
+
+    The function supports database connection strings for direct access to database tables:
+
+    ```{python}
+    # Get path to a DuckDB database file from package data
+    duckdb_path = pb.get_data_path("game_revenue", "duckdb")
+
+    pb.get_row_count(f"duckdb:///{duckdb_path}::game_revenue")
+    ```
+
     The function always returns the number of rows in the table as an integer value, which is `2000`
     for the `game_revenue` dataset.
     """
+    from pathlib import Path
+
+    # Process different input types
+    if isinstance(data, str) or isinstance(data, Path):
+        data = _process_data(data)
+    elif isinstance(data, list):
+        # Handle list of file paths (likely Parquet files)
+        data = _process_parquet_input(data)
 
     if "ibis.expr.types.relations.Table" in str(type(data)):
         # Determine whether Pandas or Polars is available to get the row count
@@ -2717,13 +3030,15 @@ class Validate:
     ----------
     data
         The table to validate, which could be a DataFrame object, an Ibis table object, a CSV
-        file path, a Parquet file path, or a database connection string. When providing a CSV or
-        Parquet file path (as a string or `pathlib.Path` object), the file will be automatically
-        loaded using an available DataFrame library (Polars or Pandas). Parquet input also supports
-        glob patterns, directories containing .parquet files, and Spark-style partitioned datasets.
-        Connection strings enable direct database access via Ibis with optional table specification
-        using the `::table_name` suffix. Read the *Supported Input Table Types* section for details
-        on the supported table types.
+        file path, a Parquet file path, a GitHub URL pointing to a CSV or Parquet file, or a
+        database connection string. When providing a CSV or Parquet file path (as a string or
+        `pathlib.Path` object), the file will be automatically loaded using an available DataFrame
+        library (Polars or Pandas). Parquet input also supports glob patterns, directories
+        containing .parquet files, and Spark-style partitioned datasets. GitHub URLs are
+        automatically transformed to raw content URLs and downloaded. Connection strings enable
+        direct database access via Ibis with optional table specification using the `::table_name`
+        suffix. Read the *Supported Input Table Types* section for details on the supported table
+        types.
     tbl_name
         An optional name to assign to the input table object. If no value is provided, a name will
         be generated based on whatever information is available. This table name will be displayed
@@ -3243,14 +3558,8 @@ class Validate:
     locale: str | None = None
 
     def __post_init__(self):
-        # Handle connection string input for the data parameter
-        self.data = _process_connection_string(self.data)
-
-        # Handle CSV file input for the data parameter
-        self.data = _process_csv_input(self.data)
-
-        # Handle Parquet file input for the data parameter
-        self.data = _process_parquet_input(self.data)
+        # Process data through the centralized data processing pipeline
+        self.data = _process_data(self.data)
 
         # Check input of the `thresholds=` argument
         _check_thresholds(thresholds=self.thresholds)
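Taken together, these changes let a validation plan start straight from a GitHub-hosted file. A minimal sketch with a placeholder URL and an illustrative column name; `col-vals-not-null` appears among the CLI checks later in this diff, and the method-chaining pattern follows the library's documented workflow:

```python
import pointblank as pb

# The GitHub URL is downloaded and loaded by _process_data() in __post_init__
validation = (
    pb.Validate(data="https://github.com/user/repo/blob/main/sales.csv")
    .col_vals_not_null(columns="customer_id")
    .interrogate()
)
```
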
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pointblank
-Version: 0.11.0
+Version: 0.11.2
 Summary: Find out if your data is what you think it is.
 Author-email: Richard Iannone <riannone@me.com>
 License: MIT License
@@ -156,11 +156,11 @@ validation
 
 ## Why Choose Pointblank?
 
-- **Works with your existing stack** - Seamlessly integrates with Polars, Pandas, DuckDB, MySQL, PostgreSQL, SQLite, Parquet, PySpark, Snowflake, and more!
-- **Beautiful, interactive reports** - Crystal-clear validation results that highlight issues and help communicate data quality
-- **Composable validation pipeline** - Chain validation steps into a complete data quality workflow
-- **Threshold-based alerts** - Set 'warning', 'error', and 'critical' thresholds with custom actions
-- **Practical outputs** - Use validation results to filter tables, extract problematic data, or trigger downstream processes
+- **Works with your existing stack**: Seamlessly integrates with Polars, Pandas, DuckDB, MySQL, PostgreSQL, SQLite, Parquet, PySpark, Snowflake, and more!
+- **Beautiful, interactive reports**: Crystal-clear validation results that highlight issues and help communicate data quality
+- **Composable validation pipeline**: Chain validation steps into a complete data quality workflow
+- **Threshold-based alerts**: Set 'warning', 'error', and 'critical' thresholds with custom actions
+- **Practical outputs**: Use validation results to filter tables, extract problematic data, or trigger downstream processes
 
 ## Real-World Example
 
@@ -240,7 +240,7 @@ validation.get_step_report(i=3).show("browser") # Get failing records from step
 Pointblank includes a powerful CLI utility called `pb` that lets you run data validation workflows directly from the command line. Perfect for CI/CD pipelines, scheduled data quality checks, or quick validation tasks.
 
 <div align="center">
-  <img src="https://posit-dev.github.io/pointblank/assets/vhs/cli-complete-workflow.gif" width="800px">
+  <img src="https://posit-dev.github.io/pointblank/assets/vhs/cli-complete-workflow.gif" width="100%">
 </div>
 
 **Explore Your Data**
@@ -249,43 +249,47 @@ Pointblank includes a powerful CLI utility called `pb` that lets you run data va
 # Get a quick preview of your data
 pb preview small_table
 
-# Check for missing values
-pb missing small_table
+# Preview data from GitHub URLs
+pb preview "https://github.com/user/repo/blob/main/data.csv"
 
-# Generate column summaries
-pb scan small_table
+# Check for missing values in Parquet files
+pb missing data.parquet
+
+# Generate column summaries from database connections
+pb scan "duckdb:///data/sales.ddb::customers"
 ```
 
 **Run Essential Validations**
 
 ```bash
 # Check for duplicate rows
-pb validate-simple small_table --check rows-distinct
+pb validate small_table --check rows-distinct
+
+# Validate data directly from GitHub
+pb validate "https://github.com/user/repo/blob/main/sales.csv" --check col-vals-not-null --column customer_id
 
-# Verify no null values
-pb validate-simple small_table --check col-vals-not-null --column a
+# Verify no null values in Parquet datasets
+pb validate "data/*.parquet" --check col-vals-not-null --column a
 
 # Extract failing data for debugging
-pb validate-simple small_table --check col-vals-gt --column a --value 5 --show-extract
+pb validate small_table --check col-vals-gt --column a --value 5 --show-extract
 ```
 
 **Integrate with CI/CD**
 
 ```bash
 # Use exit codes for automation (0 = pass, 1 = fail)
-pb validate-simple small_table --check rows-distinct && echo "✅ Quality checks passed"
+pb validate small_table --check rows-distinct --exit-code
 ```
 
-Learn more in our [CLI documentation](https://posit-dev.github.io/pointblank/user-guide/cli.html).
-
 ## Features That Set Pointblank Apart
 
-- **Complete validation workflow** - From data access to validation to reporting in a single pipeline
-- **Built for collaboration** - Share results with colleagues through beautiful interactive reports
-- **Practical outputs** - Get exactly what you need: counts, extracts, summaries, or full reports
-- **Flexible deployment** - Use in notebooks, scripts, or data pipelines
-- **Customizable** - Tailor validation steps and reporting to your specific needs
-- **Internationalization** - Reports can be generated in over 20 languages, including English, Spanish, French, and German
+- **Complete validation workflow**: From data access to validation to reporting in a single pipeline
+- **Built for collaboration**: Share results with colleagues through beautiful interactive reports
+- **Practical outputs**: Get exactly what you need: counts, extracts, summaries, or full reports
+- **Flexible deployment**: Use in notebooks, scripts, or data pipelines
+- **Customizable**: Tailor validation steps and reporting to your specific needs
+- **Internationalization**: Reports can be generated in over 20 languages, including English, Spanish, French, and German
 
 ## Documentation and Examples