pointblank 0.11.6__py3-none-any.whl → 0.12.1__py3-none-any.whl
- pointblank/__init__.py +2 -0
- pointblank/_constants.py +0 -1
- pointblank/_interrogation.py +244 -606
- pointblank/_utils.py +65 -3
- pointblank/assistant.py +9 -0
- pointblank/cli.py +39 -24
- pointblank/data/api-docs.txt +658 -29
- pointblank/schema.py +17 -0
- pointblank/segments.py +163 -0
- pointblank/validate.py +344 -92
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/METADATA +59 -6
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/RECORD +16 -15
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/WHEEL +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.6.dist-info → pointblank-0.12.1.dist-info}/top_level.txt +0 -0
pointblank/_utils.py
CHANGED

@@ -10,7 +10,7 @@ from great_tables import GT
 from great_tables.gt import _get_column_of_values
 from narwhals.typing import FrameT
 
-from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES
+from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES, IBIS_BACKENDS
 
 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -66,11 +66,13 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     except Exception as e:
         raise TypeError("The `data` object is not a DataFrame or Ibis Table.") from e
 
-    # Detect through regex if the table is a polars or
+    # Detect through regex if the table is a polars, pandas, or Spark DataFrame
     if re.search(r"polars", df_ns_str, re.IGNORECASE):
         return "polars"
     elif re.search(r"pandas", df_ns_str, re.IGNORECASE):
         return "pandas"
+    elif re.search(r"pyspark", df_ns_str, re.IGNORECASE):
+        return "pyspark"
 
     # If ibis is present, then get the table's backend name
     ibis_present = _is_lib_present(lib_name="ibis")
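The dispatch above is nothing more than a case-insensitive regular-expression match against the string form of the table's namespace. A minimal standalone sketch of the same idea, shown with Pandas (the variable names are illustrative, not pointblank internals):

import re

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})

# String form of the object's type, e.g. "<class 'pandas.core.frame.DataFrame'>"
df_ns_str = str(type(df))

# The same style of case-insensitive match used in _get_tbl_type()
if re.search(r"polars", df_ns_str, re.IGNORECASE):
    tbl_type = "polars"
elif re.search(r"pandas", df_ns_str, re.IGNORECASE):
    tbl_type = "pandas"
elif re.search(r"pyspark", df_ns_str, re.IGNORECASE):
    tbl_type = "pyspark"
else:
    tbl_type = "unknown"

print(tbl_type)  # "pandas"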
@@ -108,6 +110,41 @@ def _get_tbl_type(data: FrameT | Any) -> str:
     return "unknown"  # pragma: no cover
 
 
+def _process_ibis_through_narwhals(data: FrameT | Any, tbl_type: str) -> tuple[FrameT | Any, str]:
+    """
+    Process Ibis tables through Narwhals to unify the processing pathway.
+
+    This function takes an Ibis table and wraps it with Narwhals, allowing
+    all downstream processing to use the unified Narwhals API instead of
+    Ibis-specific code paths.
+
+    Parameters
+    ----------
+    data : FrameT | Any
+        The data table, potentially an Ibis table
+    tbl_type : str
+        The detected table type
+
+    Returns
+    -------
+    tuple[FrameT | Any, str]
+        A tuple of (processed_data, updated_tbl_type) where:
+        - processed_data is the Narwhals-wrapped table if it was Ibis, otherwise original data
+        - updated_tbl_type is "narwhals" if it was Ibis, otherwise original tbl_type
+    """
+    # Check if this is an Ibis table type
+    if tbl_type in IBIS_BACKENDS:
+        try:
+            # Wrap with Narwhals
+            narwhals_wrapped = nw.from_native(data)
+            return narwhals_wrapped, "narwhals"
+        except Exception:
+            # If Narwhals can't handle it, fall back to original approach
+            return data, tbl_type
+
+    return data, tbl_type
+
+
 def _is_narwhals_table(data: any) -> bool:
     # Check if the data is a Narwhals DataFrame
     type_str = str(type(data)).lower()
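The new `_process_ibis_through_narwhals()` helper leans on `nw.from_native()` accepting backend-native tables directly. A minimal sketch of that wrapping step, shown here with Polars so it runs without a database backend (the pattern is the same when the input is an Ibis table):

import narwhals as nw
import polars as pl

native = pl.DataFrame({"a": [1, 2, 3]})

# Wrap the native table; downstream code can use the unified Narwhals API
# without knowing which backend produced the table.
wrapped = nw.from_native(native)
print(wrapped.columns)  # ['a']

# The original native object is still recoverable when needed
print(type(nw.to_native(wrapped)))  # <class 'polars.dataframe.frame.DataFrame'>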
@@ -164,7 +201,7 @@ def _check_any_df_lib(method_used: str) -> None:
 def _is_value_a_df(value: Any) -> bool:
     try:
         ns = nw.get_native_namespace(value)
-        if "polars" in str(ns) or "pandas" in str(ns):
+        if "polars" in str(ns) or "pandas" in str(ns) or "pyspark" in str(ns):
             return True
         else:  # pragma: no cover
             return False
@@ -619,6 +656,10 @@ def _get_api_text() -> str:
         "expr_col",
     ]
 
+    segments_exported = [
+        "seg_group",
+    ]
+
     interrogation_exported = [
         "Validate.interrogate",
         "Validate.get_tabular_report",
@@ -648,6 +689,12 @@ def _get_api_text() -> str:
         "assistant",
         "load_dataset",
         "get_data_path",
+        "connect_to_table",
+    ]
+
+    yaml_exported = [
+        "yaml_interrogate",
+        "validate_yaml",
     ]
 
     utility_exported = [
@@ -679,6 +726,10 @@ many steps). Furthermore, the `col()` function can be used to declare a comparis
 for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
 for comparison."""
 
+    segments_desc = (
+        """Combine multiple values into a single segment using `seg_*()` helper functions."""
+    )
+
     interrogation_desc = """The validation plan is put into action when `interrogate()` is called.
 The workflow for performing a comprehensive validation is then: (1) `Validate()`, (2) adding
 validation steps, (3) `interrogate()`. After interrogation of the data, we can view a validation
@@ -694,6 +745,11 @@ datasets included in the package can be accessed via the `load_dataset()` functi
 `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
 the `assistant()` function to get help with Pointblank."""
 
+    yaml_desc = """The *YAML* group contains functions that allow for the use of YAML to orchestrate
+validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow from
+YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
+passes its own validity checks."""
+
     utility_desc = """The Utility Functions group contains functions that are useful for accessing
 metadata about the target data. Use `get_column_count()` or `get_row_count()` to get the number of
 columns or rows in a table. The `get_action_metadata()` function is useful when building custom
@@ -718,12 +774,18 @@ table information, and timing details."""
     api_text += f"""\n## The Column Selection family\n\n{column_selection_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=column_selection_exported)
 
+    api_text += f"""\n## The Segments family\n\n{segments_desc}\n\n"""
+    api_text += get_api_details(module=pointblank, exported_list=segments_exported)
+
     api_text += f"""\n## The Interrogation and Reporting family\n\n{interrogation_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=interrogation_exported)
 
     api_text += f"""\n## The Inspection and Assistance family\n\n{inspect_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=inspect_exported)
 
+    api_text += f"""\n## The YAML family\n\n{yaml_desc}\n\n"""
+    api_text += get_api_details(module=pointblank, exported_list=yaml_exported)
+
     api_text += f"""\n## The Utility Functions family\n\n{utility_desc}\n\n"""
     api_text += get_api_details(module=pointblank, exported_list=utility_exported)
 
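The YAML group documented above (`yaml_interrogate()` and `validate_yaml()`) drives a validation workflow from YAML strings or files. A hedged sketch of what a call could look like; the `tbl:`/`steps:` keys below are assumed for illustration and should be checked against the API docs rather than taken from this diff:

import pointblank as pb

# Illustrative YAML config; the exact schema shown here is an assumption
config = """
tbl: small_table
steps:
  - rows_distinct
  - col_vals_gt:
      columns: d
      value: 100
"""

pb.validate_yaml(config)  # check the configuration's own validity
result = pb.yaml_interrogate(config)  # run the workflow described by the YAML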
pointblank/assistant.py
CHANGED

@@ -138,10 +138,15 @@ def assistant(
 
     - Polars DataFrame (`"polars"`)
     - Pandas DataFrame (`"pandas"`)
+    - PySpark table (`"pyspark"`)
     - DuckDB table (`"duckdb"`)*
     - MySQL table (`"mysql"`)*
     - PostgreSQL table (`"postgresql"`)*
     - SQLite table (`"sqlite"`)*
+    - Microsoft SQL Server table (`"mssql"`)*
+    - Snowflake table (`"snowflake"`)*
+    - Databricks table (`"databricks"`)*
+    - BigQuery table (`"bigquery"`)*
     - Parquet table (`"parquet"`)*
     - CSV files (string path or `pathlib.Path` object with `.csv` extension)
     - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
@@ -152,6 +157,10 @@ def assistant(
     `ibis.expr.types.relations.Table`). Furthermore, using `assistant()` with these types of tables
     requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
     Pandas DataFrame, the availability of Ibis is not needed.
+
+    To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
+    provided. The file will be automatically detected and loaded using the best available DataFrame
+    library. The loading preference is Polars first, then Pandas as a fallback.
     """
 
     # Check that the chatlas package is installed
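The "Polars first, then Pandas" preference added to the docstring is a plain import-and-fall-back pattern. A minimal sketch of that behavior; `_read_csv_with_fallback()` is a hypothetical helper for illustration, not pointblank's actual implementation:

from pathlib import Path

def _read_csv_with_fallback(path: str | Path):
    # Prefer Polars when it is installed...
    try:
        import polars as pl

        return pl.read_csv(path)
    except ImportError:
        # ...otherwise fall back to Pandas
        import pandas as pd

        return pd.read_csv(path)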
pointblank/cli.py
CHANGED

@@ -1360,10 +1360,10 @@ def preview(
     For tables with many columns, use these options to control which columns are displayed:
 
     \b
-    - --columns: Specify exact columns (
-    - --col-range: Select column range (
-    - --col-first: Show first N columns (
-    - --col-last: Show last N columns (
+    - --columns: Specify exact columns (--columns "name,age,email")
+    - --col-range: Select column range (--col-range "1:10", --col-range "5:", --col-range ":15")
+    - --col-first: Show first N columns (--col-first 5)
+    - --col-last: Show last N columns (--col-last 3)
 
     Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
     """
@@ -1920,31 +1920,43 @@ def validate(
 
     AVAILABLE CHECK_TYPES:
 
-
-
-    The default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows.
+    Require no additional options:
 
     \b
     - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
     - rows-complete: Check if all rows are complete (no missing values in any column)
-
-
-
-
-    - col-
-    - col-vals-
-
+
+    Require --column:
+
+    \b
+    - col-exists: Check if a specific column exists in the dataset
+    - col-vals-not-null: Check if all values in a column are not null/missing
+
+    Require --column and --value:
+
+    \b
+    - col-vals-gt: Check if column values are greater than a fixed value
+    - col-vals-ge: Check if column values are greater than or equal to a fixed value
+    - col-vals-lt: Check if column values are less than a fixed value
+    - col-vals-le: Check if column values are less than or equal to a fixed value
+
+    Require --column and --set:
+
+    \b
+    - col-vals-in-set: Check if column values are in an allowed set
+
+    Use --list-checks to see all available validation methods with examples. The default CHECK_TYPE
+    is 'rows-distinct' which checks for duplicate rows.
 
     Examples:
 
     \b
-    pb validate data.csv
-    pb validate data.csv --list-checks
+    pb validate data.csv                 # Uses default validation (rows-distinct)
+    pb validate data.csv --list-checks   # Show all available checks
     pb validate data.csv --check rows-distinct
     pb validate data.csv --check rows-distinct --show-extract
     pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
     pb validate data.csv --check rows-distinct --exit-code
-    pb validate data.csv --check rows-complete
     pb validate data.csv --check col-exists --column price
     pb validate data.csv --check col-vals-not-null --column email
     pb validate data.csv --check col-vals-gt --column score --value 50
@@ -1952,7 +1964,6 @@ def validate(
 
     Multiple validations in one command:
     pb validate data.csv --check rows-distinct --check rows-complete
-    pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
     """
     try:
         import sys
@@ -4627,36 +4638,40 @@ def pl(
     pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
     pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"
 
+    \b
     # Multi-line with editor (supports multiple statements)
     pb pl --edit
 
+    \b
     # Multi-statement code example in editor:
     # csv = pl.read_csv('data.csv')
     # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)
 
+    \b
     # Multi-line with a specific editor
     pb pl --edit --editor nano
     pb pl --edit --editor code
     pb pl --edit --editor micro
 
+    \b
     # From file
     pb pl --file query.py
 
-
-
+    \b
+    Piping to other pb commands
+    pb pl "pl.read_csv('data.csv').head(20)" --pipe | pb validate --check rows-distinct
     pb pl --edit --pipe | pb preview --head 10
     pb pl --edit --pipe | pb scan --output-html report.html
     pb pl --edit --pipe | pb missing --output-html missing_report.html
 
-    Use --output-format to change how results are displayed:
-
     \b
+    Use --output-format to change how results are displayed:
     pb pl "pl.read_csv('data.csv')" --output-format scan
     pb pl "pl.read_csv('data.csv')" --output-format missing
     pb pl "pl.read_csv('data.csv')" --output-format info
 
-    Note: For multi-statement code, assign your final result to a variable like
-    '
+    Note: For multi-statement code, assign your final result to a variable like 'result', 'df',
+    'data', or ensure it's the last expression.
     """
     try:
         # Check if Polars is available