pointblank 0.11.6__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/_utils.py CHANGED
@@ -10,7 +10,7 @@ from great_tables import GT
10
10
  from great_tables.gt import _get_column_of_values
11
11
  from narwhals.typing import FrameT
12
12
 
13
- from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES
13
+ from pointblank._constants import ASSERTION_TYPE_METHOD_MAP, GENERAL_COLUMN_TYPES, IBIS_BACKENDS
14
14
 
15
15
  if TYPE_CHECKING:
16
16
  from collections.abc import Mapping
@@ -66,11 +66,13 @@ def _get_tbl_type(data: FrameT | Any) -> str:
66
66
  except Exception as e:
67
67
  raise TypeError("The `data` object is not a DataFrame or Ibis Table.") from e
68
68
 
69
- # Detect through regex if the table is a polars or pandas DataFrame
69
+ # Detect through regex if the table is a polars, pandas, or Spark DataFrame
70
70
  if re.search(r"polars", df_ns_str, re.IGNORECASE):
71
71
  return "polars"
72
72
  elif re.search(r"pandas", df_ns_str, re.IGNORECASE):
73
73
  return "pandas"
74
+ elif re.search(r"pyspark", df_ns_str, re.IGNORECASE):
75
+ return "pyspark"
74
76
 
75
77
  # If ibis is present, then get the table's backend name
76
78
  ibis_present = _is_lib_present(lib_name="ibis")
@@ -108,6 +110,41 @@ def _get_tbl_type(data: FrameT | Any) -> str:
108
110
  return "unknown" # pragma: no cover
109
111
 
110
112
 
113
+ def _process_ibis_through_narwhals(data: FrameT | Any, tbl_type: str) -> tuple[FrameT | Any, str]:
114
+ """
115
+ Process Ibis tables through Narwhals to unify the processing pathway.
116
+
117
+ This function takes an Ibis table and wraps it with Narwhals, allowing
118
+ all downstream processing to use the unified Narwhals API instead of
119
+ Ibis-specific code paths.
120
+
121
+ Parameters
122
+ ----------
123
+ data : FrameT | Any
124
+ The data table, potentially an Ibis table
125
+ tbl_type : str
126
+ The detected table type
127
+
128
+ Returns
129
+ -------
130
+ tuple[FrameT | Any, str]
131
+ A tuple of (processed_data, updated_tbl_type) where:
132
+ - processed_data is the Narwhals-wrapped table if it was Ibis, otherwise original data
133
+ - updated_tbl_type is "narwhals" if it was Ibis, otherwise original tbl_type
134
+ """
135
+ # Check if this is an Ibis table type
136
+ if tbl_type in IBIS_BACKENDS:
137
+ try:
138
+ # Wrap with Narwhals
139
+ narwhals_wrapped = nw.from_native(data)
140
+ return narwhals_wrapped, "narwhals"
141
+ except Exception:
142
+ # If Narwhals can't handle it, fall back to original approach
143
+ return data, tbl_type
144
+
145
+ return data, tbl_type
146
+
147
+
111
148
  def _is_narwhals_table(data: any) -> bool:
112
149
  # Check if the data is a Narwhals DataFrame
113
150
  type_str = str(type(data)).lower()
@@ -164,7 +201,7 @@ def _check_any_df_lib(method_used: str) -> None:
164
201
  def _is_value_a_df(value: Any) -> bool:
165
202
  try:
166
203
  ns = nw.get_native_namespace(value)
167
- if "polars" in str(ns) or "pandas" in str(ns):
204
+ if "polars" in str(ns) or "pandas" in str(ns) or "pyspark" in str(ns):
168
205
  return True
169
206
  else: # pragma: no cover
170
207
  return False
@@ -619,6 +656,10 @@ def _get_api_text() -> str:
619
656
  "expr_col",
620
657
  ]
621
658
 
659
+ segments_exported = [
660
+ "seg_group",
661
+ ]
662
+
622
663
  interrogation_exported = [
623
664
  "Validate.interrogate",
624
665
  "Validate.get_tabular_report",
@@ -648,6 +689,12 @@ def _get_api_text() -> str:
648
689
  "assistant",
649
690
  "load_dataset",
650
691
  "get_data_path",
692
+ "connect_to_table",
693
+ ]
694
+
695
+ yaml_exported = [
696
+ "yaml_interrogate",
697
+ "validate_yaml",
651
698
  ]
652
699
 
653
700
  utility_exported = [
@@ -679,6 +726,10 @@ many steps). Furthermore, the `col()` function can be used to declare a comparis
679
726
  for the `value=` argument in many `col_vals_*()` methods) when you can't use a fixed value
680
727
  for comparison."""
681
728
 
729
+ segments_desc = (
730
+ """Combine multiple values into a single segment using `seg_*()` helper functions."""
731
+ )
732
+
682
733
  interrogation_desc = """The validation plan is put into action when `interrogate()` is called.
683
734
  The workflow for performing a comprehensive validation is then: (1) `Validate()`, (2) adding
684
735
  validation steps, (3) `interrogate()`. After interrogation of the data, we can view a validation
@@ -694,6 +745,11 @@ datasets included in the package can be accessed via the `load_dataset()` functi
694
745
  `config()` utility lets us set global configuration parameters. Want to chat with an assistant? Use
695
746
  the `assistant()` function to get help with Pointblank."""
696
747
 
748
+ yaml_desc = """The *YAML* group contains functions that allow for the use of YAML to orchestrate
749
+ validation workflows. The `yaml_interrogate()` function can be used to run a validation workflow from
750
+ YAML strings or files. The `validate_yaml()` function checks if the YAML configuration
751
+ passes its own validity checks."""
752
+
697
753
  utility_desc = """The Utility Functions group contains functions that are useful for accessing
698
754
  metadata about the target data. Use `get_column_count()` or `get_row_count()` to get the number of
699
755
  columns or rows in a table. The `get_action_metadata()` function is useful when building custom
@@ -718,12 +774,18 @@ table information, and timing details."""
718
774
  api_text += f"""\n## The Column Selection family\n\n{column_selection_desc}\n\n"""
719
775
  api_text += get_api_details(module=pointblank, exported_list=column_selection_exported)
720
776
 
777
+ api_text += f"""\n## The Segments family\n\n{segments_desc}\n\n"""
778
+ api_text += get_api_details(module=pointblank, exported_list=segments_exported)
779
+
721
780
  api_text += f"""\n## The Interrogation and Reporting family\n\n{interrogation_desc}\n\n"""
722
781
  api_text += get_api_details(module=pointblank, exported_list=interrogation_exported)
723
782
 
724
783
  api_text += f"""\n## The Inspection and Assistance family\n\n{inspect_desc}\n\n"""
725
784
  api_text += get_api_details(module=pointblank, exported_list=inspect_exported)
726
785
 
786
+ api_text += f"""\n## The YAML family\n\n{yaml_desc}\n\n"""
787
+ api_text += get_api_details(module=pointblank, exported_list=yaml_exported)
788
+
727
789
  api_text += f"""\n## The Utility Functions family\n\n{utility_desc}\n\n"""
728
790
  api_text += get_api_details(module=pointblank, exported_list=utility_exported)
729
791
 
pointblank/assistant.py CHANGED
@@ -138,10 +138,15 @@ def assistant(
138
138
 
139
139
  - Polars DataFrame (`"polars"`)
140
140
  - Pandas DataFrame (`"pandas"`)
141
+ - PySpark table (`"pyspark"`)
141
142
  - DuckDB table (`"duckdb"`)*
142
143
  - MySQL table (`"mysql"`)*
143
144
  - PostgreSQL table (`"postgresql"`)*
144
145
  - SQLite table (`"sqlite"`)*
146
+ - Microsoft SQL Server table (`"mssql"`)*
147
+ - Snowflake table (`"snowflake"`)*
148
+ - Databricks table (`"databricks"`)*
149
+ - BigQuery table (`"bigquery"`)*
145
150
  - Parquet table (`"parquet"`)*
146
151
  - CSV files (string path or `pathlib.Path` object with `.csv` extension)
147
152
  - Parquet files (string path, `pathlib.Path` object, glob pattern, directory with `.parquet`
@@ -152,6 +157,10 @@ def assistant(
152
157
  `ibis.expr.types.relations.Table`). Furthermore, using `assistant()` with these types of tables
153
158
  requires the Ibis library (`v9.5.0` or above) to be installed. If the input table is a Polars or
154
159
  Pandas DataFrame, the availability of Ibis is not needed.
160
+
161
+ To use a CSV file, ensure that a string or `pathlib.Path` object with a `.csv` extension is
162
+ provided. The file will be automatically detected and loaded using the best available DataFrame
163
+ library. The loading preference is Polars first, then Pandas as a fallback.
155
164
  """
156
165
 
157
166
  # Check that the chatlas package is installed
pointblank/cli.py CHANGED
@@ -1360,10 +1360,10 @@ def preview(
1360
1360
  For tables with many columns, use these options to control which columns are displayed:
1361
1361
 
1362
1362
  \b
1363
- - --columns: Specify exact columns (e.g., --columns "name,age,email")
1364
- - --col-range: Select column range (e.g., --col-range "1:10", --col-range "5:", --col-range ":15")
1365
- - --col-first: Show first N columns (e.g., --col-first 5)
1366
- - --col-last: Show last N columns (e.g., --col-last 3)
1363
+ - --columns: Specify exact columns (--columns "name,age,email")
1364
+ - --col-range: Select column range (--col-range "1:10", --col-range "5:", --col-range ":15")
1365
+ - --col-first: Show first N columns (--col-first 5)
1366
+ - --col-last: Show last N columns (--col-last 3)
1367
1367
 
1368
1368
  Tables with >15 columns automatically show first 7 and last 7 columns with indicators.
1369
1369
  """
@@ -1920,31 +1920,43 @@ def validate(
1920
1920
 
1921
1921
  AVAILABLE CHECK_TYPES:
1922
1922
 
1923
- Use --list-checks to see all available validation methods with examples.
1924
-
1925
- The default CHECK_TYPE is 'rows-distinct' which checks for duplicate rows.
1923
+ Require no additional options:
1926
1924
 
1927
1925
  \b
1928
1926
  - rows-distinct: Check if all rows in the dataset are unique (no duplicates)
1929
1927
  - rows-complete: Check if all rows are complete (no missing values in any column)
1930
- - col-exists: Check if a specific column exists in the dataset (requires --column)
1931
- - col-vals-not-null: Check if all values in a column are not null/missing (requires --column)
1932
- - col-vals-gt: Check if all values in a column are greater than a comparison value (requires --column and --value)
1933
- - col-vals-ge: Check if all values in a column are greater than or equal to a comparison value (requires --column and --value)
1934
- - col-vals-lt: Check if all values in a column are less than a comparison value (requires --column and --value)
1935
- - col-vals-le: Check if all values in a column are less than or equal to a comparison value (requires --column and --value)
1936
- - col-vals-in-set: Check if all values in a column are in an allowed set (requires --column and --set)
1928
+
1929
+ Require --column:
1930
+
1931
+ \b
1932
+ - col-exists: Check if a specific column exists in the dataset
1933
+ - col-vals-not-null: Check if all values in a column are not null/missing
1934
+
1935
+ Require --column and --value:
1936
+
1937
+ \b
1938
+ - col-vals-gt: Check if column values are greater than a fixed value
1939
+ - col-vals-ge: Check if column values are greater than or equal to a fixed value
1940
+ - col-vals-lt: Check if column values are less than a fixed value
1941
+ - col-vals-le: Check if column values are less than or equal to a fixed value
1942
+
1943
+ Require --column and --set:
1944
+
1945
+ \b
1946
+ - col-vals-in-set: Check if column values are in an allowed set
1947
+
1948
+ Use --list-checks to see all available validation methods with examples. The default CHECK_TYPE
1949
+ is 'rows-distinct' which checks for duplicate rows.
1937
1950
 
1938
1951
  Examples:
1939
1952
 
1940
1953
  \b
1941
- pb validate data.csv # Uses default validation (rows-distinct)
1942
- pb validate data.csv --list-checks # Show all available checks
1954
+ pb validate data.csv # Uses default validation (rows-distinct)
1955
+ pb validate data.csv --list-checks # Show all available checks
1943
1956
  pb validate data.csv --check rows-distinct
1944
1957
  pb validate data.csv --check rows-distinct --show-extract
1945
1958
  pb validate data.csv --check rows-distinct --write-extract failing_rows_folder
1946
1959
  pb validate data.csv --check rows-distinct --exit-code
1947
- pb validate data.csv --check rows-complete
1948
1960
  pb validate data.csv --check col-exists --column price
1949
1961
  pb validate data.csv --check col-vals-not-null --column email
1950
1962
  pb validate data.csv --check col-vals-gt --column score --value 50
@@ -1952,7 +1964,6 @@ def validate(
1952
1964
 
1953
1965
  Multiple validations in one command:
1954
1966
  pb validate data.csv --check rows-distinct --check rows-complete
1955
- pb validate data.csv --check col-vals-not-null --column email --check col-vals-gt --column age --value 18
1956
1967
  """
1957
1968
  try:
1958
1969
  import sys
@@ -4627,36 +4638,40 @@ def pl(
4627
4638
  pb pl "pl.read_csv('data.csv').select(['name', 'age'])"
4628
4639
  pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)"
4629
4640
 
4641
+ \b
4630
4642
  # Multi-line with editor (supports multiple statements)
4631
4643
  pb pl --edit
4632
4644
 
4645
+ \b
4633
4646
  # Multi-statement code example in editor:
4634
4647
  # csv = pl.read_csv('data.csv')
4635
4648
  # result = csv.select(['name', 'age']).filter(pl.col('age') > 25)
4636
4649
 
4650
+ \b
4637
4651
  # Multi-line with a specific editor
4638
4652
  pb pl --edit --editor nano
4639
4653
  pb pl --edit --editor code
4640
4654
  pb pl --edit --editor micro
4641
4655
 
4656
+ \b
4642
4657
  # From file
4643
4658
  pb pl --file query.py
4644
4659
 
4645
- # Piping to other pb commands
4646
- pb pl "pl.read_csv('data.csv').filter(pl.col('age') > 25)" --pipe | pb validate --check rows-distinct
4660
+ \b
4661
+ Piping to other pb commands
4662
+ pb pl "pl.read_csv('data.csv').head(20)" --pipe | pb validate --check rows-distinct
4647
4663
  pb pl --edit --pipe | pb preview --head 10
4648
4664
  pb pl --edit --pipe | pb scan --output-html report.html
4649
4665
  pb pl --edit --pipe | pb missing --output-html missing_report.html
4650
4666
 
4651
- Use --output-format to change how results are displayed:
4652
-
4653
4667
  \b
4668
+ Use --output-format to change how results are displayed:
4654
4669
  pb pl "pl.read_csv('data.csv')" --output-format scan
4655
4670
  pb pl "pl.read_csv('data.csv')" --output-format missing
4656
4671
  pb pl "pl.read_csv('data.csv')" --output-format info
4657
4672
 
4658
- Note: For multi-statement code, assign your final result to a variable like
4659
- 'result', 'df', 'data', or ensure it's the last expression.
4673
+ Note: For multi-statement code, assign your final result to a variable like 'result', 'df',
4674
+ 'data', or ensure it's the last expression.
4660
4675
  """
4661
4676
  try:
4662
4677
  # Check if Polars is available