pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +4 -0
- pointblank/_constants.py +117 -0
- pointblank/_constants_translations.py +487 -2
- pointblank/_interrogation.py +1065 -12
- pointblank/_spec_utils.py +1015 -0
- pointblank/_utils.py +17 -7
- pointblank/_utils_ai.py +875 -0
- pointblank/assistant.py +1 -1
- pointblank/cli.py +128 -115
- pointblank/column.py +1 -1
- pointblank/data/api-docs.txt +1838 -130
- pointblank/data/validations/README.md +108 -0
- pointblank/data/validations/complex_preprocessing.json +54 -0
- pointblank/data/validations/complex_preprocessing.pkl +0 -0
- pointblank/data/validations/generate_test_files.py +127 -0
- pointblank/data/validations/multiple_steps.json +83 -0
- pointblank/data/validations/multiple_steps.pkl +0 -0
- pointblank/data/validations/narwhals_function.json +28 -0
- pointblank/data/validations/narwhals_function.pkl +0 -0
- pointblank/data/validations/no_preprocessing.json +83 -0
- pointblank/data/validations/no_preprocessing.pkl +0 -0
- pointblank/data/validations/pandas_compatible.json +28 -0
- pointblank/data/validations/pandas_compatible.pkl +0 -0
- pointblank/data/validations/preprocessing_functions.py +46 -0
- pointblank/data/validations/simple_preprocessing.json +57 -0
- pointblank/data/validations/simple_preprocessing.pkl +0 -0
- pointblank/datascan.py +4 -4
- pointblank/draft.py +52 -3
- pointblank/scan_profile.py +6 -6
- pointblank/schema.py +8 -82
- pointblank/thresholds.py +1 -1
- pointblank/validate.py +3069 -437
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
- pointblank-0.15.0.dist-info/RECORD +56 -0
- pointblank-0.13.4.dist-info/RECORD +0 -39
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
pointblank/draft.py
CHANGED
|
@@ -38,10 +38,15 @@ class DraftValidation:
|
|
|
38
38
|
The data to be used for drafting a validation plan.
|
|
39
39
|
model
|
|
40
40
|
The model to be used. This should be in the form of `provider:model` (e.g.,
|
|
41
|
-
`"anthropic:claude-
|
|
41
|
+
`"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
|
|
42
42
|
`"ollama"`, and `"bedrock"`.
|
|
43
43
|
api_key
|
|
44
44
|
The API key to be used for the model.
|
|
45
|
+
verify_ssl
|
|
46
|
+
Whether to verify SSL certificates when making requests to the LLM provider. Set to `False`
|
|
47
|
+
to disable SSL verification (e.g., when behind a corporate firewall with self-signed
|
|
48
|
+
certificates). Defaults to `True`. Use with caution as disabling SSL verification can pose
|
|
49
|
+
security risks.
|
|
45
50
|
|
|
46
51
|
Returns
|
|
47
52
|
-------
|
|
@@ -83,6 +88,33 @@ class DraftValidation:
|
|
|
83
88
|
There's no need to have the `python-dotenv` package installed when using `.env` files in this
|
|
84
89
|
way.
|
|
85
90
|
|
|
91
|
+
Notes on SSL Certificate Verification
|
|
92
|
+
-------------------------------------
|
|
93
|
+
By default, SSL certificate verification is enabled for all requests to LLM providers. However,
|
|
94
|
+
in certain network environments (such as corporate networks with self-signed certificates or
|
|
95
|
+
firewall proxies), you may encounter SSL certificate verification errors.
|
|
96
|
+
|
|
97
|
+
To disable SSL verification, set the `verify_ssl` parameter to `False`:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
import pointblank as pb
|
|
101
|
+
|
|
102
|
+
data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
|
|
103
|
+
|
|
104
|
+
# Disable SSL verification for networks with self-signed certificates
|
|
105
|
+
pb.DraftValidation(
|
|
106
|
+
data=data,
|
|
107
|
+
model="anthropic:claude-sonnet-4-5",
|
|
108
|
+
verify_ssl=False
|
|
109
|
+
)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
:::{.callout-warning}
|
|
113
|
+
Disabling SSL verification (through `verify_ssl=False`) can expose your API keys and data to
|
|
114
|
+
man-in-the-middle attacks. Only use this option in trusted network environments and when
|
|
115
|
+
absolutely necessary.
|
|
116
|
+
:::
|
|
117
|
+
|
|
86
118
|
Notes on Data Sent to the Model Provider
|
|
87
119
|
----------------------------------------
|
|
88
120
|
The data sent to the model provider is a JSON summary of the table. This data summary is
|
|
@@ -109,7 +141,7 @@ class DraftValidation:
|
|
|
109
141
|
Let's look at how the `DraftValidation` class can be used to draft a validation plan for a
|
|
110
142
|
table. The table to be used is `"nycflights"`, which is available here via the
|
|
111
143
|
[`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is
|
|
112
|
-
`"anthropic:claude-
|
|
144
|
+
`"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The
|
|
113
145
|
example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`.
|
|
114
146
|
|
|
115
147
|
```python
|
|
@@ -119,7 +151,7 @@ class DraftValidation:
|
|
|
119
151
|
data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
|
|
120
152
|
|
|
121
153
|
# Draft a validation plan for the "nycflights" table
|
|
122
|
-
pb.DraftValidation(data=data, model="anthropic:claude-
|
|
154
|
+
pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5")
|
|
123
155
|
```
|
|
124
156
|
|
|
125
157
|
The output will be a drafted validation plan for the `"nycflights"` table and this will appear
|
|
@@ -194,6 +226,7 @@ class DraftValidation:
|
|
|
194
226
|
data: FrameT | Any
|
|
195
227
|
model: str
|
|
196
228
|
api_key: str | None = None
|
|
229
|
+
verify_ssl: bool = True
|
|
197
230
|
response: str = field(init=False)
|
|
198
231
|
|
|
199
232
|
def __post_init__(self):
|
|
@@ -280,6 +313,18 @@ class DraftValidation:
|
|
|
280
313
|
" per line)"
|
|
281
314
|
)
|
|
282
315
|
|
|
316
|
+
# Create httpx client with SSL verification settings
|
|
317
|
+
# This will be passed to the LLM provider's chat client
|
|
318
|
+
try:
|
|
319
|
+
import httpx # noqa
|
|
320
|
+
except ImportError: # pragma: no cover
|
|
321
|
+
raise ImportError( # pragma: no cover
|
|
322
|
+
"The `httpx` package is required for SSL configuration. "
|
|
323
|
+
"Please install it using `pip install httpx`."
|
|
324
|
+
)
|
|
325
|
+
|
|
326
|
+
http_client = httpx.AsyncClient(verify=self.verify_ssl)
|
|
327
|
+
|
|
283
328
|
if provider == "anthropic": # pragma: no cover
|
|
284
329
|
# Check that the anthropic package is installed
|
|
285
330
|
try:
|
|
@@ -296,6 +341,7 @@ class DraftValidation:
|
|
|
296
341
|
model=model_name,
|
|
297
342
|
system_prompt="You are a terse assistant and a Python expert.",
|
|
298
343
|
api_key=self.api_key,
|
|
344
|
+
kwargs={"http_client": http_client},
|
|
299
345
|
)
|
|
300
346
|
|
|
301
347
|
if provider == "openai": # pragma: no cover
|
|
@@ -314,6 +360,7 @@ class DraftValidation:
|
|
|
314
360
|
model=model_name,
|
|
315
361
|
system_prompt="You are a terse assistant and a Python expert.",
|
|
316
362
|
api_key=self.api_key,
|
|
363
|
+
kwargs={"http_client": http_client},
|
|
317
364
|
)
|
|
318
365
|
|
|
319
366
|
if provider == "ollama": # pragma: no cover
|
|
@@ -331,6 +378,7 @@ class DraftValidation:
|
|
|
331
378
|
chat = ChatOllama( # pragma: no cover
|
|
332
379
|
model=model_name,
|
|
333
380
|
system_prompt="You are a terse assistant and a Python expert.",
|
|
381
|
+
kwargs={"http_client": http_client},
|
|
334
382
|
)
|
|
335
383
|
|
|
336
384
|
if provider == "bedrock": # pragma: no cover
|
|
@@ -339,6 +387,7 @@ class DraftValidation:
|
|
|
339
387
|
chat = ChatBedrockAnthropic( # pragma: no cover
|
|
340
388
|
model=model_name,
|
|
341
389
|
system_prompt="You are a terse assistant and a Python expert.",
|
|
390
|
+
kwargs={"http_client": http_client},
|
|
342
391
|
)
|
|
343
392
|
|
|
344
393
|
self.response = str(chat.chat(prompt, stream=False, echo="none")) # pragma: no cover
|
pointblank/scan_profile.py
CHANGED
|
@@ -299,12 +299,12 @@ class _DataProfile: # TODO: feels redundant and weird
|
|
|
299
299
|
# instantiations that require consistent types.
|
|
300
300
|
all_same_type: bool = all(type(v) is first_type for v in values[1:])
|
|
301
301
|
if not all_same_type:
|
|
302
|
-
if strict:
|
|
303
|
-
msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."
|
|
304
|
-
raise TypeError(msg)
|
|
305
|
-
for d in cols:
|
|
306
|
-
if key in d:
|
|
307
|
-
d[key] = str(d[key])
|
|
302
|
+
if strict: # pragma: no cover
|
|
303
|
+
msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass." # pragma: no cover
|
|
304
|
+
raise TypeError(msg) # pragma: no cover
|
|
305
|
+
for d in cols: # pragma: no cover
|
|
306
|
+
if key in d: # pragma: no cover
|
|
307
|
+
d[key] = str(d[key]) # pragma: no cover
|
|
308
308
|
|
|
309
309
|
return nw.from_dict(transpose_dicts(cols), backend=self.implementation)
|
|
310
310
|
|
pointblank/schema.py
CHANGED
|
@@ -343,15 +343,15 @@ class Schema:
|
|
|
343
343
|
schema_dict = {k: str(v) for k, v in schema_dict.items()}
|
|
344
344
|
self.columns = list(schema_dict.items())
|
|
345
345
|
|
|
346
|
-
elif table_type == "pyspark":
|
|
346
|
+
elif table_type == "pyspark": # pragma: no cover
|
|
347
347
|
# Convert PySpark DataFrame to Narwhals to get schema
|
|
348
|
-
nw_df = nw.from_native(self.tbl)
|
|
349
|
-
if _is_lazy_frame(data=nw_df):
|
|
350
|
-
schema_dict = dict(nw_df.collect_schema())
|
|
351
|
-
else:
|
|
352
|
-
schema_dict = dict(nw_df.schema.items())
|
|
353
|
-
schema_dict = {k: str(v) for k, v in schema_dict.items()}
|
|
354
|
-
self.columns = list(schema_dict.items())
|
|
348
|
+
nw_df = nw.from_native(self.tbl) # pragma: no cover
|
|
349
|
+
if _is_lazy_frame(data=nw_df): # pragma: no cover
|
|
350
|
+
schema_dict = dict(nw_df.collect_schema()) # pragma: no cover
|
|
351
|
+
else: # pragma: no cover
|
|
352
|
+
schema_dict = dict(nw_df.schema.items()) # pragma: no cover
|
|
353
|
+
schema_dict = {k: str(v) for k, v in schema_dict.items()} # pragma: no cover
|
|
354
|
+
self.columns = list(schema_dict.items()) # pragma: no cover
|
|
355
355
|
|
|
356
356
|
elif table_type in IBIS_BACKENDS:
|
|
357
357
|
schema_dict = dict(self.tbl.schema().items())
|
|
@@ -888,80 +888,6 @@ def _schema_info_generate_params_dict(
|
|
|
888
888
|
}
|
|
889
889
|
|
|
890
890
|
|
|
891
|
-
def _check_schema_match(
|
|
892
|
-
data_tbl: any,
|
|
893
|
-
schema: Schema,
|
|
894
|
-
complete: bool = True,
|
|
895
|
-
in_order: bool = True,
|
|
896
|
-
case_sensitive_colnames: bool = True,
|
|
897
|
-
case_sensitive_dtypes: bool = True,
|
|
898
|
-
full_match_dtypes: bool = True,
|
|
899
|
-
) -> bool:
|
|
900
|
-
"""
|
|
901
|
-
Check if the schema matches the target table.
|
|
902
|
-
|
|
903
|
-
This function performs schema validation and returns a boolean result.
|
|
904
|
-
|
|
905
|
-
Parameters
|
|
906
|
-
----------
|
|
907
|
-
data_tbl
|
|
908
|
-
The target table to validate.
|
|
909
|
-
schema
|
|
910
|
-
The expected schema.
|
|
911
|
-
complete
|
|
912
|
-
Whether the schema should be complete.
|
|
913
|
-
in_order
|
|
914
|
-
Whether the schema should be in order.
|
|
915
|
-
case_sensitive_colnames
|
|
916
|
-
Whether column names are case-sensitive.
|
|
917
|
-
case_sensitive_dtypes
|
|
918
|
-
Whether data types are case-sensitive.
|
|
919
|
-
full_match_dtypes
|
|
920
|
-
Whether data types must match exactly.
|
|
921
|
-
|
|
922
|
-
Returns
|
|
923
|
-
-------
|
|
924
|
-
bool
|
|
925
|
-
True if the schema matches, False otherwise.
|
|
926
|
-
"""
|
|
927
|
-
validation_info = _get_schema_validation_info(
|
|
928
|
-
data_tbl=data_tbl,
|
|
929
|
-
schema=schema,
|
|
930
|
-
passed=False, # This will be determined by the logic below
|
|
931
|
-
complete=complete,
|
|
932
|
-
in_order=in_order,
|
|
933
|
-
case_sensitive_colnames=case_sensitive_colnames,
|
|
934
|
-
case_sensitive_dtypes=case_sensitive_dtypes,
|
|
935
|
-
full_match_dtypes=full_match_dtypes,
|
|
936
|
-
)
|
|
937
|
-
|
|
938
|
-
# Determine if the schema validation passed based on the validation info
|
|
939
|
-
passed = True
|
|
940
|
-
|
|
941
|
-
# Check completeness requirement
|
|
942
|
-
if complete and not validation_info["columns_full_set"]:
|
|
943
|
-
passed = False
|
|
944
|
-
|
|
945
|
-
# Check order requirement
|
|
946
|
-
if in_order and not validation_info["columns_matched_in_order"]:
|
|
947
|
-
passed = False
|
|
948
|
-
|
|
949
|
-
# Check if all expected columns were found
|
|
950
|
-
if validation_info["columns_not_found"]:
|
|
951
|
-
passed = False
|
|
952
|
-
|
|
953
|
-
# Check column-specific validations
|
|
954
|
-
for col_info in validation_info["columns"].values():
|
|
955
|
-
if not col_info["colname_matched"]:
|
|
956
|
-
passed = False
|
|
957
|
-
if not col_info.get(
|
|
958
|
-
"dtype_matched", True
|
|
959
|
-
): # dtype_matched may not exist if no dtypes specified
|
|
960
|
-
passed = False
|
|
961
|
-
|
|
962
|
-
return passed
|
|
963
|
-
|
|
964
|
-
|
|
965
891
|
def _get_schema_validation_info(
|
|
966
892
|
data_tbl: any,
|
|
967
893
|
schema: Schema,
|
pointblank/thresholds.py
CHANGED
|
@@ -559,7 +559,7 @@ class FinalActions:
|
|
|
559
559
|
def send_alert():
|
|
560
560
|
summary = pb.get_validation_summary()
|
|
561
561
|
if summary["highest_severity"] == "critical":
|
|
562
|
-
print(f"ALERT: Critical validation failures found in {summary['
|
|
562
|
+
print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")
|
|
563
563
|
|
|
564
564
|
validation = (
|
|
565
565
|
pb.Validate(
|