pointblank 0.13.4__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. pointblank/__init__.py +4 -0
  2. pointblank/_constants.py +117 -0
  3. pointblank/_constants_translations.py +487 -2
  4. pointblank/_interrogation.py +1065 -12
  5. pointblank/_spec_utils.py +1015 -0
  6. pointblank/_utils.py +17 -7
  7. pointblank/_utils_ai.py +875 -0
  8. pointblank/assistant.py +1 -1
  9. pointblank/cli.py +128 -115
  10. pointblank/column.py +1 -1
  11. pointblank/data/api-docs.txt +1838 -130
  12. pointblank/data/validations/README.md +108 -0
  13. pointblank/data/validations/complex_preprocessing.json +54 -0
  14. pointblank/data/validations/complex_preprocessing.pkl +0 -0
  15. pointblank/data/validations/generate_test_files.py +127 -0
  16. pointblank/data/validations/multiple_steps.json +83 -0
  17. pointblank/data/validations/multiple_steps.pkl +0 -0
  18. pointblank/data/validations/narwhals_function.json +28 -0
  19. pointblank/data/validations/narwhals_function.pkl +0 -0
  20. pointblank/data/validations/no_preprocessing.json +83 -0
  21. pointblank/data/validations/no_preprocessing.pkl +0 -0
  22. pointblank/data/validations/pandas_compatible.json +28 -0
  23. pointblank/data/validations/pandas_compatible.pkl +0 -0
  24. pointblank/data/validations/preprocessing_functions.py +46 -0
  25. pointblank/data/validations/simple_preprocessing.json +57 -0
  26. pointblank/data/validations/simple_preprocessing.pkl +0 -0
  27. pointblank/datascan.py +4 -4
  28. pointblank/draft.py +52 -3
  29. pointblank/scan_profile.py +6 -6
  30. pointblank/schema.py +8 -82
  31. pointblank/thresholds.py +1 -1
  32. pointblank/validate.py +3069 -437
  33. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/METADATA +67 -8
  34. pointblank-0.15.0.dist-info/RECORD +56 -0
  35. pointblank-0.13.4.dist-info/RECORD +0 -39
  36. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/WHEEL +0 -0
  37. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/entry_points.txt +0 -0
  38. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/licenses/LICENSE +0 -0
  39. {pointblank-0.13.4.dist-info → pointblank-0.15.0.dist-info}/top_level.txt +0 -0
pointblank/draft.py CHANGED
@@ -38,10 +38,15 @@ class DraftValidation:
         The data to be used for drafting a validation plan.
     model
         The model to be used. This should be in the form of `provider:model` (e.g.,
-        `"anthropic:claude-3-5-sonnet-latest"`). Supported providers are `"anthropic"`, `"openai"`,
+        `"anthropic:claude-sonnet-4-5"`). Supported providers are `"anthropic"`, `"openai"`,
         `"ollama"`, and `"bedrock"`.
     api_key
         The API key to be used for the model.
+    verify_ssl
+        Whether to verify SSL certificates when making requests to the LLM provider. Set to `False`
+        to disable SSL verification (e.g., when behind a corporate firewall with self-signed
+        certificates). Defaults to `True`. Use with caution as disabling SSL verification can pose
+        security risks.
 
     Returns
     -------
@@ -83,6 +88,33 @@ class DraftValidation:
     There's no need to have the `python-dotenv` package installed when using `.env` files in this
     way.
 
+    Notes on SSL Certificate Verification
+    --------------------------------------
+    By default, SSL certificate verification is enabled for all requests to LLM providers. However,
+    in certain network environments (such as corporate networks with self-signed certificates or
+    firewall proxies), you may encounter SSL certificate verification errors.
+
+    To disable SSL verification, set the `verify_ssl` parameter to `False`:
+
+    ```python
+    import pointblank as pb
+
+    data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
+
+    # Disable SSL verification for networks with self-signed certificates
+    pb.DraftValidation(
+        data=data,
+        model="anthropic:claude-sonnet-4-5",
+        verify_ssl=False
+    )
+    ```
+
+    :::{.callout-warning}
+    Disabling SSL verification (through `verify_ssl=False`) can expose your API keys and data to
+    man-in-the-middle attacks. Only use this option in trusted network environments and when
+    absolutely necessary.
+    :::
+
     Notes on Data Sent to the Model Provider
     ----------------------------------------
     The data sent to the model provider is a JSON summary of the table. This data summary is
@@ -109,7 +141,7 @@ class DraftValidation:
     Let's look at how the `DraftValidation` class can be used to draft a validation plan for a
     table. The table to be used is `"nycflights"`, which is available here via the
     [`load_dataset()`](`pointblank.load_dataset`) function. The model to be used is
-    `"anthropic:claude-3-5-sonnet-latest"` (which performs very well compared to other LLMs). The
+    `"anthropic:claude-sonnet-4-5"` (which performs very well compared to other LLMs). The
     example assumes that the API key is stored in an `.env` file as `ANTHROPIC_API_KEY`.
 
 
@@ -119,7 +151,7 @@ class DraftValidation:
     data = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
 
     # Draft a validation plan for the "nycflights" table
-    pb.DraftValidation(data=data, model="anthropic:claude-3-5-sonnet-latest")
+    pb.DraftValidation(data=data, model="anthropic:claude-sonnet-4-5")
     ```
 
     The output will be a drafted validation plan for the `"nycflights"` table and this will appear
@@ -194,6 +226,7 @@ class DraftValidation:
     data: FrameT | Any
     model: str
     api_key: str | None = None
+    verify_ssl: bool = True
    response: str = field(init=False)
 
     def __post_init__(self):
@@ -280,6 +313,18 @@ class DraftValidation:
                 " per line)"
             )
 
+        # Create httpx client with SSL verification settings
+        # This will be passed to the LLM provider's chat client
+        try:
+            import httpx  # noqa
+        except ImportError:  # pragma: no cover
+            raise ImportError(  # pragma: no cover
+                "The `httpx` package is required for SSL configuration. "
+                "Please install it using `pip install httpx`."
+            )
+
+        http_client = httpx.AsyncClient(verify=self.verify_ssl)
+
         if provider == "anthropic":  # pragma: no cover
             # Check that the anthropic package is installed
             try:
@@ -296,6 +341,7 @@ class DraftValidation:
                 model=model_name,
                 system_prompt="You are a terse assistant and a Python expert.",
                 api_key=self.api_key,
+                kwargs={"http_client": http_client},
             )
 
         if provider == "openai":  # pragma: no cover
@@ -314,6 +360,7 @@ class DraftValidation:
                 model=model_name,
                 system_prompt="You are a terse assistant and a Python expert.",
                 api_key=self.api_key,
+                kwargs={"http_client": http_client},
             )
 
         if provider == "ollama":  # pragma: no cover
@@ -331,6 +378,7 @@ class DraftValidation:
             chat = ChatOllama(  # pragma: no cover
                 model=model_name,
                 system_prompt="You are a terse assistant and a Python expert.",
+                kwargs={"http_client": http_client},
             )
 
         if provider == "bedrock":  # pragma: no cover
@@ -339,6 +387,7 @@ class DraftValidation:
             chat = ChatBedrockAnthropic(  # pragma: no cover
                 model=model_name,
                 system_prompt="You are a terse assistant and a Python expert.",
+                kwargs={"http_client": http_client},
             )
 
         self.response = str(chat.chat(prompt, stream=False, echo="none"))  # pragma: no cover
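Taken together, the `draft.py` hunks above add one `verify_ssl` switch and thread a single `httpx` client through every provider branch. The following is a minimal standalone sketch of that wiring, not the package's code: it assumes the chat classes come from the `chatlas` package (the imports are not shown in this diff) and that the `kwargs` dict is forwarded to the provider SDK's client constructor, as the Anthropic branch above suggests.

```python
import httpx
from chatlas import ChatAnthropic  # assumed import; not shown in the diff

# The verify flag controls TLS certificate checks on the underlying HTTP client...
verify_ssl = False  # e.g. behind a corporate proxy with self-signed certificates
http_client = httpx.AsyncClient(verify=verify_ssl)

# ...and the client is handed to the chat object unchanged via `kwargs`.
chat = ChatAnthropic(
    model="claude-sonnet-4-5",
    system_prompt="You are a terse assistant and a Python expert.",
    kwargs={"http_client": http_client},
)
```

In `DraftValidation` itself the flag defaults to `True`, so certificate verification stays on unless a caller explicitly opts out.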
pointblank/scan_profile.py CHANGED
@@ -299,12 +299,12 @@ class _DataProfile:  # TODO: feels redundant and weird
         # instantiations that require consistent types.
         all_same_type: bool = all(type(v) is first_type for v in values[1:])
         if not all_same_type:
-            if strict:
-                msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."
-                raise TypeError(msg)
-            for d in cols:
-                if key in d:
-                    d[key] = str(d[key])
+            if strict:  # pragma: no cover
+                msg = f"Some types in {key!s} stat are different. Turn off `strict` to bypass."  # pragma: no cover
+                raise TypeError(msg)  # pragma: no cover
+            for d in cols:  # pragma: no cover
+                if key in d:  # pragma: no cover
+                    d[key] = str(d[key])  # pragma: no cover
 
         return nw.from_dict(transpose_dicts(cols), backend=self.implementation)
 
pointblank/schema.py CHANGED
@@ -343,15 +343,15 @@ class Schema:
             schema_dict = {k: str(v) for k, v in schema_dict.items()}
             self.columns = list(schema_dict.items())
 
-        elif table_type == "pyspark":
+        elif table_type == "pyspark":  # pragma: no cover
             # Convert PySpark DataFrame to Narwhals to get schema
-            nw_df = nw.from_native(self.tbl)
-            if _is_lazy_frame(data=nw_df):
-                schema_dict = dict(nw_df.collect_schema())
-            else:
-                schema_dict = dict(nw_df.schema.items())
-            schema_dict = {k: str(v) for k, v in schema_dict.items()}
-            self.columns = list(schema_dict.items())
+            nw_df = nw.from_native(self.tbl)  # pragma: no cover
+            if _is_lazy_frame(data=nw_df):  # pragma: no cover
+                schema_dict = dict(nw_df.collect_schema())  # pragma: no cover
+            else:  # pragma: no cover
+                schema_dict = dict(nw_df.schema.items())  # pragma: no cover
+            schema_dict = {k: str(v) for k, v in schema_dict.items()}  # pragma: no cover
+            self.columns = list(schema_dict.items())  # pragma: no cover
 
         elif table_type in IBIS_BACKENDS:
             schema_dict = dict(self.tbl.schema().items())
@@ -888,80 +888,6 @@ def _schema_info_generate_params_dict(
     }
 
 
-def _check_schema_match(
-    data_tbl: any,
-    schema: Schema,
-    complete: bool = True,
-    in_order: bool = True,
-    case_sensitive_colnames: bool = True,
-    case_sensitive_dtypes: bool = True,
-    full_match_dtypes: bool = True,
-) -> bool:
-    """
-    Check if the schema matches the target table.
-
-    This function performs schema validation and returns a boolean result.
-
-    Parameters
-    ----------
-    data_tbl
-        The target table to validate.
-    schema
-        The expected schema.
-    complete
-        Whether the schema should be complete.
-    in_order
-        Whether the schema should be in order.
-    case_sensitive_colnames
-        Whether column names are case-sensitive.
-    case_sensitive_dtypes
-        Whether data types are case-sensitive.
-    full_match_dtypes
-        Whether data types must match exactly.
-
-    Returns
-    -------
-    bool
-        True if the schema matches, False otherwise.
-    """
-    validation_info = _get_schema_validation_info(
-        data_tbl=data_tbl,
-        schema=schema,
-        passed=False,  # This will be determined by the logic below
-        complete=complete,
-        in_order=in_order,
-        case_sensitive_colnames=case_sensitive_colnames,
-        case_sensitive_dtypes=case_sensitive_dtypes,
-        full_match_dtypes=full_match_dtypes,
-    )
-
-    # Determine if the schema validation passed based on the validation info
-    passed = True
-
-    # Check completeness requirement
-    if complete and not validation_info["columns_full_set"]:
-        passed = False
-
-    # Check order requirement
-    if in_order and not validation_info["columns_matched_in_order"]:
-        passed = False
-
-    # Check if all expected columns were found
-    if validation_info["columns_not_found"]:
-        passed = False
-
-    # Check column-specific validations
-    for col_info in validation_info["columns"].values():
-        if not col_info["colname_matched"]:
-            passed = False
-        if not col_info.get(
-            "dtype_matched", True
-        ):  # dtype_matched may not exist if no dtypes specified
-            passed = False
-
-    return passed
-
-
 def _get_schema_validation_info(
     data_tbl: any,
     schema: Schema,
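With the private `_check_schema_match()` helper removed, the same pass/fail question is answered through the public validation API. Below is a hedged sketch, not the removed code: it assumes the `col_schema_match()` validation step with the same options the helper exposed (`complete`, `in_order`, `case_sensitive_colnames`, `case_sensitive_dtypes`, `full_match_dtypes`), plus the `Schema`, `load_dataset()`, and `all_passed()` calls documented elsewhere in the package; the column types shown are illustrative.

```python
import pointblank as pb

# Illustrative expected schema; the dtype strings depend on the backend in use
schema = pb.Schema(columns=[("a", "Int64"), ("b", "String")])

validation = (
    pb.Validate(data=pb.load_dataset(dataset="small_table"))
    .col_schema_match(
        schema=schema,
        complete=False,  # don't require every table column to be listed
        in_order=False,  # don't require the listed columns to appear in order
        case_sensitive_colnames=True,
        case_sensitive_dtypes=True,
        full_match_dtypes=True,
    )
    .interrogate()
)

# Boolean result comparable to what the removed helper returned
schema_matches = validation.all_passed()
```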
pointblank/thresholds.py CHANGED
@@ -559,7 +559,7 @@ class FinalActions:
559
559
  def send_alert():
560
560
  summary = pb.get_validation_summary()
561
561
  if summary["highest_severity"] == "critical":
562
- print(f"ALERT: Critical validation failures found in {summary['table_name']}")
562
+ print(f"ALERT: Critical validation failures found in {summary['tbl_name']}")
563
563
 
564
564
  validation = (
565
565
  pb.Validate(