pointblank 0.11.5__py3-none-any.whl → 0.11.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/validate.py +3 -1
- pointblank/yaml.py +162 -19
- {pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/METADATA +2 -2
- {pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/RECORD +8 -8
- {pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/WHEEL +0 -0
- {pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/entry_points.txt +0 -0
- {pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/top_level.txt +0 -0
pointblank/validate.py
CHANGED
@@ -11684,7 +11684,9 @@ class Validate:
         # Determine the rows that passed all validation steps by checking if all `pb_is_good_`
         # columns are `True`
         labeled_tbl_nw = (
-            labeled_tbl_nw.with_columns(pb_is_good_all=nw.all_horizontal(pb_is_good_cols))
+            labeled_tbl_nw.with_columns(
+                pb_is_good_all=nw.all_horizontal(pb_is_good_cols, ignore_nulls=True)
+            )
             .join(data_nw, on=index_name, how="left")
             .drop(index_name)
         )
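The only functional change here is the new `ignore_nulls=True` argument to `nw.all_horizontal()`: a null in one `pb_is_good_*` column is now skipped rather than fed into the row-level `pb_is_good_all` flag. A minimal sketch of the behavior (assumes narwhals>=1.45.0, the floor this release sets in METADATA, plus Polars; column values are illustrative):

    import narwhals as nw
    import polars as pl

    # Two per-step result columns; the third row has a null result for step 1.
    tbl = pl.DataFrame(
        {
            "pb_is_good_1": [True, True, None],
            "pb_is_good_2": [True, False, True],
        }
    )

    out = (
        nw.from_native(tbl)
        .with_columns(
            # With ignore_nulls=True, nulls are dropped from the horizontal
            # AND instead of deciding the row's flag.
            pb_is_good_all=nw.all_horizontal(
                ["pb_is_good_1", "pb_is_good_2"], ignore_nulls=True
            )
        )
        .to_native()
    )
    print(out)  # pb_is_good_all: true, false, true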
pointblank/yaml.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Any, Union
 
 import yaml
 
+from pointblank._utils import _is_lib_present
 from pointblank.thresholds import Actions
 from pointblank.validate import Validate, load_dataset
 
@@ -217,6 +218,8 @@ class YAMLValidator:
         "col_count_match": "col_count_match",
         "row_count_match": "row_count_match",
         "col_schema_match": "col_schema_match",
+        "conjointly": "conjointly",
+        "specially": "specially",
     }
 
     def __init__(self):
@@ -345,7 +348,7 @@ class YAMLValidator:
                 f"or list of strings/dictionaries"
             )
 
-    def _load_data_source(self, tbl_spec: str) -> Any:
+    def _load_data_source(self, tbl_spec: str, df_library: str = "polars") -> Any:
         """Load data source based on table specification.
 
         Parameters
@@ -354,6 +357,8 @@ class YAMLValidator:
            Data source specification. Can be (1) a dataset name for `load_dataset()`, (2) a CSV file
            path (relative or absolute), (3) a Parquet file path (relative or absolute), or (4) a
            Python code snippet to be executed for dynamic data loading.
+        df_library
+            DataFrame library to use for loading datasets and CSV files. Options: "polars", "pandas", "duckdb".
 
         Returns
         -------
@@ -374,20 +379,79 @@ class YAMLValidator:
             if processed_tbl_spec is not tbl_spec or not isinstance(processed_tbl_spec, str):
                 return processed_tbl_spec
 
+            # Check if it's a CSV file and handle with specified library
+            if isinstance(processed_tbl_spec, str) and processed_tbl_spec.endswith(".csv"):
+                return self._load_csv_file(processed_tbl_spec, df_library)
+
             # Use the centralized data processing pipeline from validate.py
-            # This handles
+            # This handles Parquet files and other data sources
             processed_data = _process_data(processed_tbl_spec)
 
             # If _process_data returns the original string unchanged,
-            # then it's not a file path, so try load_dataset
+            # then it's not a file path, so try load_dataset with specified library
             if processed_data is processed_tbl_spec and isinstance(processed_tbl_spec, str):
-                return load_dataset(processed_tbl_spec)
+                return load_dataset(processed_tbl_spec, tbl_type=df_library)
             else:
                 return processed_data
 
         except Exception as e:
             raise YAMLValidationError(f"Failed to load data source '{tbl_spec}': {e}")
 
+    def _load_csv_file(self, file_path: str, df_library: str) -> Any:
+        """Load CSV file using the specified DataFrame library.
+
+        Parameters
+        ----------
+        file_path
+            Path to the CSV file.
+        df_library
+            DataFrame library to use: "polars", "pandas", or "duckdb".
+
+        Returns
+        -------
+        Loaded DataFrame object.
+
+        Raises
+        ------
+        YAMLValidationError
+            If CSV file cannot be loaded or library is not available.
+        """
+        import os
+
+        if not os.path.exists(file_path):
+            raise YAMLValidationError(f"CSV file not found: {file_path}")
+
+        try:
+            if df_library == "polars":
+                if not _is_lib_present("polars"):
+                    raise YAMLValidationError("Polars library is not available")
+                import polars as pl
+
+                return pl.read_csv(file_path)
+
+            elif df_library == "pandas":
+                if not _is_lib_present("pandas"):
+                    raise YAMLValidationError("Pandas library is not available")
+                import pandas as pd
+
+                return pd.read_csv(file_path)
+
+            elif df_library == "duckdb":
+                # For DuckDB, we'll use the existing _process_data since it handles DuckDB
+                from pointblank.validate import _process_data
+
+                return _process_data(file_path)
+
+            else:
+                raise YAMLValidationError(
+                    f"Unsupported df_library: {df_library}. Use 'polars', 'pandas', or 'duckdb'"
+                )
+
+        except Exception as e:
+            raise YAMLValidationError(
+                f"Failed to load CSV file '{file_path}' with {df_library}: {e}"
+            )
+
     def _parse_column_spec(self, columns_expr: Any) -> list[str]:
         """Parse column specification from YAML.
 
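Combined with the `df_library:` key read out of the config further down, CSV loading can now be pinned to a specific backend. An illustrative config (the file name `data.csv` is a placeholder) and the call its loading reduces to:

    # Illustrative YAML for the new loader; with df_library: pandas the
    # `tbl:` path is routed through _load_csv_file() and read with
    # pd.read_csv() instead of the default pl.read_csv().
    yaml_config = """
    tbl: data.csv
    df_library: pandas   # new in 0.11.6; defaults to "polars"
    steps:
      - rows_distinct
    """

    # Roughly what _load_csv_file("data.csv", "pandas") amounts to:
    import pandas as pd

    df = pd.read_csv("data.csv")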
@@ -559,6 +623,29 @@ class YAMLValidator:
         if "schema" in parameters and method_name == "col_schema_match":
             parameters["schema"] = self._parse_schema_spec(parameters["schema"])
 
+        # Handle `conjointly()` expressions: convert list to separate positional arguments
+        if method_name == "conjointly" and "expressions" in parameters:
+            expressions = parameters.pop("expressions")  # Remove from parameters
+            if isinstance(expressions, list):
+                # Convert string expressions to lambda functions
+                lambda_expressions = []
+                for expr in expressions:
+                    if isinstance(expr, str):
+                        lambda_expressions.append(_safe_eval_python_code(expr))
+                    else:
+                        lambda_expressions.append(expr)
+                # Pass expressions as positional arguments (stored as special key)
+                parameters["_conjointly_expressions"] = lambda_expressions
+            else:
+                raise YAMLValidationError("conjointly 'expressions' must be a list")
+
+        # Handle `specially()` expr parameter: support shortcut syntax
+        if method_name == "specially" and "expr" in parameters:
+            expr_value = parameters["expr"]
+            if isinstance(expr_value, str):
+                # Treat string directly as Python code (shortcut syntax)
+                parameters["expr"] = _safe_eval_python_code(expr_value)
+
         # Convert `actions=` if present (ensure it's an Actions object)
         if "actions" in parameters:
             if isinstance(parameters["actions"], dict):
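Both newly mapped steps accept their Python logic as YAML strings, which `_safe_eval_python_code()` turns into callables. An illustrative config exercising both paths (`small_table` is a bundled dataset; the column `d` and the lambdas are just examples):

    yaml_config = """
    tbl: small_table
    steps:
      - conjointly:
          expressions:                        # must be a list, else YAMLValidationError
            - "lambda df: df['d'] > 0"
            - "lambda df: df['d'] < 10000"
      - specially:
          expr: "lambda df: df.shape[0] > 0"  # string shortcut, eval'd to a callable
    """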
@@ -583,8 +670,9 @@ class YAMLValidator:
         Validate
             Validate object with configured validation steps.
         """
-        # Load data source
-        data = self._load_data_source(config["tbl"])
+        # Load data source with specified library
+        df_library = config.get("df_library", "polars")
+        data = self._load_data_source(config["tbl"], df_library)
 
         # Create Validate object
         validate_kwargs = {}
@@ -603,7 +691,7 @@ class YAMLValidator:
 
         # Set actions if provided
         if "actions" in config:
-            # Process actions
+            # Process actions: handle `python:` block syntax for callables
             processed_actions = _process_python_expressions(config["actions"])
             # Convert to Actions object
             validate_kwargs["actions"] = Actions(**processed_actions)
@@ -629,8 +717,13 @@ class YAMLValidator:
         # Get the method from the validation object
         method = getattr(validation, method_name)
 
-        # Call the method with parameters
-        validation = method(**parameters)
+        # Special handling for conjointly: pass expressions as positional arguments
+        if method_name == "conjointly" and "_conjointly_expressions" in parameters:
+            expressions = parameters.pop("_conjointly_expressions")
+            validation = method(*expressions, **parameters)
+        else:
+            # Call the method with parameters
+            validation = method(**parameters)
 
         return validation
 
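`_conjointly_expressions` is only internal transport: it is popped before dispatch so `Validate.conjointly()` receives the callables positionally, just as a user would pass them by hand. A sketch of the equivalent direct call (dataset and lambdas illustrative):

    import pointblank as pb

    # Hand-written equivalent of the dispatched call above: conjointly()
    # takes the expressions as positional arguments.
    validation = pb.Validate(data=pb.load_dataset("small_table")).conjointly(
        lambda df: df["d"] > 0,
        lambda df: df["d"] < 10000,
    )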
@@ -1162,20 +1255,21 @@ def yaml_to_python(yaml: Union[str, Path]) -> str:
 
         # Add data loading as first argument
         tbl_spec = config["tbl"]
-        if isinstance(tbl_spec, str):
+        df_library = config.get("df_library", "polars")
+
+        # Use the original Python expression if we extracted it (df_library is ignored in this case)
+        if original_tbl_expression:
+            validate_args.append(f"data={original_tbl_expression}")
+        elif isinstance(tbl_spec, str):
             if tbl_spec.endswith((".csv", ".parquet")):
                 # File loading
-                validate_args.append(f'data=pb.load_dataset("{tbl_spec}")')
+                validate_args.append(f'data=pb.load_dataset("{tbl_spec}", tbl_type="{df_library}")')
             else:
                 # Dataset loading
-                validate_args.append(f'data=pb.load_dataset("{tbl_spec}")')
+                validate_args.append(f'data=pb.load_dataset("{tbl_spec}", tbl_type="{df_library}")')
         else:
-            # Use the original Python expression if we extracted it
-            if original_tbl_expression:
-                validate_args.append(f"data={original_tbl_expression}")
-            else:
-                # Fallback to placeholder if we couldn't extract the original expression
-                validate_args.append("data=<python_expression_result>")
+            # Fallback to placeholder if we couldn't extract the original expression
+            validate_args.append("data=<python_expression_result>")
 
         # Add table name if present
         if "tbl_name" in config:
@@ -1243,16 +1337,65 @@ def yaml_to_python(yaml: Union[str, Path]) -> str:
 
         # Add validation steps as chained method calls
         for step_index, step_config in enumerate(config["steps"]):
+            # Get original expressions before parsing
+            original_expressions = {}
+            step_method = list(step_config.keys())[
+                0
+            ]  # Get the method name (conjointly, specially, etc.)
+            step_params = step_config[step_method]
+
+            if (
+                step_method == "conjointly"
+                and isinstance(step_params, dict)
+                and "expressions" in step_params
+            ):
+                original_expressions["expressions"] = step_params["expressions"]
+
+            if step_method == "specially" and isinstance(step_params, dict) and "expr" in step_params:
+                if isinstance(step_params["expr"], dict) and "python" in step_params["expr"]:
+                    original_expressions["expr"] = step_params["expr"]["python"].strip()
+                elif isinstance(step_params["expr"], str):
+                    original_expressions["expr"] = step_params["expr"]
+
             method_name, parameters = validator._parse_validation_step(step_config)
 
+            # Apply the original expressions to override the converted lambda functions
+            if method_name == "conjointly" and "expressions" in original_expressions:
+                # Remove the internal parameter and add expressions as a proper parameter
+                if "_conjointly_expressions" in parameters:
+                    parameters.pop("_conjointly_expressions")
+                parameters["expressions"] = original_expressions["expressions"]
+
+            if method_name == "specially" and "expr" in original_expressions:
+                parameters["expr"] = original_expressions["expr"]
+
             # Format parameters
             param_parts = []
             for key, value in parameters.items():
                 # Check if we have an original expression for this parameter
                 expression_path = f"steps[{step_index}].{list(step_config.keys())[0]}.{key}"
-                if expression_path in step_expressions:
+
+                # Skip using step_expressions for specially/conjointly parameters that we handle specially
+                if (
+                    expression_path in step_expressions
+                    and not (method_name == "specially" and key == "expr")
+                    and not (method_name == "conjointly" and key == "expressions")
+                ):
                     # Use the original Python expression
                     param_parts.append(f"{key}={step_expressions[expression_path]}")
+                elif key == "expressions" and method_name == "conjointly":
+                    # Handle conjointly expressions list
+                    if isinstance(value, list):
+                        expressions_str = "[" + ", ".join([f'"{expr}"' for expr in value]) + "]"
+                        param_parts.append(f"expressions={expressions_str}")
+                    else:
+                        param_parts.append(f"expressions={value}")
+                elif key == "expr" and method_name == "specially":
+                    # Handle specially expr parameter: should be unquoted lambda expression
+                    if isinstance(value, str):
+                        param_parts.append(f"expr={value}")
+                    else:
+                        param_parts.append(f"expr={value}")
                 elif key in ["columns", "columns_subset"]:
                     if isinstance(value, list):
                         if len(value) == 1:
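Net effect on code generation: for a config like the conjointly/specially example above, `yaml_to_python()` now reproduces the original expression strings instead of lambda reprs, and pins the table type on the data argument. A hedged sketch of the data-loading line it emits (the full generated script also chains the step calls onto this object):

    import pointblank as pb

    # Generated data argument for tbl: small_table with df_library: pandas
    # (previously emitted without tbl_type=):
    validation = pb.Validate(data=pb.load_dataset("small_table", tbl_type="pandas"))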
{pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pointblank
-Version: 0.11.5
+Version: 0.11.6
 Summary: Find out if your data is what you think it is.
 Author-email: Richard Iannone <riannone@me.com>
 License: MIT License

@@ -43,7 +43,7 @@ License-File: LICENSE
 Requires-Dist: commonmark>=0.9.1
 Requires-Dist: importlib-metadata
 Requires-Dist: great_tables>=0.17.0
-Requires-Dist: narwhals>=1.
+Requires-Dist: narwhals>=1.45.0
 Requires-Dist: typing_extensions>=3.10.0.0
 Requires-Dist: requests>=2.31.0
 Requires-Dist: click>=8.0.0
{pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/RECORD
CHANGED

@@ -20,8 +20,8 @@ pointblank/scan_profile_stats.py,sha256=qdzoGXB-zi2hmpA4mTz6LLTqMnb-NRG9ndxU9cxS
 pointblank/schema.py,sha256=d93omncsV2lVbatM_QUFeCfCFA42WPZcgO_kE-ktjfU,45107
 pointblank/tf.py,sha256=8o_8m4i01teulEe3-YYMotSNf3tImjBMInsvdjSAO5Q,8844
 pointblank/thresholds.py,sha256=mybeLzTVdmN04NLKoV-jiSBXsWknwHO0Gox0ttVN_MU,25766
-pointblank/validate.py,sha256=
-pointblank/yaml.py,sha256=
+pointblank/validate.py,sha256=CelY6wwB1JRyii-KWEyKfiazd7mToxAegHG1GMtKIm4,680332
+pointblank/yaml.py,sha256=4DrkOJwCQ3CaXQ7ESNIW72pp-dL1ctlX6ONU30Vh1Fs,57901
 pointblank/data/api-docs.txt,sha256=_mKEb3zuI6TR0bPNkpr5Y-GUtbB3Qv5WESR7MFuL06I,506515
 pointblank/data/game_revenue-duckdb.zip,sha256=tKIVx48OGLYGsQPS3h5AjA2Nyq_rfEpLCjBiFUWhagU,35880
 pointblank/data/game_revenue.zip,sha256=7c9EvHLyi93CHUd4p3dM4CZ-GucFCtXKSPxgLojL32U,33749

@@ -32,9 +32,9 @@ pointblank/data/nycflights.zip,sha256=yVjbUaKUz2LydSdF9cABuir0VReHBBgV7shiNWSd0m
 pointblank/data/polars-api-docs.txt,sha256=KGcS-BOtUs9zgpkWfXD-GFdFh4O_zjdkpX7msHjztLg,198045
 pointblank/data/small_table-duckdb.zip,sha256=BhTaZ2CRS4-9Z1uVhOU6HggvW3XCar7etMznfENIcOc,2028
 pointblank/data/small_table.zip,sha256=lmFb90Nb-v5X559Ikjg31YLAXuRyMkD9yLRElkXPMzQ,472
-pointblank-0.11.5.dist-info/licenses/LICENSE,sha256=
-pointblank-0.11.5.dist-info/METADATA,sha256=
-pointblank-0.11.5.dist-info/WHEEL,sha256=
-pointblank-0.11.5.dist-info/entry_points.txt,sha256=
-pointblank-0.11.5.dist-info/top_level.txt,sha256=
-pointblank-0.11.5.dist-info/RECORD,,
+pointblank-0.11.6.dist-info/licenses/LICENSE,sha256=apLF-HWPNU7pT5bmf5KmZpD5Cklpy2u-BN_0xBoRMLY,1081
+pointblank-0.11.6.dist-info/METADATA,sha256=ni-fTo_hxUkYWEsW2qmncCPnpeyfh6me5aqexPcxXSA,17777
+pointblank-0.11.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+pointblank-0.11.6.dist-info/entry_points.txt,sha256=GqqqOTOH8uZe22wLcvYjzpizqk_j4MNcUo2YM14ryCw,42
+pointblank-0.11.6.dist-info/top_level.txt,sha256=-wHrS1SvV8-nhvc3w-PPYs1C1WtEc1pK-eGjubbCCKc,11
+pointblank-0.11.6.dist-info/RECORD,,
{pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/WHEEL
File without changes

{pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/entry_points.txt
File without changes

{pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/licenses/LICENSE
File without changes

{pointblank-0.11.5.dist-info → pointblank-0.11.6.dist-info}/top_level.txt
File without changes