pointblank 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pointblank/__init__.py +0 -2
- pointblank/_constants.py +2 -28
- pointblank/_constants_translations.py +54 -0
- pointblank/_interrogation.py +1483 -1735
- pointblank/column.py +6 -2
- pointblank/datascan.py +3 -2
- pointblank/schema.py +155 -1
- pointblank/validate.py +459 -222
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/METADATA +3 -2
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/RECORD +14 -15
- pointblank/tf.py +0 -287
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/WHEEL +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/entry_points.txt +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/licenses/LICENSE +0 -0
- {pointblank-0.13.0.dist-info → pointblank-0.13.2.dist-info}/top_level.txt +0 -0
pointblank/column.py
CHANGED
|
@@ -215,8 +215,12 @@ class ColumnSelectorNarwhals(Column):
|
|
|
215
215
|
# Convert the native table to a Narwhals DataFrame
|
|
216
216
|
dfn = nw.from_native(table)
|
|
217
217
|
# Use the selector to select columns and return their names
|
|
218
|
-
|
|
219
|
-
|
|
218
|
+
selected_df = dfn.select(self.exprs.exprs)
|
|
219
|
+
# Use `collect_schema()` for LazyFrame to avoid performance warnings
|
|
220
|
+
if hasattr(selected_df, "collect_schema"):
|
|
221
|
+
return list(selected_df.collect_schema().keys())
|
|
222
|
+
else:
|
|
223
|
+
return list(selected_df.columns)
|
|
220
224
|
|
|
221
225
|
|
|
222
226
|
def col(
|
pointblank/datascan.py
CHANGED
|
@@ -162,14 +162,15 @@ class DataScan:
|
|
|
162
162
|
self.profile: _DataProfile = self._generate_profile_df()
|
|
163
163
|
|
|
164
164
|
def _generate_profile_df(self) -> _DataProfile:
|
|
165
|
-
|
|
165
|
+
# Get schema and extract all column names from it
|
|
166
|
+
schema: Mapping[str, Any] = self.nw_data.collect_schema()
|
|
167
|
+
columns: list[str] = list(schema.keys())
|
|
166
168
|
|
|
167
169
|
profile = _DataProfile(
|
|
168
170
|
table_name=self.tbl_name,
|
|
169
171
|
columns=columns,
|
|
170
172
|
implementation=self.nw_data.implementation,
|
|
171
173
|
)
|
|
172
|
-
schema: Mapping[str, Any] = self.nw_data.schema
|
|
173
174
|
for column in columns:
|
|
174
175
|
col_data: DataFrame = self.nw_data.select(column)
|
|
175
176
|
|
pointblank/schema.py
CHANGED
|
@@ -8,7 +8,7 @@ import narwhals as nw
|
|
|
8
8
|
from pointblank._constants import IBIS_BACKENDS
|
|
9
9
|
from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table
|
|
10
10
|
|
|
11
|
-
__all__ = ["Schema"]
|
|
11
|
+
__all__ = ["Schema", "_check_schema_match"]
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
@dataclass
|
|
@@ -888,6 +888,80 @@ def _schema_info_generate_params_dict(
|
|
|
888
888
|
}
|
|
889
889
|
|
|
890
890
|
|
|
891
|
+
def _check_schema_match(
    data_tbl: any,
    schema: Schema,
    complete: bool = True,
    in_order: bool = True,
    case_sensitive_colnames: bool = True,
    case_sensitive_dtypes: bool = True,
    full_match_dtypes: bool = True,
) -> bool:
    """
    Decide whether a target table satisfies an expected schema.

    The detailed comparison is delegated to `_get_schema_validation_info()`; this function only
    condenses the returned report into a single pass/fail value.

    Parameters
    ----------
    data_tbl
        The target table to validate.
    schema
        The expected schema.
    complete
        When `True`, the report's `"columns_full_set"` entry must be truthy.
    in_order
        When `True`, the report's `"columns_matched_in_order"` entry must be truthy.
    case_sensitive_colnames
        Forwarded to `_get_schema_validation_info()`: whether column names are case-sensitive.
    case_sensitive_dtypes
        Forwarded to `_get_schema_validation_info()`: whether data types are case-sensitive.
    full_match_dtypes
        Forwarded to `_get_schema_validation_info()`: whether data types must match exactly.

    Returns
    -------
    bool
        `True` if no failure condition was detected in the report, `False` otherwise.
    """
    # NOTE(review): a second function with this same name is defined further down in this
    # module; as the later definition, it is the one bound to the name after import.
    report = _get_schema_validation_info(
        data_tbl=data_tbl,
        schema=schema,
        passed=False,  # Placeholder only; the actual verdict is derived from the report below
        complete=complete,
        in_order=in_order,
        case_sensitive_colnames=case_sensitive_colnames,
        case_sensitive_dtypes=case_sensitive_dtypes,
        full_match_dtypes=full_match_dtypes,
    )

    # Table-level failure conditions: completeness, ordering, and expected columns not found
    failures = [
        complete and not report["columns_full_set"],
        in_order and not report["columns_matched_in_order"],
        bool(report["columns_not_found"]),
    ]

    # Column-level failure conditions
    for column_report in report["columns"].values():
        failures.append(not column_report["colname_matched"])
        # `dtype_matched` may be absent when no dtypes were specified; absence counts as a match
        failures.append(not column_report.get("dtype_matched", True))

    return not any(failures)
|
|
963
|
+
|
|
964
|
+
|
|
891
965
|
def _get_schema_validation_info(
|
|
892
966
|
data_tbl: any,
|
|
893
967
|
schema: Schema,
|
|
@@ -1181,3 +1255,83 @@ def _get_schema_validation_info(
|
|
|
1181
1255
|
)
|
|
1182
1256
|
|
|
1183
1257
|
return schema_info
|
|
1258
|
+
|
|
1259
|
+
|
|
1260
|
+
def _check_schema_match(
    data_tbl,
    schema,
    complete: bool = True,
    in_order: bool = True,
    case_sensitive_colnames: bool = True,
    case_sensitive_dtypes: bool = True,
    full_match_dtypes: bool = True,
):
    """
    Compare the schema of a data table against an expected schema.

    A `Schema` is built from `data_tbl` and handed to one of four comparison methods on the
    expected schema, selected by the combination of `complete` and `in_order`.

    Parameters
    ----------
    data_tbl
        A data table.
    schema
        A schema to check against.
    complete
        `True` to require that the schema is complete, `False` to accept a subset.
    in_order
        `True` to require that columns appear in order, `False` to accept any order.
    case_sensitive_colnames
        `True` to perform column-name matching in a case-sensitive manner, `False` otherwise.
    case_sensitive_dtypes
        `True` to perform data-type matching in a case-sensitive manner, `False` otherwise.
    full_match_dtypes
        `True` to perform a full match of data types, `False` otherwise.

    Returns
    -------
    bool
        The result of the selected comparison method: `True` when the schema matches, `False`
        otherwise.
    """
    expected = schema
    observed = Schema(tbl=data_tbl)

    # Pick the comparison method for the requested (complete, in_order) combination
    if complete:
        compare = (
            expected._compare_schema_columns_complete_in_order  # most restrictive check
            if in_order
            else expected._compare_schema_columns_complete_any_order
        )
    else:
        compare = (
            expected._compare_schema_columns_subset_in_order
            if in_order
            else expected._compare_schema_columns_subset_any_order
        )

    return compare(
        other=observed,
        case_sensitive_colnames=case_sensitive_colnames,
        case_sensitive_dtypes=case_sensitive_dtypes,
        full_match_dtypes=full_match_dtypes,
    )
|