pointblank 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pointblank/column.py CHANGED
@@ -215,8 +215,12 @@ class ColumnSelectorNarwhals(Column):
215
215
  # Convert the native table to a Narwhals DataFrame
216
216
  dfn = nw.from_native(table)
217
217
  # Use the selector to select columns and return their names
218
- columns = dfn.select(self.exprs.exprs).columns
219
- return columns
218
+ selected_df = dfn.select(self.exprs.exprs)
219
+ # Use `collect_schema()` for LazyFrame to avoid performance warnings
220
+ if hasattr(selected_df, "collect_schema"):
221
+ return list(selected_df.collect_schema().keys())
222
+ else:
223
+ return list(selected_df.columns)
220
224
 
221
225
 
222
226
  def col(
pointblank/datascan.py CHANGED
@@ -162,14 +162,15 @@ class DataScan:
162
162
  self.profile: _DataProfile = self._generate_profile_df()
163
163
 
164
164
  def _generate_profile_df(self) -> _DataProfile:
165
- columns: list[str] = self.nw_data.columns
165
+ # Get schema and extract all column names from it
166
+ schema: Mapping[str, Any] = self.nw_data.collect_schema()
167
+ columns: list[str] = list(schema.keys())
166
168
 
167
169
  profile = _DataProfile(
168
170
  table_name=self.tbl_name,
169
171
  columns=columns,
170
172
  implementation=self.nw_data.implementation,
171
173
  )
172
- schema: Mapping[str, Any] = self.nw_data.schema
173
174
  for column in columns:
174
175
  col_data: DataFrame = self.nw_data.select(column)
175
176
 
pointblank/schema.py CHANGED
@@ -8,7 +8,7 @@ import narwhals as nw
8
8
  from pointblank._constants import IBIS_BACKENDS
9
9
  from pointblank._utils import _get_tbl_type, _is_lazy_frame, _is_lib_present, _is_narwhals_table
10
10
 
11
- __all__ = ["Schema"]
11
+ __all__ = ["Schema", "_check_schema_match"]
12
12
 
13
13
 
14
14
  @dataclass
@@ -888,6 +888,80 @@ def _schema_info_generate_params_dict(
888
888
  }
889
889
 
890
890
 
891
+ def _check_schema_match(
892
+ data_tbl: any,
893
+ schema: Schema,
894
+ complete: bool = True,
895
+ in_order: bool = True,
896
+ case_sensitive_colnames: bool = True,
897
+ case_sensitive_dtypes: bool = True,
898
+ full_match_dtypes: bool = True,
899
+ ) -> bool:
900
+ """
901
+ Check if the schema matches the target table.
902
+
903
+ This function performs schema validation and returns a boolean result.
904
+
905
+ Parameters
906
+ ----------
907
+ data_tbl
908
+ The target table to validate.
909
+ schema
910
+ The expected schema.
911
+ complete
912
+ Whether the schema should be complete.
913
+ in_order
914
+ Whether the schema should be in order.
915
+ case_sensitive_colnames
916
+ Whether column names are case-sensitive.
917
+ case_sensitive_dtypes
918
+ Whether data types are case-sensitive.
919
+ full_match_dtypes
920
+ Whether data types must match exactly.
921
+
922
+ Returns
923
+ -------
924
+ bool
925
+ True if the schema matches, False otherwise.
926
+ """
927
+ validation_info = _get_schema_validation_info(
928
+ data_tbl=data_tbl,
929
+ schema=schema,
930
+ passed=False, # This will be determined by the logic below
931
+ complete=complete,
932
+ in_order=in_order,
933
+ case_sensitive_colnames=case_sensitive_colnames,
934
+ case_sensitive_dtypes=case_sensitive_dtypes,
935
+ full_match_dtypes=full_match_dtypes,
936
+ )
937
+
938
+ # Determine if the schema validation passed based on the validation info
939
+ passed = True
940
+
941
+ # Check completeness requirement
942
+ if complete and not validation_info["columns_full_set"]:
943
+ passed = False
944
+
945
+ # Check order requirement
946
+ if in_order and not validation_info["columns_matched_in_order"]:
947
+ passed = False
948
+
949
+ # Check if all expected columns were found
950
+ if validation_info["columns_not_found"]:
951
+ passed = False
952
+
953
+ # Check column-specific validations
954
+ for col_info in validation_info["columns"].values():
955
+ if not col_info["colname_matched"]:
956
+ passed = False
957
+ if not col_info.get(
958
+ "dtype_matched", True
959
+ ): # dtype_matched may not exist if no dtypes specified
960
+ passed = False
961
+
962
+ return passed
963
+
964
+
891
965
  def _get_schema_validation_info(
892
966
  data_tbl: any,
893
967
  schema: Schema,
@@ -1181,3 +1255,83 @@ def _get_schema_validation_info(
1181
1255
  )
1182
1256
 
1183
1257
  return schema_info
1258
+
1259
+
1260
+ def _check_schema_match(
1261
+ data_tbl,
1262
+ schema,
1263
+ complete: bool = True,
1264
+ in_order: bool = True,
1265
+ case_sensitive_colnames: bool = True,
1266
+ case_sensitive_dtypes: bool = True,
1267
+ full_match_dtypes: bool = True,
1268
+ ):
1269
+ """
1270
+ Check if the schema of a data table matches an expected schema.
1271
+
1272
+ Parameters
1273
+ ----------
1274
+ data_tbl
1275
+ A data table.
1276
+ schema
1277
+ A schema to check against.
1278
+ complete
1279
+ `True` to check if the schema is complete, `False` otherwise.
1280
+ in_order
1281
+ `True` to check if the schema is in order, `False` otherwise.
1282
+ case_sensitive_colnames
1283
+ `True` to perform column-name matching in a case-sensitive manner, `False` otherwise.
1284
+ case_sensitive_dtypes
1285
+ `True` to perform data-type matching in a case-sensitive manner, `False` otherwise.
1286
+ full_match_dtypes
1287
+ `True` to perform a full match of data types, `False` otherwise.
1288
+
1289
+ Returns
1290
+ -------
1291
+ bool
1292
+ `True` when schema matches, `False` otherwise.
1293
+ """
1294
+ schema_expect = schema
1295
+ schema_actual = Schema(tbl=data_tbl)
1296
+
1297
+ if complete and in_order:
1298
+ # Check if the schema is complete and in order (most restrictive check)
1299
+ # complete: True, in_order: True
1300
+ res = schema_expect._compare_schema_columns_complete_in_order(
1301
+ other=schema_actual,
1302
+ case_sensitive_colnames=case_sensitive_colnames,
1303
+ case_sensitive_dtypes=case_sensitive_dtypes,
1304
+ full_match_dtypes=full_match_dtypes,
1305
+ )
1306
+
1307
+ elif not complete and not in_order:
1308
+ # Check if the schema is at least a subset; the order of columns does not matter
1309
+ # complete: False, in_order: False
1310
+ res = schema_expect._compare_schema_columns_subset_any_order(
1311
+ other=schema_actual,
1312
+ case_sensitive_colnames=case_sensitive_colnames,
1313
+ case_sensitive_dtypes=case_sensitive_dtypes,
1314
+ full_match_dtypes=full_match_dtypes,
1315
+ )
1316
+
1317
+ elif complete:
1318
+ # Check if the schema is complete, but the order of columns does not matter
1319
+ # complete: True, in_order: False
1320
+ res = schema_expect._compare_schema_columns_complete_any_order(
1321
+ other=schema_actual,
1322
+ case_sensitive_colnames=case_sensitive_colnames,
1323
+ case_sensitive_dtypes=case_sensitive_dtypes,
1324
+ full_match_dtypes=full_match_dtypes,
1325
+ )
1326
+
1327
+ else:
1328
+ # Check if the schema is a subset (doesn't need to be complete) and in order
1329
+ # complete: False, in_order: True
1330
+ res = schema_expect._compare_schema_columns_subset_in_order(
1331
+ other=schema_actual,
1332
+ case_sensitive_colnames=case_sensitive_colnames,
1333
+ case_sensitive_dtypes=case_sensitive_dtypes,
1334
+ full_match_dtypes=full_match_dtypes,
1335
+ )
1336
+
1337
+ return res