valediction 1.1.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38):
  1. {valediction-1.1.0 → valediction-1.2.0}/PKG-INFO +1 -1
  2. {valediction-1.1.0 → valediction-1.2.0}/pyproject.toml +1 -1
  3. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/datasets/datasets.py +12 -12
  4. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/generation.py +5 -5
  5. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/helpers.py +0 -7
  6. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/importing.py +43 -20
  7. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/model.py +108 -36
  8. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/integrity.py +67 -13
  9. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/io/csv_readers.py +3 -3
  10. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/support.py +5 -1
  11. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/validation/helpers.py +30 -33
  12. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/validation/issues.py +37 -25
  13. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/validation/validation.py +101 -52
  14. {valediction-1.1.0 → valediction-1.2.0}/.gitignore +0 -0
  15. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/__init__.py +0 -0
  16. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/convenience.py +0 -0
  17. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/data_types/__init__.py +0 -0
  18. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/data_types/data_type_helpers.py +0 -0
  19. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/data_types/data_types.py +0 -0
  20. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/data_types/type_inference.py +0 -0
  21. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/datasets/__init__.py +0 -0
  22. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/datasets/datasets_helpers.py +0 -0
  23. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  24. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/demo/DEMOGRAPHICS.csv +0 -0
  25. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/demo/DIAGNOSES.csv +0 -0
  26. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/demo/LAB_TESTS.csv +0 -0
  27. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/demo/VITALS.csv +0 -0
  28. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/demo/__init__.py +0 -0
  29. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/demo/demo_dictionary.py +0 -0
  30. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/__init__.py +0 -0
  31. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/exporting.py +0 -0
  32. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/exporting_helpers.py +0 -0
  33. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/integrity.py +0 -0
  34. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  35. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/exceptions.py +0 -0
  36. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/io/__init__.py +0 -0
  37. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/progress.py +0 -0
  38. {valediction-1.1.0 → valediction-1.2.0}/src/valediction/validation/__init__.py +0 -0
--- valediction-1.1.0/PKG-INFO
+++ valediction-1.2.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: valediction
-Version: 1.1.0
+Version: 1.2.0
 Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
 Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
 Requires-Python: <4.0,>=3.11
--- valediction-1.1.0/pyproject.toml
+++ valediction-1.2.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "valediction"
-version = "1.1.0"
+version = "1.2.0"
 description = "Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets."
 authors = [{ name = "Cai Davis", email = "Cai.Davis@uhs.nhs.uk" }]
 requires-python = ">=3.11,<4.0"
--- valediction-1.1.0/src/valediction/datasets/datasets.py
+++ valediction-1.2.0/src/valediction/datasets/datasets.py
@@ -20,7 +20,8 @@ from valediction.io.csv_readers import (
 )
 from valediction.support import (
     _get_runtime_string,
-    _normalise_name,
+    _normalise,
+    _strip,
     list_as_bullets,
     print_bold_red,
     print_red,
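The change running through most of this release: the old `_normalise_name` helper (strip + uppercase) is split into two helpers, defined in src/valediction/support.py further down this diff. `_normalise` builds comparison keys for case-insensitive matching, while `_strip` builds stored names that keep their original casing:

    # As defined in src/valediction/support.py in 1.2.0:
    def _normalise(name: str) -> str:  # comparison key (case-insensitive)
        return name.strip().upper()

    def _strip(name: str) -> str:      # stored name (casing preserved)
        return name.strip()

So a name supplied as " Patient_ID " is now stored as "Patient_ID" (1.1.0 would have stored "PATIENT_ID"), while a lookup for "patient_id" still finds it.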
@@ -437,16 +438,16 @@ class Dataset(list[DatasetItem]):
 
     # Getters
     def get(self, name: str, default: DatasetItem | None = None) -> DatasetItem | None:
-        name_key = name.strip()
+        name_key = _normalise(name)
         for item in self:
-            if item.name.lower() == name_key.lower():
+            if _normalise(item.name) == name_key:
                 return item
         return default
 
     def index_of(self, name: str) -> int | None:
-        name_key = name.strip()
+        name_key = _normalise(name)
         for i, item in enumerate(self):
-            if item.name == name_key:
+            if _normalise(item.name) == name_key:
                 return i
         return None
 
@@ -796,20 +797,21 @@ class Dataset(list[DatasetItem]):
         name: str | None,
         data: DataLike,
     ) -> DatasetItem:
-        """Normalise a (data, name) double into a DatasetItem."""
+        """Normalise a (name, data) double into a DatasetItem."""
         if isinstance(data, (str, Path)):
             path = Path(data)
             if not path.exists():
                 raise FileNotFoundError(f"File not found: {path}")
             if path.suffix.lower() != ".csv":
                 raise ValueError(f"Only .csv supported right now, got: {path}")
-            resolved_name = _normalise_name(name or path.stem)
+            resolved_name = _strip(name or path.stem)
             return DatasetItem(name=resolved_name, data=path.resolve())
 
         if isinstance(data, DataFrame):
             if not name:
                 raise ValueError("When providing a DataFrame, 'name' is required.")
-            resolved_name = _normalise_name(name)
+            resolved_name = _strip(name)
+            data.columns = [_strip(column) for column in data.columns]
             return DatasetItem(name=resolved_name, data=data)
 
         raise TypeError("data must be a Path/str to .csv or a pandas DataFrame.")
@@ -823,13 +825,11 @@ class Dataset(list[DatasetItem]):
         if p.is_file():
             if p.suffix.lower() != ".csv":
                 raise ValueError(f"Expected a .csv file, got: {p.suffix} ({p})")
-            return [DatasetItem(name=_normalise_name(p.stem), data=p.resolve())]
+            return [DatasetItem(name=_strip(p.stem), data=p.resolve())]
 
         if p.is_dir():
             return [
-                DatasetItem(
-                    name=_normalise_name(csv_path.stem), data=csv_path.resolve()
-                )
+                DatasetItem(name=_strip(csv_path.stem), data=csv_path.resolve())
                 for csv_path in p.glob("*.csv")
             ]
 
--- valediction-1.1.0/src/valediction/dictionary/generation.py
+++ valediction-1.2.0/src/valediction/dictionary/generation.py
@@ -24,7 +24,7 @@ from valediction.io.csv_readers import (
     read_csv_sample,
 )
 from valediction.progress import Progress
-from valediction.support import _normalise_name, calculate_runtime
+from valediction.support import _strip, calculate_runtime
 
 IMPORTING_DATA = "Importing data"
 CHUNK_STEPS = 1
@@ -124,7 +124,7 @@ class Generator:
         self.__say(f"Generating dictionary for {len(items)} tables")
         for item in items:
             self.__progress_init(item)
-            table = Table(name=_normalise_name(item.name))
+            table = Table(name=_strip(item.name))
             dictionary.add_table(table)
 
             if item.is_path:
@@ -192,7 +192,7 @@ class Generator:
             col_state = inferer.states[col_name]
             data_type, length = col_state.final_data_type_and_length()
             col = Column(
-                name=_normalise_name(col_name),
+                name=_strip(col_name),
                 order=idx,
                 data_type=data_type,
                 length=length if data_type == DataType.TEXT else None,
@@ -242,7 +242,7 @@ class Generator:
             col_state = inferer.states[col_name]
             data_type, length = col_state.final_data_type_and_length()
             col = Column(
-                name=_normalise_name(col_name),
+                name=_strip(col_name),
                 order=idx,
                 data_type=data_type,
                 length=length if data_type == DataType.TEXT else None,
@@ -277,7 +277,7 @@ class Generator:
         next_order = max((c.order or 0 for c in table), default=0) + 1
         data_type, length = col_state.final_data_type_and_length()
         new_col = Column(
-            name=_normalise_name(col_name),
+            name=_strip(col_name),
             order=next_order,
             data_type=data_type,
             length=length if data_type == DataType.TEXT else None,
--- valediction-1.1.0/src/valediction/dictionary/helpers.py
+++ valediction-1.2.0/src/valediction/dictionary/helpers.py
@@ -26,9 +26,6 @@ def _check_name(name: str, entity: Literal["table", "column"]) -> list[str]:
         else config.max_column_name_length
     )
 
-    if name != name.upper():  # name must be uppercase
-        errors.append("must be uppercase")
-
     if invalid_chars.search(name):  # check invalid characters
         bad = set(invalid_chars.findall(name))
         errors.append(
@@ -115,10 +112,6 @@ def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str]:
     return errors
 
 
-def _normalise_name(name: str) -> str:
-    return name.upper().strip()
-
-
def _norm_header_map(columns: list) -> dict:
    mapping, _ = {}, set()
    for c in columns:
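With the uppercase rule deleted above, and the default `invalid_name_pattern` widened to `[^A-Za-z0-9_]` in src/valediction/integrity.py below, mixed-case table and column names now pass validation. A minimal illustration (the regex is the real 1.2.0 default; the assertions are just for demonstration):

    import re

    invalid_chars = re.compile(r"[^A-Za-z0-9_]")  # new Config default

    assert invalid_chars.search("Patient_ID") is None      # mixed case is valid now
    assert invalid_chars.search("patient id") is not None  # spaces are still rejected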
--- valediction-1.1.0/src/valediction/dictionary/importing.py
+++ valediction-1.2.0/src/valediction/dictionary/importing.py
@@ -11,7 +11,6 @@ from valediction.dictionary.helpers import (
     _get_required_header,
     _is_missing,
     _norm_header_map,
-    _normalise_name,
     _parse_int,
     _parse_truthy,
     _row_is_blank,
@@ -19,7 +18,7 @@ from valediction.dictionary.helpers import (
 from valediction.dictionary.integrity import REQUIRED_SHEETS
 from valediction.dictionary.model import Column, Dictionary, Table
 from valediction.exceptions import DataDictionaryError, DataDictionaryImportError
-from valediction.support import list_as_bullets
+from valediction.support import _normalise, _strip, list_as_bullets
 
 
 @dataclass
@@ -80,6 +79,13 @@ class ExcelDataDictionary:
             raise error
 
     # Import & Helpers
+    def _resolve_table_name(self, name: str) -> str | None:
+        """Return the canonical table name as it appears in Tables sheet (or None)."""
+        target = _normalise(name)
+        return next(
+            (t for t in self.table_metadata.keys() if _normalise(t) == target), None
+        )
+
     def _open_workbook(self) -> None:
         if not self.path.exists():
             raise DataDictionaryImportError(f"File not found: {self.path}")
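The new `_resolve_table_name` is the linchpin for case-insensitive imports: rows in the Columns and Enumerations sheets can spell a table name with any casing or padding, and it is mapped back to the canonical spelling from the Tables sheet. A standalone sketch of the same logic (the `table_metadata` contents are hypothetical):

    def _normalise(name: str) -> str:
        return name.strip().upper()

    table_metadata = {"Lab_Tests": "Laboratory results"}  # hypothetical Tables row

    def resolve_table_name(name: str) -> str | None:
        target = _normalise(name)
        return next((t for t in table_metadata if _normalise(t) == target), None)

    assert resolve_table_name("  lab_tests ") == "Lab_Tests"  # canonical spelling
    assert resolve_table_name("MISSING") is None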
@@ -140,20 +146,27 @@ class ExcelDataDictionary:
         description_col_header = _get_required_header(header_map, "description")
 
         meta: dict[str, str | None] = {}
+        seen: set[str] = set()
+
         for _, row in tables_df.iterrows():
             if _is_missing(row[table_col_header]):
                 continue
-            table_name = _normalise_name(str(row[table_col_header]))
+
+            table_name = _strip(str(row[table_col_header]))
             table_description = (
                 None
                 if _is_missing(row[description_col_header])
                 else str(row[description_col_header])
             )
-            if table_name in meta:
+
+            key = _normalise(table_name)
+            if key in seen:
                 raise DataDictionaryImportError(
                     f"Duplicate table '{table_name}' in Tables sheet."
                 )
+            seen.add(key)
             meta[table_name] = table_description
+
         if not meta:
             raise DataDictionaryImportError(
                 "Data Dictionary sheet 'Tables' contains no table rows."
@@ -177,12 +190,13 @@ class ExcelDataDictionary:
                 or _is_missing(row[code_col_header])
             ):
                 continue
-            table_name = _normalise_name(str(row[table_col_header]))
-            column_name = _normalise_name(str(row[column_col_header]))
-            enum_map.setdefault((table_name, column_name), {})
-            enum_map[(table_name, column_name)][row[code_col_header]] = row[
-                name_col_header
-            ]
+            table_name = _strip(str(row[table_col_header]))
+            column_name = _strip(str(row[column_col_header]))
+            resolved_table = self._resolve_table_name(table_name) or table_name
+            enum_key = (_normalise(resolved_table), _normalise(column_name))
+            enum_map.setdefault(enum_key, {})
+            enum_map[enum_key][row[code_col_header]] = row[name_col_header]
+
         self.enumerations = enum_map
 
         # Parse Columns
@@ -234,7 +248,12 @@ class ExcelDataDictionary:
 
             self.table_columns[inputs.table_name].append(column_obj)
             if inputs.has_enumerations:
-                self.enum_flags.add((inputs.table_name, inputs.column_name))
+                self.enum_flags.add(
+                    (
+                        _normalise(inputs.table_name),
+                        _normalise(inputs.column_name),
+                    )
+                )
 
         if errors:
             raise DataDictionaryImportError(
@@ -279,7 +298,7 @@ class ExcelDataDictionary:
 
     # Validate Foreign Keys
     def _validate_foreign_keys(self) -> None:
-        name_to_table = {t.name: t for t in self.tables}
+        name_to_table = {_normalise(t.name): t for t in self.tables}
         errors: list[str] = []
         for table in self.tables:
             for column in table:
@@ -292,9 +311,9 @@ class ExcelDataDictionary:
                 )
                 continue
             target_table_raw, target_column_raw = target.split(".", 1)
-            target_table_name = _normalise_name(target_table_raw)
-            target_column_name = _normalise_name(target_column_raw)
-            referenced_table = name_to_table.get(target_table_name)
+            target_table_name = _strip(target_table_raw)
+            target_column_name = _strip(target_column_raw)
+            referenced_table = name_to_table.get(_normalise(target_table_name))
             if not referenced_table:
                 errors.append(
                     f"{table.name}.{column.name} references unknown table {target_table_name!r}."
@@ -392,13 +411,17 @@ class ExcelDataDictionary:
                 f"{row_context}: missing required field(s): {', '.join(missing_fields)}."
             )
 
-        table_name = _normalise_name(str(row[table_col_header]))
-        column_name = _normalise_name(str(row[column_col_header]))
-        if table_name not in self.table_metadata:
+        table_name_raw = _strip(str(row[table_col_header]))
+        column_name = _strip(str(row[column_col_header]))
+
+        resolved_table_name = self._resolve_table_name(table_name_raw)
+        if resolved_table_name is None:
             raise DataDictionaryImportError(
-                f"{row_context}: Table '{table_name}' not present in Tables sheet."
+                f"{row_context}: Table '{table_name_raw}' not present in Tables sheet."
             )
 
+        table_name = resolved_table_name
+
         order_int = _parse_int(row[order_col_header], "Order", row_context)
         length_int = (
             _parse_int(row[length_col_header], "Length", row_context, required=False)
@@ -461,7 +484,7 @@ class ExcelDataDictionary:
 
     def _make_column(self, inputs: _ColumnInputs) -> Column:
         enums_for_column = self.enumerations.get(
-            (inputs.table_name, inputs.column_name), {}
+            (_normalise(inputs.table_name), _normalise(inputs.column_name)), {}
         )
         return Column(
             name=inputs.column_name,
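Enumeration lookups follow the same scheme: both `enum_map` and `enum_flags` are keyed on normalised `(table, column)` pairs, so a code list declared under one casing attaches to a column declared under another. Illustratively:

    def _normalise(name: str) -> str:
        return name.strip().upper()

    key_a = (_normalise("lab_tests "), _normalise("LOINC"))  # ("LAB_TESTS", "LOINC")
    key_b = (_normalise("Lab_Tests"), _normalise(" loinc"))  # ("LAB_TESTS", "LOINC")
    assert key_a == key_b  # both rows land in the same enumeration bucket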
--- valediction-1.1.0/src/valediction/dictionary/model.py
+++ valediction-1.2.0/src/valediction/dictionary/model.py
@@ -9,10 +9,9 @@ from valediction.dictionary.helpers import (
     _check_name,
     _check_order,
     _check_primary_key,
-    _normalise_name,
 )
 from valediction.exceptions import DataDictionaryError
-from valediction.support import list_as_bullets
+from valediction.support import _normalise, _strip, list_as_bullets
 
 
 class Column:
@@ -44,7 +43,7 @@ class Column:
         description: str | None = None,
         datetime_format: str | None = None,
     ):
-        self.name = _normalise_name(name)
+        self.name = _strip(name)
         self.order = int(order) if order is not None else None
         self.data_type: DataType = None
         self.length = int(length) if length is not None else None
@@ -127,7 +126,7 @@ class Table(list[Column]):
         columns: list[Column] | None = None,
     ):
         super().__init__()
-        self.name = _normalise_name(name)
+        self.name = _strip(name)
         self.description = description
         for column in columns or []:
             self.add_column(column)
@@ -139,24 +138,28 @@ class Table(list[Column]):
         )
         return f"Table(name={self.name!r}, description={self.description!r}{cols_str})"
 
+    def __key(self, name: str) -> str:
+        return _normalise(name)
+
     def __getitem__(self, key: int | str) -> Column:
         if isinstance(key, int):
             return super().__getitem__(key)
-        target = _normalise_name(key)
-        found = next((c for c in self if c.name == target), None)
+
+        target_key = self.__key(key)
+        found = next((c for c in self if self.__key(c.name) == target_key), None)
         if not found:
             raise KeyError(f"Column {key!r} not found in table {self.name!r}.")
         return found
 
     def __get(self, name: str, default: Column | None = None) -> Column | None:
-        target = _normalise_name(name)
-        return next((c for c in self if c.name == target), default)
+        target_key = self.__key(name)
+        return next((c for c in self if self.__key(c.name) == target_key), default)
 
     # Getters
     def index_of(self, name: str) -> int | None:
-        target = _normalise_name(name)
+        target_key = self.__key(name)
         for i, c in enumerate(self):
-            if c.name == target:
+            if self.__key(c.name) == target_key:
                 return i
         return None
 
@@ -303,16 +306,17 @@ class Table(list[Column]):
         if not isinstance(column, Column):
             raise DataDictionaryError("Only Column objects can be added to a Table.")
 
-        if column.name in self.get_column_names():
-            conflict = self.get_column(column.name)
+        incoming_key = self.__key(column.name)
+        conflict = next((c for c in self if self.__key(c.name) == incoming_key), None)
+        if conflict is not None:
             raise DataDictionaryError(
-                f"Column {column.name!r} already exists (order={conflict.order!r})"
+                f"Column {column.name!r} already exists (order={conflict.order!r}, as {conflict.name!r})."
             )
 
         if column.order in self.get_column_orders():
-            conflict = self.get_column(column.order)
+            conflict_by_order = self.get_column(column.order)
             raise DataDictionaryError(
-                f"Order {column.order!r} already exists (name={conflict.name!r})"
+                f"Order {column.order!r} already exists (name={conflict_by_order.name!r})"
             )
 
         if column.primary_key is not None:
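Duplicate detection in `add_column` now compares normalised keys while reporting the originally stored spelling. A sketch of the new failure mode (hypothetical table and columns; required Column fields elided):

    table = Table(name="PATIENTS")
    table.add_column(Column(name="Patient_ID", order=1))
    table.add_column(Column(name="PATIENT_ID", order=2))
    # DataDictionaryError: Column 'PATIENT_ID' already exists (order=1, as 'Patient_ID').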
@@ -339,10 +343,7 @@ class Table(list[Column]):
         Raises:
             DataDictionaryError: if the column does not exist
         """
-        if isinstance(column, str):
-            name = self.get_column(column).name
-        else:
-            name = self.get_column(column).name  # by order
+        name = self.get_column(column).name
         remaining = [c for c in self if c.name != name]
         self.clear()
         super().extend(remaining)
@@ -367,16 +368,17 @@ class Table(list[Column]):
         for col in self:
             col.primary_key = None
 
-        # Resolve and dedupe
+        # Resolve and deduplicate
         resolved: list[Column] = []
         seen: set[str] = set()
         for key in primary_keys:
             col = self.get_column(key)
-            if col.name in seen:
+            col_key = self.__key(col.name)
+            if col_key in seen:
                 raise DataDictionaryError(
                     f"Duplicate column {col.name!r} provided for table {self.name!r}."
                 )
-            seen.add(col.name)
+            seen.add(col_key)
             resolved.append(col)
 
         # Assign ordinals 1..N
@@ -416,14 +418,20 @@ class Dictionary(list[Table]):
     ):
         super().__init__()
         self.name = name
+
+        if isinstance(tables, Table):
+            tables = [tables]
+
         for t in tables or []:
             self.add_table(t)
+
         self.organisations = organisations
         self.version = version
         self.version_notes = version_notes
         self.inclusion_criteria = inclusion_criteria
         self.exclusion_criteria = exclusion_criteria
         self.imported = imported
+        self.__check_variables()
 
     # Properties
     @property
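Two behavioural additions land in the `Dictionary` constructor: a bare `Table` is now accepted in place of a list, and metadata fields are type-checked (with numeric versions coerced to strings) via the new `__check_variables` chain shown in the next hunk. Hypothetical usage:

    d = Dictionary(name="DEMO", tables=Table(name="PATIENTS"))  # single Table accepted
    d = Dictionary(name="DEMO", version=1.2)  # version stored as "1.2"
    Dictionary(name=123)  # DataDictionaryError: Dictionary `name` must be a string.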
@@ -439,24 +447,85 @@ class Dictionary(list[Table]):
         tables = list_as_bullets(elements=[str(t) for t in self], bullet="\n- ")
         return f"Dictionary(name={self.name!r}, imported={self.imported!r}, {tables})"
 
+    def __key(self, name: str) -> str:
+        return _normalise(name)
+
     def __getitem__(self, key: int | str) -> Table:
         if isinstance(key, int):
             return super().__getitem__(key)
-        target = _normalise_name(key)
-        found = next((t for t in self if t.name == target), None)
+
+        target_key = self.__key(key)
+        found = next((t for t in self if self.__key(t.name) == target_key), None)
         if not found:
             raise KeyError(f"Table {key!r} not found in Dictionary.")
         return found
 
-    # Getters
     def __get(self, name: str, default: Table | None = None) -> Table | None:
-        target = _normalise_name(name)
-        return next((t for t in self if t.name == target), default)
+        target_key = self.__key(name)
+        return next((t for t in self if self.__key(t.name) == target_key), default)
+
+    # Checkers
+    def __check_variables(self) -> None:
+        self.__check_name()
+        self.__check_organisations()
+        self.__check_version()
+        self.__check_version_notes()
+        self.__check_criteria()
+
+    def __check_name(self) -> None:
+        # Check name
+        if self.name is not None:
+            if not isinstance(self.name, str):
+                raise DataDictionaryError("Dictionary `name` must be a string.")
+
+    def __check_organisations(self) -> None:
+        # Check organisations
+        if self.organisations is not None:
+            if not isinstance(self.organisations, str):
+                raise DataDictionaryError(
+                    "Dictionary `organisations` must be a string."
+                )
+
+    def __check_version(self) -> None:
+        # Check version
+        if self.version is not None:
+            if not isinstance(self.version, (str, int, float)):
+                raise DataDictionaryError(
+                    "Dictionary `version` must be a string, int, or float."
+                )
+
+            if isinstance(self.version, (int, float)):
+                self.version = str(self.version)
+
+        # Check version_notes
 
+    def __check_version_notes(self) -> None:
+        if self.version_notes is not None:
+            if not isinstance(self.version_notes, str):
+                raise DataDictionaryError(
+                    "Dictionary `version_notes` must be a string."
+                )
+
+    def __check_criteria(self) -> None:
+        # Check inclusion_criteria
+        if self.inclusion_criteria is not None:
+            if not isinstance(self.inclusion_criteria, str):
+                raise DataDictionaryError(
+                    "Dictionary `inclusion_criteria` must be a string."
+                )
+
+        # Check exclusion_criteria
+        if self.exclusion_criteria is not None:
+            if not isinstance(self.exclusion_criteria, str):
+                raise DataDictionaryError(
+                    "Dictionary exclusion_criteria must be a string."
+                )
+
+    # Getters
     def index_of(self, name: str) -> int | None:
-        target = _normalise_name(name)
+        target_key = self.__key(name)
         for i, t in enumerate(self):
-            if t.name == target:
+            if self.__key(t.name) == target_key:
                 return i
         return None
 
@@ -484,12 +553,9 @@ class Dictionary(list[Table]):
         Raises:
             KeyError: If the table is not found in the dictionary.
         """
-        target = _normalise_name(table)
-        found = next((t for t in self if t.name == target), None)
-
-        if not found:
+        found = self.__get(table)
+        if found is None:
             raise KeyError(f"Table {table!r} not found in Dictionary.")
-
         return found
 
     # Manipulation
@@ -508,8 +574,14 @@ class Dictionary(list[Table]):
             raise DataDictionaryError(
                 "Only Table objects can be added to a Dictionary."
             )
-        if table.name in self.get_table_names():
-            raise DataDictionaryError(f"Table {table.name!r} already exists.")
+
+        incoming_key = self.__key(table.name)
+        conflict = next((t for t in self if self.__key(t.name) == incoming_key), None)
+        if conflict is not None:
+            raise DataDictionaryError(
+                f"Table {table.name!r} already exists (as {conflict.name!r})."
+            )
+
         super().append(table)
 
     def remove_table(self, table: str) -> None:
--- valediction-1.1.0/src/valediction/integrity.py
+++ valediction-1.2.0/src/valediction/integrity.py
@@ -1,6 +1,10 @@
+from __future__ import annotations
+
 import re
+from copy import deepcopy
 from pathlib import Path
 from re import Pattern
+from typing import Any
 
 from valediction.data_types.data_types import DataType
 from valediction.support import list_as_bullets
@@ -12,13 +16,58 @@ TEMPLATE_DATA_DICTIONARY_PATH = (
 )
 
 
+externally_injected_variables: dict[
+    str, Any
+] = {}  # External injection store for package wrapping (any keys, always included)
+
+
+def reset_injected_config_variables() -> None:
+    global externally_injected_variables
+    externally_injected_variables = {}
+
+
+def inject_config_variables(variables: dict[str, Any]) -> None:
+    """Injects variables into the Valediction Config, which will always be incorporated
+    as overrides, regardless of Config calling method (default, session-scoped, or
+    contextual).
+
+    Args:
+        variables (dict[str, Any]): Dictionary of config variables.
+    """
+    global externally_injected_variables, session_config
+
+    # check type allows
+    if not isinstance(variables, dict):
+        raise TypeError(
+            f"Config injection variables must be a dictionary, not {type(variables)}"
+        )
+    problematic_keys = []
+    for variable_name in variables.keys():
+        if not isinstance(variable_name, str):
+            problematic_keys.append(variable_name)
+
+    if problematic_keys:
+        raise TypeError("Config injection variables accepts only string keys.")
+
+    externally_injected_variables = dict(variables or {})
+
+    # Apply immediately to the current session config (if it exists)
+    if session_config is not None:
+        _apply_external_injections(session_config)
+
+
+def _apply_external_injections(config: Config) -> None:
+    for variable_name, variable_value in externally_injected_variables.items():
+        setattr(config, variable_name, deepcopy(variable_value))
+
+
 class Config:
     def __init__(self):
         self.template_data_dictionary_path: Path = TEMPLATE_DATA_DICTIONARY_PATH
         self.max_table_name_length: int = 63
         self.max_column_name_length: int = 30
         self.max_primary_keys: int = 7
-        self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Z0-9_]")
+        self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Za-z0-9_]")
         self.null_values: list[str] = ["", "null", "none"]
         self.forbidden_characters: list[str] = []
         self.date_formats: dict[str, DataType] = {
@@ -42,6 +91,7 @@ class Config:
         }
         self.enforce_no_null_columns: bool = True
         self.enforce_primary_keys: bool = True
+        _apply_external_injections(self)
 
     def __repr__(self):
         date_list = list_as_bullets(
@@ -65,33 +115,37 @@ class Config:
 
     # Context Wrapper With Reset
     def __enter__(self):
-        global default_config
-        default_config = self
+        global session_config
+
+        _apply_external_injections(self)
+
+        session_config = self
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        global default_config
-        default_config = Config()
+        global session_config
+        session_config = Config()
 
 
-default_config: Config = None
+session_config: Config = None
 
 
 def get_config() -> Config:
-    """Gets the current `default_config` instance. Changing attributes will set them
-    globally.
+    """Gets the current `session_config` instance. Changing attributes will set them
+    globally for the python session. Use `reset_default_config()` to reset to original
+    defaults.
 
     Returns:
-        Config: The current default configuration.
+        Config: The current session configuration.
     """
-    global default_config
-    return default_config
+    global session_config
+    return session_config
 
 
 def reset_default_config() -> None:
     """Resets `default_config` settings globally to original defaults."""
-    global default_config
-    default_config = Config()
+    global session_config
+    session_config = Config()
 
 
 reset_default_config()
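Taken together, this file renames the module-level `default_config` to `session_config` and adds an injection mechanism so wrapping packages can impose overrides that survive every way a `Config` is created: the override is re-applied in `__init__` and `__enter__`, and to the live session config immediately. A sketch of the intended flow, assuming the overridden attribute exists on `Config`:

    from valediction.integrity import Config, get_config, inject_config_variables

    inject_config_variables({"max_primary_keys": 10})

    assert get_config().max_primary_keys == 10  # applied to the live session config

    with Config() as cfg:                  # contextual config
        assert cfg.max_primary_keys == 10  # re-applied on construction/entry
    # __exit__ rebuilds the session config, which re-applies the override again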
--- valediction-1.1.0/src/valediction/io/csv_readers.py
+++ valediction-1.2.0/src/valediction/io/csv_readers.py
@@ -11,7 +11,7 @@ import pandas as pd
 from pandas import DataFrame
 from pandas.errors import ParserError
 
-from valediction.support import _normalise_name
+from valediction.support import _strip
 
 
 class FrameChunk(NamedTuple):
@@ -34,7 +34,7 @@ class FrameChunk(NamedTuple):
     total_chunks_seen: int | None
 
     def estimate_chunk_count(self) -> int:
-        # Buffers (accounting for CSV tails/bytes innacuracy)
+        # Buffers (accounting for CSV tails/bytes inaccuracy)
         EPS_ABS = 4096  # Fixed
         EPS_REL = 0.05  # 5% tail buffer
 
@@ -93,7 +93,7 @@ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
     """Apply header normalisation and vectorised value stripping after reading."""
     cfg = cfg or CsvReadConfig()
     if cfg.normalise_headers:
-        df = df.rename(columns={c: _normalise_name(c) for c in df.columns})
+        df = df.rename(columns={c: _strip(c) for c in df.columns})
     if cfg.strip_values:
         str_cols = df.select_dtypes(include=["string"]).columns
         if len(str_cols) > 0:
--- valediction-1.1.0/src/valediction/support.py
+++ valediction-1.2.0/src/valediction/support.py
@@ -35,10 +35,14 @@ def list_as_bullets(elements: list, bullet: str = "\n - ") -> str:
     return bullet + bullet.join(elements)
 
 
-def _normalise_name(name: str) -> str:
+def _normalise(name: str) -> str:
     return name.strip().upper()
 
 
+def _strip(name: str) -> str:
+    return name.strip()
+
+
 def _get_runtime_string(runtime: timedelta) -> str:
     total_seconds = runtime.total_seconds()
     hours = trunc(total_seconds / 3600)
--- valediction-1.1.0/src/valediction/validation/helpers.py
+++ valediction-1.2.0/src/valediction/validation/helpers.py
@@ -10,6 +10,7 @@ from pandas.util import hash_pandas_object
 from valediction.data_types.data_types import DataType
 from valediction.dictionary.model import Table
 from valediction.integrity import get_config
+from valediction.support import _normalise
 from valediction.validation.issues import Range
 
 
@@ -17,11 +18,14 @@ from valediction.validation.issues import Range
 def _set_nulls(df: DataFrame) -> DataFrame:
     null_values = get_config().null_values
     token_set = {str(t).strip().casefold() for t in null_values}
-    columns = df.select_dtypes(include=["string", "object"]).columns
+    columns = df.select_dtypes(include=["string", "object", "category"]).columns
     for column in columns:
         series = df[column]
-        mask = series.notna() & series.str.casefold().isin(token_set)
-        df[column] = series.mask(mask, NA)
+
+        s_txt = series.astype("string", copy=False)  # dtype safe
+        mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
+        if mask.any():
+            df[column] = series.mask(mask, NA)
 
     return df
 
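`_set_nulls` now also visits categorical columns and casts to pandas `string` dtype before matching, so null tokens are caught regardless of the column's physical dtype (and regardless of surrounding whitespace, thanks to the added `.str.strip()`). A condensed illustration with hypothetical data:

    import pandas as pd
    from pandas import NA

    tokens = {"", "null", "none"}  # Config().null_values, casefolded
    s = pd.Series([" NULL ", "A1", "none"], dtype="category")

    s_txt = s.astype("string")  # works for object/category/string alike
    mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(tokens)
    cleaned = s.mask(mask, NA)  # " NULL " and "none" become missing, "A1" survives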
@@ -68,37 +72,24 @@ def create_pk_hashes(
     Returns:
         Series: Pandas Series with hashes or Nulls.
     """
-    hash_col_name = "PK_HASH"
+    HASH_COL_NAME = "PK_HASH"
     if df_primaries.empty or df_primaries.shape[1] == 0:
-        return Series([], dtype=object, name=hash_col_name)
+        return Series([], dtype=object, name=HASH_COL_NAME)
 
-    # Any NA in row => invalid PK -> None
+    # Check Nulls
     null_rows = df_primaries.isna().any(axis=1)
 
-    # First Hash
-    hash_1 = hash_pandas_object(df_primaries, index=False)  # uint64
-
-    # Second Hash (rows backwards if single row, else salt)
-    if df_primaries.shape[1] > 1:
-        df_primaries_backwards = df_primaries.iloc[:, ::-1]
-    else:
-        s = df_primaries.iloc[:, 0]
-        salt = Series(["§"] * len(s), index=s.index, dtype="string")
-        df_primaries_backwards = DataFrame(
-            {
-                "_a": s,
-                "_b": s.str.cat(salt),
-            }
-        )
-
-    hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64
+    # Two independent 64-bit hashes with 16 byte keys
+    hash_1 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk1!")
+    hash_2 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk2!")
 
+    # Combine into 128-bit integer keys
     a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
     a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
-
     combined = (a1 << 64) | a2
+
     hashes = Series(
-        combined, index=df_primaries.index, name=hash_col_name, dtype=object
+        combined, index=df_primaries.index, name=HASH_COL_NAME, dtype=object
     )
     hashes[null_rows] = None
     return hashes
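The 1.1.0 scheme (a second hash over reversed or salted columns) is replaced with two `hash_pandas_object` passes under different 16-character `hash_key`s, combined into a single 128-bit integer per row, which makes cross-row collisions vanishingly unlikely. The construction in isolation (illustrative column names and keys):

    import pandas as pd
    from pandas.util import hash_pandas_object

    df = pd.DataFrame({"MRN": ["A1", "A2"], "VISIT": [1, 2]})  # hypothetical PK columns

    h1 = hash_pandas_object(df, index=False, hash_key="0123456789abcdef")  # uint64/row
    h2 = hash_pandas_object(df, index=False, hash_key="fedcba9876543210")

    a1 = h1.to_numpy(dtype="uint64").astype(object)  # object dtype => Python ints
    a2 = h2.to_numpy(dtype="uint64").astype(object)
    combined = (a1 << 64) | a2  # one 128-bit key per row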
@@ -167,8 +158,9 @@ def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
     if df_primaries.empty or df_primaries.shape[1] == 0:
         return Series(False, index=df_primaries.index)
 
-    col_masks = df_primaries.apply(lambda s: s.str.contains(r"\s", na=False))
-
+    col_masks = df_primaries.apply(
+        lambda s: s.astype("string", copy=False).str.contains(r"\s", na=False)
+    )
     return col_masks.any(axis=1)
 
@@ -261,7 +253,9 @@ def invalid_mask_text_too_long(column: Series, max_len: int) -> Series:
     return Series(False, index=column.index)
 
     notnull = column.notna()
-    lens = column.str.len()
+    s_txt = column.astype("string", copy=False)
+    lens = s_txt.str.len()
+
     return notnull & (lens > max_len)
 
@@ -270,20 +264,23 @@ def invalid_mask_text_forbidden_characters(column: Series) -> Series:
     if not forbidden:
         return column.notna() & False
 
-    pattern = "[" + re.escape("".join(forbidden)) + "]"
+    pattern = "[" + re.escape("".join([str(s) for s in forbidden])) + "]"
     notnull = column.notna()
-    has_forbidden = column.str.contains(pattern, regex=True, na=False)
+
+    s_txt = column.astype("string", copy=False)
+    has_forbidden = s_txt.str.contains(pattern, regex=True, na=False)
+
     return notnull & has_forbidden
 
 
 # Apply Data Types #
 def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
     # name -> column object
-    column_dictionary = {column.name: column for column in table_dictionary}
+    column_dictionary = {_normalise(column.name): column for column in table_dictionary}
 
     for col in df.columns:
-        data_type = column_dictionary.get(col).data_type
-        datetime_format = column_dictionary.get(col).datetime_format
+        data_type = column_dictionary.get(_normalise(col)).data_type
+        datetime_format = column_dictionary.get(_normalise(col)).datetime_format
 
         if data_type in (DataType.TEXT, DataType.FILE):
             df[col] = df[col].astype("string")
--- valediction-1.1.0/src/valediction/validation/issues.py
+++ valediction-1.2.0/src/valediction/validation/issues.py
@@ -8,7 +8,7 @@ from pandas import DataFrame, concat
 
 from valediction.datasets.datasets_helpers import DatasetItemLike
 from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
-from valediction.support import _normalise_name, list_as_bullets
+from valediction.support import _strip, list_as_bullets
 
 
 class IssueType(Enum):
@@ -107,6 +107,7 @@ class Issue:
             merged.append(cur)
         self.ranges = merged
 
+    # Inspect
     def inspect(
         self,
         additional_columns: bool | str | list[str] | None = None,
@@ -132,9 +133,9 @@ class Issue:
             ValueError: if the issue has no parent DatasetItem
         """
         # Guard
-        if not self.parent:
-            raise ValueError("Issue has no parent DatasetItem")
+        self.__guard_parent()
         header = self.__repr__() if print_header else ""
+
         # Not applicable
         if self.type in APPLIES_WHOLE_COLUMN:
             print(f"{header}: applies to whole column")
@@ -143,22 +144,8 @@ class Issue:
         # Column Inclusion
         if print_header:
             print(f"{header}:")
-        if additional_columns is True:
-            columns = None
-        else:
-            additional_columns = (
-                [additional_columns]
-                if isinstance(additional_columns, str)
-                else additional_columns
-            )
-            base = (
-                set(self.parent.primary_keys)
-                if self.type in PRIMARY_KEY_ISSUES
-                else {self.column}
-            )
-            base |= set(additional_columns or [])
-            base.discard(None)
-            columns = list(base) if base else None
+
+        columns = self.__select_columns(additional_columns)
 
         if not self.ranges:
             return DataFrame(columns=columns) if columns else DataFrame()
@@ -194,6 +181,31 @@ class Issue:
 
         return out if columns is None else out.loc[:, columns]
 
+    # Inspect Helpers
+    def __guard_parent(self):
+        if not self.parent:
+            raise ValueError("Issue has no parent DatasetItem")
+
+    def __select_columns(self, additional_columns: bool | str | list[str]) -> list:
+        if additional_columns is True:
+            columns = None
+        else:
+            additional_columns = (
+                [additional_columns]
+                if isinstance(additional_columns, str)
+                else additional_columns
+            )
+            base = (
+                set(self.parent.primary_keys)
+                if self.type in PRIMARY_KEY_ISSUES
+                else {self.column}
+            )
+            base |= set(additional_columns or [])
+            base.discard(None)
+            columns = list(base) if base else None
+
+        return columns
+
 
 @dataclass
 class Issues:
@@ -235,8 +247,8 @@ class Issues:
         parent: DatasetItemLike | None = None,
     ) -> Issue:
         key = (
-            _normalise_name(table),
-            _normalise_name(column) if column is not None else None,
+            _strip(table),
+            _strip(column) if column is not None else None,
             issue_type,
         )
         issue = self._index.get(key)
@@ -255,8 +267,8 @@ class Issues:
         issue_type: IssueType | None = None,
     ) -> list[Issue]:
         """Case-insensitive filter; any arg can be None to act as a wildcard."""
-        table = _normalise_name(table)
-        column = _normalise_name(column) if column is not None else None
+        table = _strip(table)
+        column = _strip(column) if column is not None else None
         output: list[Issue] = []
         if issue_type is not None:
             # direct index lookup where possible
@@ -268,9 +280,9 @@ class Issues:
 
         # otherwise scan (still cheap; we maintain a compact list)
         for item in self._items:
-            if _normalise_name(item.table) != table:
+            if _strip(item.table) != table:
                 continue
-            if column is not None and (_normalise_name(item.column) or "") != column:
+            if column is not None and (_strip(item.column) or "") != column:
                 continue
             output.append(item)
         return output
--- valediction-1.1.0/src/valediction/validation/validation.py
+++ valediction-1.2.0/src/valediction/validation/validation.py
@@ -20,7 +20,7 @@ from valediction.io.csv_readers import (
     iter_csv_chunks,
 )
 from valediction.progress import Progress
-from valediction.support import _get_runtime_string, calculate_runtime
+from valediction.support import _get_runtime_string, _normalise, calculate_runtime
 from valediction.validation.helpers import (
     _column_has_values,
     _set_nulls,
@@ -86,7 +86,9 @@ class Validator:
         self._dt_needs_infer: set[str] = set()
 
         # Helpers
-        self._column_names: set = set(self.table_dictionary.get_column_names())
+        self._column_names: set[str] = {
+            _normalise(n) for n in self.table_dictionary.get_column_names()
+        }
 
         # Progress Tracking
         self.progress: Progress | None = None
@@ -155,6 +157,20 @@ class Validator:
         if not datetime_format:
             self._dt_needs_infer.add(name)
 
+    # Column Scanning
+    def _resolve_df_col(self, df: DataFrame, name: str) -> str | None:
+        """Return the actual df column label matching name case-insensitively."""
+        target = _normalise(name)
+        return next((c for c in df.columns if _normalise(str(c)) == target), None)
+
+    def _resolve_df_cols(self, df: DataFrame, names: list[str]) -> list[str]:
+        resolved: list[str] = []
+        for n in names:
+            c = self._resolve_df_col(df, n)
+            if c is not None:
+                resolved.append(c)
+        return resolved
+
     # Validate
     def validate(self):
         """
@@ -272,28 +288,45 @@ class Validator:
     # Validation: Start Helpers
     def _check_for_missing_columns(self, df: DataFrame):
         self.__begin_step(step="Checking for missing columns")
-        missing = self._column_names - set(df.columns)
-        if missing:
-            for column in missing:
-                self.issues.add(
-                    issue_type=IssueType.MISSING_COLUMN,
-                    table=self.table_name,
-                    column=column,
-                    parent=self.dataset_item,
-                )
+
+        dict_names = self.table_dictionary.get_column_names()
+        dict_keys = {_normalise(name) for name in dict_names}
+
+        df_keys = {_normalise(str(column)) for column in df.columns}
+
+        missing_keys = dict_keys - df_keys
+        if missing_keys:
+            for name in dict_names:
+                if _normalise(name) in missing_keys:
+                    self.issues.add(
+                        issue_type=IssueType.MISSING_COLUMN,
+                        table=self.table_name,
+                        column=name,
+                        parent=self.dataset_item,
+                    )
+
         self.__complete_step()
 
     def _check_for_extra_columns(self, df: DataFrame):
         self.__begin_step(step="Checking for extra columns")
-        extra = set(df.columns) - self._column_names
-        if extra:
-            for column in extra:
-                self.issues.add(
-                    issue_type=IssueType.EXTRA_COLUMN,
-                    table=self.table_name,
-                    column=column,
-                    parent=self.dataset_item,
-                )
+
+        dict_keys = {
+            _normalise(name) for name in self.table_dictionary.get_column_names()
+        }
+        df_cols = [str(column) for column in df.columns]
+        df_keys = {_normalise(column) for column in df_cols}
+
+        extra_keys = df_keys - dict_keys
+        if extra_keys:
+            for col in df_cols:
+                if _normalise(col) in extra_keys:
+                    self.issues.add(
+                        issue_type=IssueType.EXTRA_COLUMN,
+                        table=self.table_name,
+                        column=col,  # report the actual df label
+                        parent=self.dataset_item,
+                    )
+
         self.__complete_step()
 
     # Validation: Chunk Helpers
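Missing/extra column detection now reconciles the dictionary and the DataFrame through normalised keys while reporting the human-readable spellings (the dictionary's name for missing columns, the actual df label for extras). The set arithmetic in miniature (hypothetical names):

    def _normalise(name: str) -> str:
        return name.strip().upper()

    dict_keys = {_normalise(c) for c in ["Patient_ID", "Sex"]}  # {"PATIENT_ID", "SEX"}
    df_keys = {_normalise(c) for c in ["patient_id", "AGE"]}    # {"PATIENT_ID", "AGE"}

    missing = dict_keys - df_keys  # {"SEX"} -> MISSING_COLUMN reported as "Sex"
    extra = df_keys - dict_keys    # {"AGE"} -> EXTRA_COLUMN reported as "AGE"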
@@ -319,13 +352,16 @@ class Validator:
 
         # Check for whitespace (text cols only)
         self.__begin_step(step="Checking for primary key whitespace")
-        pk_cols_text = []
-        for column in self.table_dictionary:
-            if column.name in pk_cols and column.data_type in [DataType.TEXT]:
-                pk_cols_text.append(column.name)
+        pk_keys = {_normalise(p) for p in pk_cols}
+        pk_cols_text = [
+            column.name
+            for column in self.table_dictionary
+            if _normalise(column.name) in pk_keys and column.data_type is DataType.TEXT
+        ]
 
         if pk_cols_text:
-            space_mask = pk_contains_whitespace_mask(df[pk_cols_text])
+            pk_cols_text_df = self._resolve_df_cols(df, pk_cols_text)
+            space_mask = pk_contains_whitespace_mask(df[pk_cols_text_df])
             if space_mask.any():
                 self.issues.add(
                     issue_type=IssueType.PK_WHITESPACE,
@@ -343,7 +379,9 @@ class Validator:
 
         # Create primary key hashes
         self.__begin_step(step="Creating primary key hashes")
-        pk_hashes = create_pk_hashes(df[pk_cols])
+        pk_cols_df = self._resolve_df_cols(df, pk_cols)
+        pk_hashes = create_pk_hashes(df[pk_cols_df])
+
         self.__complete_step()
 
         # Primary Key Nulls
@@ -437,44 +475,51 @@ class Validator:
             self.__complete_step()
             return
 
-        columns = [col for col in self._dt_needs_infer if col in df.columns]
-        if not columns:
+        cols = [
+            (dict_col, df_col)
+            for dict_col in self._dt_needs_infer
+            if (df_col := self._resolve_df_col(df, dict_col)) is not None
+        ]
+        if not cols:
             self.__complete_step()
             return
 
-        for column in columns:
-            series = df[column].astype("string", copy=False).str.strip()
-            unique = series.dropna().unique()
+        from valediction.validation.helpers import _allowed_formats_for
+
+        for dict_col, df_col in cols:
+            unique = (
+                df[df_col].astype("string", copy=False).str.strip().dropna().unique()
+            )
             if len(unique) == 0:
                 continue
 
             try:
-                fmt_or_false = infer_datetime_format(Series(unique, dtype="string"))
+                fmt = infer_datetime_format(Series(unique, dtype="string"))
             except ValueError:
-                # ambiguous - try again in later chunk
                 continue
 
-            if fmt_or_false and fmt_or_false is not False:
-                col_dtype = self._find_data_type(column)
-                from valediction.validation.helpers import _allowed_formats_for
-
-                allowed = _allowed_formats_for(col_dtype)
-                if fmt_or_false in allowed:
-                    self._dt_format_cache[column] = fmt_or_false
-                    self._dt_needs_infer.discard(column)
-
-                    # Persist in the dictionary
-                    try:
-                        self.table_dictionary.get_column(
-                            column
-                        ).datetime_format = fmt_or_false
-                    except Exception:
-                        pass
+            if not fmt or fmt is False:
+                continue
+
+            col_dtype = self._find_data_type(dict_col)  # case-insensitive getter
+            if fmt not in _allowed_formats_for(col_dtype):
+                continue
+
+            self._dt_format_cache[dict_col] = fmt
+            self._dt_needs_infer.discard(dict_col)
+
+            try:
+                self.table_dictionary.get_column(dict_col).datetime_format = fmt
+            except Exception:
+                pass
+
         self.__complete_step()
 
     def _check_column_types(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking column types")
-        present = [col for col in df.columns if col in self._column_names]
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             dtype = self._find_data_type(col)
             if dtype == DataType.TEXT:
@@ -506,7 +551,9 @@ class Validator:
 
     def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking text lengths")
-        present = [col for col in df.columns if col in self._column_names]
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             if self._find_data_type(col) != DataType.TEXT:
                 continue
@@ -524,7 +571,9 @@ class Validator:
 
     def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
         self.__begin_step(step="Checking for forbidden characters")
-        present = [col for col in df.columns if col in self._column_names]
+        present = [
+            col for col in df.columns if _normalise(str(col)) in self._column_names
+        ]
         for col in present:
             if self._find_data_type(col) != DataType.TEXT:
                 continue
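All of the per-chunk checks above lean on `_resolve_df_col`, which maps a dictionary column name to whatever label the DataFrame actually carries. Reproduced as a standalone function with hypothetical data:

    import pandas as pd

    def _normalise(name: str) -> str:
        return name.strip().upper()

    def resolve_df_col(df: pd.DataFrame, name: str) -> str | None:
        # Return the df label matching `name` case-insensitively, else None
        target = _normalise(name)
        return next((c for c in df.columns if _normalise(str(c)) == target), None)

    df = pd.DataFrame(columns=["Patient_ID", "Sex"])
    assert resolve_df_col(df, "PATIENT_ID") == "Patient_ID"
    assert resolve_df_col(df, "weight") is None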