valediction 1.0.3__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
valediction/integrity.py CHANGED
@@ -1,6 +1,10 @@
+ from __future__ import annotations
+
  import re
+ from copy import deepcopy
  from pathlib import Path
  from re import Pattern
+ from typing import Any

  from valediction.data_types.data_types import DataType
  from valediction.support import list_as_bullets
@@ -12,13 +16,58 @@ TEMPLATE_DATA_DICTIONARY_PATH = (
  )


+ externally_injected_variables: dict[
+     str, Any
+ ] = {}  # External injection store for package wrapping (any keys, always included)
+
+
+ def reset_injected_config_variables() -> None:
+     global externally_injected_variables
+     externally_injected_variables = {}
+
+
+ def inject_config_variables(variables: dict[str, Any]) -> None:
+     """Injects variables into the Valediction Config, which will always be incorporated
+     as overrides, regardless of Config calling method (default, session-scoped, or
+     contextual).
+
+     Args:
+         variables (dict[str, Any]): Dictionary of config variables.
+     """
+     global externally_injected_variables, session_config
+
+     # check type allows
+     if not isinstance(variables, dict):
+         raise TypeError(
+             f"Config injection variables must be a dictionary, not {type(variables)}"
+         )
+     problematic_keys = []
+     for variable_name in variables.keys():
+         if not isinstance(variable_name, str):
+             problematic_keys.append(variable_name)
+
+     if problematic_keys:
+         raise TypeError("Config injection variables accepts only string keys.")
+
+     externally_injected_variables = dict(variables or {})
+
+     # Apply immediately to the current session config (if it exists)
+     if session_config is not None:
+         _apply_external_injections(session_config)
+
+
+ def _apply_external_injections(config: Config) -> None:
+     for variable_name, variable_value in externally_injected_variables.items():
+         setattr(config, variable_name, deepcopy(variable_value))
+
+
  class Config:
      def __init__(self):
          self.template_data_dictionary_path: Path = TEMPLATE_DATA_DICTIONARY_PATH
          self.max_table_name_length: int = 63
          self.max_column_name_length: int = 30
          self.max_primary_keys: int = 7
-         self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Z0-9_]")
+         self.invalid_name_pattern: str | Pattern = re.compile(r"[^A-Za-z0-9_]")
          self.null_values: list[str] = ["", "null", "none"]
          self.forbidden_characters: list[str] = []
          self.date_formats: dict[str, DataType] = {
@@ -42,6 +91,7 @@ class Config:
          }
          self.enforce_no_null_columns: bool = True
          self.enforce_primary_keys: bool = True
+         _apply_external_injections(self)

      def __repr__(self):
          date_list = list_as_bullets(
@@ -65,33 +115,37 @@ class Config:

      # Context Wrapper With Reset
      def __enter__(self):
-         global default_config
-         default_config = self
+         global session_config
+
+         _apply_external_injections(self)
+
+         session_config = self
          return self

      def __exit__(self, exc_type, exc_value, traceback):
-         global default_config
-         default_config = Config()
+         global session_config
+         session_config = Config()


- default_config: Config = None
+ session_config: Config = None


  def get_config() -> Config:
-     """Gets the current `default_config` instance. Changing attributes will set them
-     globally.
+     """Gets the current `session_config` instance. Changing attributes will set them
+     globally for the python session. Use `reset_default_config()` to reset to original
+     defaults.

      Returns:
-         Config: The current default configuration.
+         Config: The current session configuration.
      """
-     global default_config
-     return default_config
+     global session_config
+     return session_config


  def reset_default_config() -> None:
      """Resets `default_config` settings globally to original defaults."""
-     global default_config
-     default_config = Config()
+     global session_config
+     session_config = Config()


  reset_default_config()
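
Taken together, the integrity.py changes add an injection store that outlives individual Config instances: overrides are re-applied to the session config, to every fresh Config(), and to contextual configs on __enter__. A minimal usage sketch (all names as introduced above; behaviour inferred from the diff, not from package docs):

    from valediction.integrity import (
        Config,
        get_config,
        inject_config_variables,
        reset_injected_config_variables,
    )

    # Injected overrides are stored module-wide and applied immediately
    # to the current session config.
    inject_config_variables({"max_primary_keys": 3})
    assert get_config().max_primary_keys == 3

    with Config() as config:  # context-scoped configs receive the same overrides
        assert config.max_primary_keys == 3

    reset_injected_config_variables()  # new Config instances revert to defaults
    assert Config().max_primary_keys == 7
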
valediction/io/csv_readers.py CHANGED
@@ -11,7 +11,7 @@ import pandas as pd
  from pandas import DataFrame
  from pandas.errors import ParserError

- from valediction.support import _normalise_name
+ from valediction.support import _strip


  class FrameChunk(NamedTuple):
@@ -34,7 +34,7 @@ class FrameChunk(NamedTuple):
      total_chunks_seen: int | None

      def estimate_chunk_count(self) -> int:
-         # Buffers (accounting for CSV tails/bytes innacuracy)
+         # Buffers (accounting for CSV tails/bytes inaccuracy)
          EPS_ABS = 4096  # Fixed
          EPS_REL = 0.05  # 5% tail buffer

@@ -93,7 +93,7 @@ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
      """Apply header normalisation and vectorised value stripping after reading."""
      cfg = cfg or CsvReadConfig()
      if cfg.normalise_headers:
-         df = df.rename(columns={c: _normalise_name(c) for c in df.columns})
+         df = df.rename(columns={c: _strip(c) for c in df.columns})
      if cfg.strip_values:
          str_cols = df.select_dtypes(include=["string"]).columns
          if len(str_cols) > 0:
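
The practical effect of swapping `_normalise_name` for `_strip` in `_post_read_processing`: header "normalisation" now only trims surrounding whitespace and leaves the original casing intact. A small sketch with hypothetical column names:

    import pandas as pd

    df = pd.DataFrame({" Patient_Id ": [1], "visit_date": [2]})

    # 1.0.3 behaviour: strip + upper-case (" Patient_Id " -> "PATIENT_ID")
    # 1.2.0 behaviour: strip only, casing preserved
    df = df.rename(columns={c: c.strip() for c in df.columns})
    assert list(df.columns) == ["Patient_Id", "visit_date"]
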
valediction/support.py CHANGED
@@ -35,10 +35,14 @@ def list_as_bullets(elements: list, bullet: str = "\n - ") -> str:
      return bullet + bullet.join(elements)


- def _normalise_name(name: str) -> str:
+ def _normalise(name: str) -> str:
      return name.strip().upper()


+ def _strip(name: str) -> str:
+     return name.strip()
+
+
  def _get_runtime_string(runtime: timedelta) -> str:
      total_seconds = runtime.total_seconds()
      hours = trunc(total_seconds / 3600)
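
support.py now distinguishes two helpers: `_normalise` produces case-insensitive matching keys, while the new `_strip` cleans a label without destroying its casing. Both bodies are shown in full above, so their contrast is exact:

    def _normalise(name: str) -> str:
        return name.strip().upper()

    def _strip(name: str) -> str:
        return name.strip()

    assert _normalise("  visit_date ") == "VISIT_DATE"  # matching key
    assert _strip("  visit_date ") == "visit_date"      # display label
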
valediction/validation/helpers.py CHANGED
@@ -10,6 +10,7 @@ from pandas.util import hash_pandas_object
  from valediction.data_types.data_types import DataType
  from valediction.dictionary.model import Table
  from valediction.integrity import get_config
+ from valediction.support import _normalise
  from valediction.validation.issues import Range


@@ -17,11 +18,14 @@ from valediction.validation.issues import Range
  def _set_nulls(df: DataFrame) -> DataFrame:
      null_values = get_config().null_values
      token_set = {str(t).strip().casefold() for t in null_values}
-     columns = df.select_dtypes(include=["string", "object"]).columns
+     columns = df.select_dtypes(include=["string", "object", "category"]).columns
      for column in columns:
          series = df[column]
-         mask = series.notna() & series.str.casefold().isin(token_set)
-         df[column] = series.mask(mask, NA)
+
+         s_txt = series.astype("string", copy=False)  # dtype safe
+         mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
+         if mask.any():
+             df[column] = series.mask(mask, NA)

      return df

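
A standalone sketch of the reworked null masking with hypothetical values (`null_values` defaults to `["", "null", "none"]` per the Config above). The `astype("string")` view makes the `.str` accessor safe for object and category columns alike, and token matching is now whitespace- and case-insensitive:

    import pandas as pd
    from pandas import NA

    token_set = {"", "null", "none"}
    series = pd.Series(["A", " NULL ", "None", "B"])

    s_txt = series.astype("string", copy=False)
    mask = s_txt.notna() & s_txt.str.strip().str.casefold().isin(token_set)
    series = series.mask(mask, NA)  # -> ["A", <NA>, <NA>, "B"]
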
@@ -68,37 +72,24 @@ def create_pk_hashes(
      Returns:
          Series: Pandas Series with hashes or Nulls.
      """
-     hash_col_name = "PK_HASH"
+     HASH_COL_NAME = "PK_HASH"
      if df_primaries.empty or df_primaries.shape[1] == 0:
-         return Series([], dtype=object, name=hash_col_name)
+         return Series([], dtype=object, name=HASH_COL_NAME)

-     # Any NA in row => invalid PK -> None
+     # Check Nulls
      null_rows = df_primaries.isna().any(axis=1)

-     # First Hash
-     hash_1 = hash_pandas_object(df_primaries, index=False)  # uint64
-
-     # Second Hash (rows backwards if single row, else salt)
-     if df_primaries.shape[1] > 1:
-         df_primaries_backwards = df_primaries.iloc[:, ::-1]
-     else:
-         s = df_primaries.iloc[:, 0]
-         salt = Series(["§"] * len(s), index=s.index, dtype="string")
-         df_primaries_backwards = DataFrame(
-             {
-                 "_a": s,
-                 "_b": s.str.cat(salt),
-             }
-         )
-
-     hash_2 = hash_pandas_object(df_primaries_backwards, index=False)  # uint64
+     # Two independent 64-bit hashes with 16 byte keys
+     hash_1 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk1!")
+     hash_2 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk2!")

+     # Combine into 128-bit integer keys
      a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
      a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
-
      combined = (a1 << 64) | a2
+
      hashes = Series(
-         combined, index=df_primaries.index, name=hash_col_name, dtype=object
+         combined, index=df_primaries.index, name=HASH_COL_NAME, dtype=object
      )
      hashes[null_rows] = None
      return hashes
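
The previous column-reversal/salting scheme is replaced by two keyed runs of the same hash: `hash_pandas_object` accepts a 16-character `hash_key`, so the two keys above yield independent 64-bit digests. A standalone sketch of the packing with a hypothetical primary-key frame:

    from pandas import DataFrame
    from pandas.util import hash_pandas_object

    df_primaries = DataFrame({"id": ["a", "b"], "visit": [1, 2]})

    hash_1 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk1!")
    hash_2 = hash_pandas_object(df_primaries, index=False, hash_key="valediction_pk2!")

    # Widen to Python ints (object dtype) so the 64-bit shift cannot overflow,
    # then pack both digests into one 128-bit integer per row.
    a1 = hash_1.to_numpy(dtype="uint64", copy=False).astype(object)
    a2 = hash_2.to_numpy(dtype="uint64", copy=False).astype(object)
    combined = (a1 << 64) | a2
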
@@ -167,8 +158,9 @@ def pk_contains_whitespace_mask(df_primaries: DataFrame) -> Series:
      if df_primaries.empty or df_primaries.shape[1] == 0:
          return Series(False, index=df_primaries.index)

-     col_masks = df_primaries.apply(lambda s: s.str.contains(r"\s", na=False))
-
+     col_masks = df_primaries.apply(
+         lambda s: s.astype("string", copy=False).str.contains(r"\s", na=False)
+     )
      return col_masks.any(axis=1)


@@ -261,7 +253,9 @@ def invalid_mask_text_too_long(column: Series, max_len: int) -> Series:
          return Series(False, index=column.index)

      notnull = column.notna()
-     lens = column.str.len()
+     s_txt = column.astype("string", copy=False)
+     lens = s_txt.str.len()
+
      return notnull & (lens > max_len)


@@ -270,20 +264,23 @@ def invalid_mask_text_forbidden_characters(column: Series) -> Series:
      if not forbidden:
          return column.notna() & False

-     pattern = "[" + re.escape("".join(forbidden)) + "]"
+     pattern = "[" + re.escape("".join([str(s) for s in forbidden])) + "]"
      notnull = column.notna()
-     has_forbidden = column.str.contains(pattern, regex=True, na=False)
+
+     s_txt = column.astype("string", copy=False)
+     has_forbidden = s_txt.str.contains(pattern, regex=True, na=False)
+
      return notnull & has_forbidden


  # Apply Data Types #
  def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
      # name -> column object
-     column_dictionary = {column.name: column for column in table_dictionary}
+     column_dictionary = {_normalise(column.name): column for column in table_dictionary}
      for col in df.columns:
-         data_type = column_dictionary.get(col).data_type
-         datetime_format = column_dictionary.get(col).datetime_format
+         data_type = column_dictionary.get(_normalise(col)).data_type
+         datetime_format = column_dictionary.get(_normalise(col)).datetime_format

          if data_type in (DataType.TEXT, DataType.FILE):
              df[col] = df[col].astype("string")
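
A sketch of the hardened forbidden-character mask with hypothetical config entries: the `str(s)` coercion tolerates non-string entries in `forbidden_characters`, and the `astype("string")` view keeps `.str.contains` working on object columns:

    import re
    import pandas as pd

    forbidden = ["|", ";"]  # hypothetical forbidden_characters
    pattern = "[" + re.escape("".join([str(s) for s in forbidden])) + "]"

    column = pd.Series(["ok", "a|b", None])
    s_txt = column.astype("string", copy=False)
    mask = column.notna() & s_txt.str.contains(pattern, regex=True, na=False)
    # mask -> [False, True, False]
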
valediction/validation/issues.py CHANGED
@@ -8,7 +8,7 @@ from pandas import DataFrame, concat

  from valediction.datasets.datasets_helpers import DatasetItemLike
  from valediction.io.csv_readers import CsvReadConfig, read_csv_ranges
- from valediction.support import _normalise_name, list_as_bullets
+ from valediction.support import _strip, list_as_bullets


  class IssueType(Enum):
@@ -107,6 +107,7 @@ class Issue:
              merged.append(cur)
          self.ranges = merged

+     # Inspect
      def inspect(
          self,
          additional_columns: bool | str | list[str] | None = None,
@@ -132,9 +133,9 @@ class Issue:
              ValueError: if the issue has no parent DatasetItem
          """
          # Guard
-         if not self.parent:
-             raise ValueError("Issue has no parent DatasetItem")
+         self.__guard_parent()
          header = self.__repr__() if print_header else ""
+
          # Not applicable
          if self.type in APPLIES_WHOLE_COLUMN:
              print(f"{header}: applies to whole column")
@@ -143,22 +144,8 @@ class Issue:
          # Column Inclusion
          if print_header:
              print(f"{header}:")
-         if additional_columns is True:
-             columns = None
-         else:
-             additional_columns = (
-                 [additional_columns]
-                 if isinstance(additional_columns, str)
-                 else additional_columns
-             )
-             base = (
-                 set(self.parent.primary_keys)
-                 if self.type in PRIMARY_KEY_ISSUES
-                 else {self.column}
-             )
-             base |= set(additional_columns or [])
-             base.discard(None)
-             columns = list(base) if base else None
+
+         columns = self.__select_columns(additional_columns)

          if not self.ranges:
              return DataFrame(columns=columns) if columns else DataFrame()
@@ -194,6 +181,31 @@ class Issue:

          return out if columns is None else out.loc[:, columns]

+     # Inspect Helpers
+     def __guard_parent(self):
+         if not self.parent:
+             raise ValueError("Issue has no parent DatasetItem")
+
+     def __select_columns(self, additional_columns: bool | str | list[str]) -> list:
+         if additional_columns is True:
+             columns = None
+         else:
+             additional_columns = (
+                 [additional_columns]
+                 if isinstance(additional_columns, str)
+                 else additional_columns
+             )
+             base = (
+                 set(self.parent.primary_keys)
+                 if self.type in PRIMARY_KEY_ISSUES
+                 else {self.column}
+             )
+             base |= set(additional_columns or [])
+             base.discard(None)
+             columns = list(base) if base else None
+
+         return columns
+

  @dataclass
  class Issues:
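
The extraction into `__guard_parent` and `__select_columns` is behaviour-preserving. For reference, a standalone mirror of the selection rules (hypothetical inputs; not the actual method):

    def select_columns(issue_column, primary_keys, is_pk_issue, additional_columns):
        # Mirror of Issue.__select_columns: True keeps every column (None),
        # a str is wrapped into a list, and the base set is either the
        # primary keys (for PK issues) or the issue's own column.
        if additional_columns is True:
            return None
        if isinstance(additional_columns, str):
            additional_columns = [additional_columns]
        base = set(primary_keys) if is_pk_issue else {issue_column}
        base |= set(additional_columns or [])
        base.discard(None)
        return list(base) if base else None

    assert select_columns("AGE", ["ID"], False, "SEX") in (["AGE", "SEX"], ["SEX", "AGE"])
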
@@ -235,8 +247,8 @@ class Issues:
          parent: DatasetItemLike | None = None,
      ) -> Issue:
          key = (
-             _normalise_name(table),
-             _normalise_name(column) if column is not None else None,
+             _strip(table),
+             _strip(column) if column is not None else None,
              issue_type,
          )
          issue = self._index.get(key)
@@ -255,8 +267,8 @@ class Issues:
          issue_type: IssueType | None = None,
      ) -> list[Issue]:
          """Case-insensitive filter; any arg can be None to act as a wildcard."""
-         table = _normalise_name(table)
-         column = _normalise_name(column) if column is not None else None
+         table = _strip(table)
+         column = _strip(column) if column is not None else None
          output: list[Issue] = []
          if issue_type is not None:
              # direct index lookup where possible
@@ -268,9 +280,9 @@ class Issues:

          # otherwise scan (still cheap; we maintain a compact list)
          for item in self._items:
-             if _normalise_name(item.table) != table:
+             if _strip(item.table) != table:
                  continue
-             if column is not None and (_normalise_name(item.column) or "") != column:
+             if column is not None and (_strip(item.column) or "") != column:
                  continue
              output.append(item)
          return output
valediction/validation/validator.py CHANGED
@@ -20,7 +20,7 @@ from valediction.io.csv_readers import (
      iter_csv_chunks,
  )
  from valediction.progress import Progress
- from valediction.support import _get_runtime_string, calculate_runtime
+ from valediction.support import _get_runtime_string, _normalise, calculate_runtime
  from valediction.validation.helpers import (
      _column_has_values,
      _set_nulls,
@@ -62,7 +62,7 @@ class Validator:
          dataset_item: DatasetItemLike,
          table_dictionary: Table,
          feedback: bool = True,
-         chunk_size: int = 10_000_000,
+         chunk_size: int | None = 10_000_000,
          _padding: int = 0,
      ):
          # User Variables
@@ -86,7 +86,9 @@ class Validator:
          self._dt_needs_infer: set[str] = set()

          # Helpers
-         self._column_names: set = set(self.table_dictionary.get_column_names())
+         self._column_names: set[str] = {
+             _normalise(n) for n in self.table_dictionary.get_column_names()
+         }

          # Progress Tracking
          self.progress: Progress | None = None
@@ -155,6 +157,20 @@ class Validator:
          if not datetime_format:
              self._dt_needs_infer.add(name)

+     # Column Scanning
+     def _resolve_df_col(self, df: DataFrame, name: str) -> str | None:
+         """Return the actual df column label matching name case-insensitively."""
+         target = _normalise(name)
+         return next((c for c in df.columns if _normalise(str(c)) == target), None)
+
+     def _resolve_df_cols(self, df: DataFrame, names: list[str]) -> list[str]:
+         resolved: list[str] = []
+         for n in names:
+             c = self._resolve_df_col(df, n)
+             if c is not None:
+                 resolved.append(c)
+         return resolved
+
      # Validate
      def validate(self):
          """
@@ -272,28 +288,45 @@ class Validator:
      # Validation: Start Helpers
      def _check_for_missing_columns(self, df: DataFrame):
          self.__begin_step(step="Checking for missing columns")
-         missing = self._column_names - set(df.columns)
-         if missing:
-             for column in missing:
-                 self.issues.add(
-                     issue_type=IssueType.MISSING_COLUMN,
-                     table=self.table_name,
-                     column=column,
-                     parent=self.dataset_item,
-                 )
+
+         dict_names = self.table_dictionary.get_column_names()
+         dict_keys = {_normalise(name) for name in dict_names}
+
+         df_keys = {_normalise(str(column)) for column in df.columns}
+
+         missing_keys = dict_keys - df_keys
+         if missing_keys:
+             for name in dict_names:
+                 if _normalise(name) in missing_keys:
+                     self.issues.add(
+                         issue_type=IssueType.MISSING_COLUMN,
+                         table=self.table_name,
+                         column=name,
+                         parent=self.dataset_item,
+                     )
+
          self.__complete_step()

      def _check_for_extra_columns(self, df: DataFrame):
          self.__begin_step(step="Checking for extra columns")
-         extra = set(df.columns) - self._column_names
-         if extra:
-             for column in extra:
-                 self.issues.add(
-                     issue_type=IssueType.EXTRA_COLUMN,
-                     table=self.table_name,
-                     column=column,
-                     parent=self.dataset_item,
-                 )
+
+         dict_keys = {
+             _normalise(name) for name in self.table_dictionary.get_column_names()
+         }
+         df_cols = [str(column) for column in df.columns]
+         df_keys = {_normalise(column) for column in df_cols}
+
+         extra_keys = df_keys - dict_keys
+         if extra_keys:
+             for col in df_cols:
+                 if _normalise(col) in extra_keys:
+                     self.issues.add(
+                         issue_type=IssueType.EXTRA_COLUMN,
+                         table=self.table_name,
+                         column=col,  # report the actual df label
+                         parent=self.dataset_item,
+                     )
+
          self.__complete_step()

      # Validation: Chunk Helpers
@@ -319,13 +352,16 @@ class Validator:

          # Check for whitespace (text cols only)
          self.__begin_step(step="Checking for primary key whitespace")
-         pk_cols_text = []
-         for column in self.table_dictionary:
-             if column.name in pk_cols and column.data_type in [DataType.TEXT]:
-                 pk_cols_text.append(column.name)
+         pk_keys = {_normalise(p) for p in pk_cols}
+         pk_cols_text = [
+             column.name
+             for column in self.table_dictionary
+             if _normalise(column.name) in pk_keys and column.data_type is DataType.TEXT
+         ]

          if pk_cols_text:
-             space_mask = pk_contains_whitespace_mask(df[pk_cols_text])
+             pk_cols_text_df = self._resolve_df_cols(df, pk_cols_text)
+             space_mask = pk_contains_whitespace_mask(df[pk_cols_text_df])
              if space_mask.any():
                  self.issues.add(
                      issue_type=IssueType.PK_WHITESPACE,
@@ -343,7 +379,9 @@ class Validator:

          # Create primary key hashes
          self.__begin_step(step="Creating primary key hashes")
-         pk_hashes = create_pk_hashes(df[pk_cols])
+         pk_cols_df = self._resolve_df_cols(df, pk_cols)
+         pk_hashes = create_pk_hashes(df[pk_cols_df])
+
          self.__complete_step()

          # Primary Key Nulls
@@ -437,44 +475,51 @@ class Validator:
              self.__complete_step()
              return

-         columns = [col for col in self._dt_needs_infer if col in df.columns]
-         if not columns:
+         cols = [
+             (dict_col, df_col)
+             for dict_col in self._dt_needs_infer
+             if (df_col := self._resolve_df_col(df, dict_col)) is not None
+         ]
+         if not cols:
              self.__complete_step()
              return

-         for column in columns:
-             series = df[column].astype("string", copy=False).str.strip()
-             unique = series.dropna().unique()
+         from valediction.validation.helpers import _allowed_formats_for
+
+         for dict_col, df_col in cols:
+             unique = (
+                 df[df_col].astype("string", copy=False).str.strip().dropna().unique()
+             )
              if len(unique) == 0:
                  continue

              try:
-                 fmt_or_false = infer_datetime_format(Series(unique, dtype="string"))
+                 fmt = infer_datetime_format(Series(unique, dtype="string"))
              except ValueError:
-                 # ambiguous - try again in later chunk
                  continue

-             if fmt_or_false and fmt_or_false is not False:
-                 col_dtype = self._find_data_type(column)
-                 from valediction.validation.helpers import _allowed_formats_for
-
-                 allowed = _allowed_formats_for(col_dtype)
-                 if fmt_or_false in allowed:
-                     self._dt_format_cache[column] = fmt_or_false
-                     self._dt_needs_infer.discard(column)
-
-                     # Persist in the dictionary
-                     try:
-                         self.table_dictionary.get_column(
-                             column
-                         ).datetime_format = fmt_or_false
-                     except Exception:
-                         pass
+             if not fmt or fmt is False:
+                 continue
+
+             col_dtype = self._find_data_type(dict_col)  # case-insensitive getter
+             if fmt not in _allowed_formats_for(col_dtype):
+                 continue
+
+             self._dt_format_cache[dict_col] = fmt
+             self._dt_needs_infer.discard(dict_col)
+
+             try:
+                 self.table_dictionary.get_column(dict_col).datetime_format = fmt
+             except Exception:
+                 pass
+
          self.__complete_step()

      def _check_column_types(self, df: DataFrame, start_row: int) -> None:
          self.__begin_step(step="Checking column types")
-         present = [col for col in df.columns if col in self._column_names]
+         present = [
+             col for col in df.columns if _normalise(str(col)) in self._column_names
+         ]
          for col in present:
              dtype = self._find_data_type(col)
              if dtype == DataType.TEXT:
@@ -506,7 +551,9 @@ class Validator:

      def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
          self.__begin_step(step="Checking text lengths")
-         present = [col for col in df.columns if col in self._column_names]
+         present = [
+             col for col in df.columns if _normalise(str(col)) in self._column_names
+         ]
          for col in present:
              if self._find_data_type(col) != DataType.TEXT:
                  continue
@@ -524,7 +571,9 @@ class Validator:

      def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
          self.__begin_step(step="Checking for forbidden characters")
-         present = [col for col in df.columns if col in self._column_names]
+         present = [
+             col for col in df.columns if _normalise(str(col)) in self._column_names
+         ]
          for col in present:
              if self._find_data_type(col) != DataType.TEXT:
                  continue
valediction-1.0.3.dist-info/METADATA → valediction-1.2.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: valediction
- Version: 1.0.3
+ Version: 1.2.0
  Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
  Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
  Requires-Python: <4.0,>=3.11