valediction-1.1.0-py3-none-any.whl → valediction-1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,14 +62,14 @@ def infer_datetime_format(
 
 
 def get_date_type(datetime_format: str) -> DataType | None:
-    """Identifies if a datetime format string corresponds to a Date or Datetime data
+    """Identifies if a datetime format string corresponds to a Date or Timestamp data
     type.
 
     Args:
        datetime_format (str): datetime format string
 
    Returns:
-        DataType | None: DataType of Date, Datetime, or None if not found.
+        DataType | None: DataType of Date, Timestamp, or None if not found.
    """
    config = get_config()
    return config.date_formats.get(datetime_format)
@@ -8,7 +8,7 @@ class DataType(Enum):
     INTEGER = "Integer"
     FLOAT = "Float"
     DATE = "Date"
-    DATETIME = "Datetime"
+    TIMESTAMP = "Timestamp"
     FILE = "File"
 
     def __str__(self) -> str:
@@ -32,9 +32,9 @@ class DataType(Enum):
             "number": cls.FLOAT,
             "numeric": cls.FLOAT,
             "date": cls.DATE,
-            "datetime": cls.DATETIME,
-            "datetime64": cls.DATETIME,
-            "timestamp": cls.DATETIME,
+            "datetime": cls.TIMESTAMP,
+            "datetime64": cls.TIMESTAMP,
+            "timestamp": cls.TIMESTAMP,
             "file": cls.FILE,
             "blob": cls.FILE,
             "binary": cls.FILE,
@@ -49,10 +49,10 @@ class DataType(Enum):
         return self in {DataType.TEXT}
 
     def valid_for_primary_key(self) -> bool:
-        """PKs can only be Text, Integer, Date, Datetime."""
+        """PKs can only be Text, Integer, Date, Timestamp."""
        return self in {
            DataType.TEXT,
            DataType.INTEGER,
            DataType.DATE,
-            DataType.DATETIME,
+            DataType.TIMESTAMP,
        }
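Note: taken together, the hunks above rename the enum member DATETIME to TIMESTAMP while still accepting "datetime", "datetime64" and "timestamp" as input spellings. A minimal sketch of the resulting enum; only the fragments shown in the hunks are confirmed, and the parse() classmethod name is hypothetical (the diff shows the alias map but not the method it lives in):

from enum import Enum

class DataType(Enum):
    TEXT = "Text"
    INTEGER = "Integer"
    FLOAT = "Float"
    DATE = "Date"
    TIMESTAMP = "Timestamp"  # was DATETIME = "Datetime" in 1.1.0
    FILE = "File"

    @classmethod
    def parse(cls, alias: str) -> "DataType":
        # hypothetical wrapper around the alias map shown above
        return {"date": cls.DATE, "datetime": cls.TIMESTAMP,
                "datetime64": cls.TIMESTAMP, "timestamp": cls.TIMESTAMP}[alias.lower()]

print(DataType.parse("datetime64"))  # DataType.TIMESTAMP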
@@ -4,6 +4,7 @@ import re
 import warnings
 
 import pandas as pd
+from pandas.api.types import is_object_dtype, is_string_dtype
 
 from valediction.data_types.data_type_helpers import infer_datetime_format
 from valediction.data_types.data_types import DataType
@@ -53,8 +54,8 @@ class ColumnState:
             return DataType.FLOAT, None
         if self.data_type == DataType.DATE:
             return DataType.DATE, None
-        if self.data_type == DataType.DATETIME:
-            return DataType.DATETIME, None
+        if self.data_type == DataType.TIMESTAMP:
+            return DataType.TIMESTAMP, None
 
         return DataType.TEXT, _len1()
 
@@ -123,7 +124,7 @@ class TypeInferer:
         _handling_function: callable = {
             DataType.TEXT: self._handle_state_text,
             DataType.DATE: self._handle_state_date,
-            DataType.DATETIME: self._handle_state_datetime,
+            DataType.TIMESTAMP: self._handle_state_datetime,
             DataType.INTEGER: self._handle_state_integer,
             DataType.FLOAT: self._handle_state_float,
         }.get(state.data_type, self._handle_state_text)
@@ -141,20 +142,31 @@ class TypeInferer:
         self, s: pd.Series
     ) -> tuple[pd.Series, pd.Series, pd.Series, int | None]:
         self.__begin_step(step="Trimming whitespace")
-        trimmed = s.str.strip()
+        is_text = is_string_dtype(s) or is_object_dtype(s)
+
+        if is_text:
+            trimmed = s.astype("string").str.strip()
+        else:
+            trimmed = s
         self.__complete_step()
 
         self.__begin_step(step="Checking nulls")
-        nulls = trimmed.isna() | trimmed.str.lower().isin(self.null_tokens)
+        if is_text:
+            nulls = trimmed.isna() | trimmed.str.lower().isin(self.null_tokens)
+        else:
+            nulls = trimmed.isna()
         self.__complete_step()
 
         self.__begin_step(step="Checking max length")
-        lengths = s.str.len()
-        max_len = int(lengths.max(skipna=True)) if lengths.notna().any() else None
+        if is_text:
+            lengths = trimmed.str.len()
+            max_len = int(lengths.max(skipna=True)) if lengths.notna().any() else None
+        else:
+            max_len = None
         self.__complete_step()
 
         self.__begin_step(step="Setting non-null mask")
-        nonnull_mask = (~nulls) & s.notna()
+        nonnull_mask = (~nulls) & trimmed.notna()
         self.__complete_step()
 
         return trimmed, nulls, nonnull_mask, max_len
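Note: the rewritten preprocessing guards every .str accessor behind a dtype check, so non-string columns (e.g. int64 or datetime64 read from a DataFrame) pass through untouched instead of raising. The guard pattern in isolation, as a standalone sketch rather than the package's actual method:

import pandas as pd
from pandas.api.types import is_object_dtype, is_string_dtype

def trim_if_text(s: pd.Series) -> pd.Series:
    # .str.strip() raises AttributeError on numeric Series; check dtype first
    if is_string_dtype(s) or is_object_dtype(s):
        return s.astype("string").str.strip()
    return s

print(trim_if_text(pd.Series(["  a ", None])).tolist())  # ['a', <NA>]
print(trim_if_text(pd.Series([1, 2])).tolist())          # [1, 2] -- untouched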
@@ -193,7 +205,7 @@ class TypeInferer:
         if ok.all():
             self._transition(
                 st,
-                DataType.DATETIME if has_time.any() else DataType.DATE,
+                DataType.TIMESTAMP if has_time.any() else DataType.DATE,
                 f"cached datetime format={st.cached_datetime_format!r}",
             )
         self.__complete_step()
@@ -210,7 +222,7 @@ class TypeInferer:
             st.cached_datetime_format = fmt
             self._transition(
                 st,
-                DataType.DATETIME if has_time.any() else DataType.DATE,
+                DataType.TIMESTAMP if has_time.any() else DataType.DATE,
                 f"explicit datetime format={fmt!r}",
             )
         self.__complete_step()
@@ -276,7 +288,7 @@ class TypeInferer:
             st.lock_text_permanent = True
             self._transition(st, DataType.TEXT, "datetime parse failures")
         elif has_time.any():
-            self._transition(st, DataType.DATETIME, "time component detected")
+            self._transition(st, DataType.TIMESTAMP, "time component detected")
 
         self.__complete_step()
 
@@ -334,7 +346,7 @@ class TypeInferer:
         if ok.all():
             self._transition(
                 st,
-                DataType.DATETIME if has_time.any() else DataType.DATE,
+                DataType.TIMESTAMP if has_time.any() else DataType.DATE,
                 f"cached datetime format={st.cached_datetime_format!r}",
             )
             return True
@@ -377,7 +389,7 @@ class TypeInferer:
         if ok.all():
             self._transition(
                 st,
-                DataType.DATETIME if has_time.any() else DataType.DATE,
+                DataType.TIMESTAMP if has_time.any() else DataType.DATE,
                 f"explicit datetime format={st.cached_datetime_format!r}",
             )
             return True
@@ -20,7 +20,8 @@ from valediction.io.csv_readers import (
 )
 from valediction.support import (
     _get_runtime_string,
-    _normalise_name,
+    _normalise,
+    _strip,
     list_as_bullets,
     print_bold_red,
     print_red,
@@ -437,16 +438,16 @@ class Dataset(list[DatasetItem]):
 
     # Getters
     def get(self, name: str, default: DatasetItem | None = None) -> DatasetItem | None:
-        name_key = name.strip()
+        name_key = _normalise(name)
         for item in self:
-            if item.name.lower() == name_key.lower():
+            if _normalise(item.name) == name_key:
                 return item
         return default
 
     def index_of(self, name: str) -> int | None:
-        name_key = name.strip()
+        name_key = _normalise(name)
         for i, item in enumerate(self):
-            if item.name == name_key:
+            if _normalise(item.name) == name_key:
                 return i
         return None
 
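Note: both getters now compare _normalise(...) keys, which also makes index_of case-insensitive (in 1.1.0 it compared raw names with == while get lowercased). A standalone sketch of the lookup, assuming _normalise folds case and trims whitespace; its body is not shown in this diff:

def _normalise(name: str) -> str:
    # assumed behaviour, mirroring the removed _normalise_name (upper + strip)
    return name.strip().upper()

names = ["Demographics", "Observations"]

def index_of(name: str) -> int | None:
    key = _normalise(name)
    return next((i for i, n in enumerate(names) if _normalise(n) == key), None)

print(index_of(" observations "))  # 1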
@@ -796,20 +797,21 @@ class Dataset(list[DatasetItem]):
         name: str | None,
         data: DataLike,
     ) -> DatasetItem:
-        """Normalise a (data, name) double into a DatasetItem."""
+        """Normalise a (name, data) double into a DatasetItem."""
        if isinstance(data, (str, Path)):
            path = Path(data)
            if not path.exists():
                raise FileNotFoundError(f"File not found: {path}")
            if path.suffix.lower() != ".csv":
                raise ValueError(f"Only .csv supported right now, got: {path}")
-            resolved_name = _normalise_name(name or path.stem)
+            resolved_name = _strip(name or path.stem)
            return DatasetItem(name=resolved_name, data=path.resolve())

        if isinstance(data, DataFrame):
            if not name:
                raise ValueError("When providing a DataFrame, 'name' is required.")
-            resolved_name = _normalise_name(name)
+            resolved_name = _strip(name)
+            data.columns = [_strip(column) for column in data.columns]
            return DatasetItem(name=resolved_name, data=data)

        raise TypeError("data must be a Path/str to .csv or a pandas DataFrame.")
@@ -823,13 +825,11 @@ class Dataset(list[DatasetItem]):
         if p.is_file():
             if p.suffix.lower() != ".csv":
                 raise ValueError(f"Expected a .csv file, got: {p.suffix} ({p})")
-            return [DatasetItem(name=_normalise_name(p.stem), data=p.resolve())]
+            return [DatasetItem(name=_strip(p.stem), data=p.resolve())]
 
         if p.is_dir():
             return [
-                DatasetItem(
-                    name=_normalise_name(csv_path.stem), data=csv_path.resolve()
-                )
+                DatasetItem(name=_strip(csv_path.stem), data=csv_path.resolve())
                 for csv_path in p.glob("*.csv")
             ]
 
@@ -103,7 +103,7 @@ def demo_dictionary() -> Dictionary:
             foreign_key="DEMOGRAPHICS.PATIENT_HASH",
         ),
         Column(
-            name="OBSERVATION_TIME", order=2, data_type="datetime", primary_key=2
+            name="OBSERVATION_TIME", order=2, data_type="timestamp", primary_key=2
         ),
         Column(
             name="OBSERVATION_TYPE",
@@ -24,7 +24,7 @@ from valediction.io.csv_readers import (
     read_csv_sample,
 )
 from valediction.progress import Progress
-from valediction.support import _normalise_name, calculate_runtime
+from valediction.support import _strip, calculate_runtime
 
 IMPORTING_DATA = "Importing data"
 CHUNK_STEPS = 1
@@ -124,7 +124,7 @@ class Generator:
         self.__say(f"Generating dictionary for {len(items)} tables")
         for item in items:
             self.__progress_init(item)
-            table = Table(name=_normalise_name(item.name))
+            table = Table(name=_strip(item.name))
             dictionary.add_table(table)
 
             if item.is_path:
@@ -192,7 +192,7 @@ class Generator:
             col_state = inferer.states[col_name]
             data_type, length = col_state.final_data_type_and_length()
             col = Column(
-                name=_normalise_name(col_name),
+                name=_strip(col_name),
                 order=idx,
                 data_type=data_type,
                 length=length if data_type == DataType.TEXT else None,
@@ -242,7 +242,7 @@ class Generator:
             col_state = inferer.states[col_name]
             data_type, length = col_state.final_data_type_and_length()
             col = Column(
-                name=_normalise_name(col_name),
+                name=_strip(col_name),
                 order=idx,
                 data_type=data_type,
                 length=length if data_type == DataType.TEXT else None,
@@ -257,7 +257,7 @@ class Generator:
         table.add_column(col)
 
     def _set_datetime_format(self, column_state: ColumnState, column: Column) -> None:
-        if column.data_type in (DataType.DATE, DataType.DATETIME):
+        if column.data_type in (DataType.DATE, DataType.TIMESTAMP):
             datetime_format = getattr(column_state, "cached_datetime_format", None)
             if datetime_format and hasattr(column, "datetime_format"):
                 column.datetime_format = datetime_format
@@ -277,7 +277,7 @@ class Generator:
         next_order = max((c.order or 0 for c in table), default=0) + 1
         data_type, length = col_state.final_data_type_and_length()
         new_col = Column(
-            name=_normalise_name(col_name),
+            name=_strip(col_name),
            order=next_order,
            data_type=data_type,
            length=length if data_type == DataType.TEXT else None,
@@ -26,9 +26,6 @@ def _check_name(name: str, entity: Literal["table", "column"]) -> list[str]:
         else config.max_column_name_length
     )
 
-    if name != name.upper():  # name must be uppercase
-        errors.append("must be uppercase")
-
     if invalid_chars.search(name):  # check invalid characters
         bad = set(invalid_chars.findall(name))
         errors.append(
@@ -109,16 +106,12 @@ def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str
     ):
         errors.append(
             f"invalid data type '{data_type.value}' for primary key column; "
-            "primary keys must be Text, Integer, Date, or Datetime"
+            "primary keys must be Text, Integer, Date, or Timestamp"
        )

    return errors


-def _normalise_name(name: str) -> str:
-    return name.upper().strip()
-
-
 def _norm_header_map(columns: list) -> dict:
     mapping, _ = {}, set()
     for c in columns:
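Note: 1.5.0 drops both the uppercase rule and _normalise_name (name.upper().strip()); names now keep their original casing, and case-insensitivity moves into the comparisons. Plausible shapes for the two replacement helpers in valediction.support; their definitions are not part of this diff, so both bodies are assumptions:

def _strip(name: str) -> str:
    # assumed: keep the user's casing, trim surrounding whitespace only
    return name.strip()

def _normalise(name: str) -> str:
    # assumed: produce a case-insensitive comparison key
    return name.strip().upper()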
@@ -11,7 +11,6 @@ from valediction.dictionary.helpers import (
     _get_required_header,
     _is_missing,
     _norm_header_map,
-    _normalise_name,
     _parse_int,
     _parse_truthy,
     _row_is_blank,
@@ -19,7 +18,7 @@ from valediction.dictionary.helpers import (
 from valediction.dictionary.integrity import REQUIRED_SHEETS
 from valediction.dictionary.model import Column, Dictionary, Table
 from valediction.exceptions import DataDictionaryError, DataDictionaryImportError
-from valediction.support import list_as_bullets
+from valediction.support import _normalise, _strip, list_as_bullets
 
 
 @dataclass
@@ -80,6 +79,13 @@ class ExcelDataDictionary:
         raise error
 
     # Import & Helpers
+    def _resolve_table_name(self, name: str) -> str | None:
+        """Return the canonical table name as it appears in Tables sheet (or None)."""
+        target = _normalise(name)
+        return next(
+            (t for t in self.table_metadata.keys() if _normalise(t) == target), None
+        )
+
    def _open_workbook(self) -> None:
        if not self.path.exists():
            raise DataDictionaryImportError(f"File not found: {self.path}")
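Note: _resolve_table_name is the piece that lets Columns and Enumerations rows reference a table in any casing while the spelling from the Tables sheet stays canonical. The same next(...) idiom in isolation, with a stand-in for _normalise:

table_metadata = {"Demographics": None, "Observations": None}

def resolve(name: str) -> str | None:
    target = name.strip().upper()  # stand-in for _normalise(name)
    return next((t for t in table_metadata if t.strip().upper() == target), None)

print(resolve("DEMOGRAPHICS"))  # Demographics  (canonical casing preserved)
print(resolve("missing"))       # None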
@@ -140,20 +146,27 @@
         description_col_header = _get_required_header(header_map, "description")
 
         meta: dict[str, str | None] = {}
+        seen: set[str] = set()
+
        for _, row in tables_df.iterrows():
            if _is_missing(row[table_col_header]):
                continue
-            table_name = _normalise_name(str(row[table_col_header]))
+
+            table_name = _strip(str(row[table_col_header]))
            table_description = (
                None
                if _is_missing(row[description_col_header])
                else str(row[description_col_header])
            )
-            if table_name in meta:
+
+            key = _normalise(table_name)
+            if key in seen:
                raise DataDictionaryImportError(
                    f"Duplicate table '{table_name}' in Tables sheet."
                )
+            seen.add(key)
            meta[table_name] = table_description
+
        if not meta:
            raise DataDictionaryImportError(
                "Data Dictionary sheet 'Tables' contains no table rows."
@@ -177,12 +190,13 @@
                 or _is_missing(row[code_col_header])
             ):
                 continue
-            table_name = _normalise_name(str(row[table_col_header]))
-            column_name = _normalise_name(str(row[column_col_header]))
-            enum_map.setdefault((table_name, column_name), {})
-            enum_map[(table_name, column_name)][row[code_col_header]] = row[
-                name_col_header
-            ]
+            table_name = _strip(str(row[table_col_header]))
+            column_name = _strip(str(row[column_col_header]))
+            resolved_table = self._resolve_table_name(table_name) or table_name
+            enum_key = (_normalise(resolved_table), _normalise(column_name))
+            enum_map.setdefault(enum_key, {})
+            enum_map[enum_key][row[code_col_header]] = row[name_col_header]
+
        self.enumerations = enum_map

        # Parse Columns
@@ -234,7 +248,12 @@
 
                 self.table_columns[inputs.table_name].append(column_obj)
                 if inputs.has_enumerations:
-                    self.enum_flags.add((inputs.table_name, inputs.column_name))
+                    self.enum_flags.add(
+                        (
+                            _normalise(inputs.table_name),
+                            _normalise(inputs.column_name),
+                        )
+                    )
 
        if errors:
            raise DataDictionaryImportError(
@@ -279,7 +298,7 @@
 
     # Validate Foreign Keys
     def _validate_foreign_keys(self) -> None:
-        name_to_table = {t.name: t for t in self.tables}
+        name_to_table = {_normalise(t.name): t for t in self.tables}
        errors: list[str] = []
        for table in self.tables:
            for column in table:
@@ -292,9 +311,9 @@
                 )
                 continue
             target_table_raw, target_column_raw = target.split(".", 1)
-            target_table_name = _normalise_name(target_table_raw)
-            target_column_name = _normalise_name(target_column_raw)
-            referenced_table = name_to_table.get(target_table_name)
+            target_table_name = _strip(target_table_raw)
+            target_column_name = _strip(target_column_raw)
+            referenced_table = name_to_table.get(_normalise(target_table_name))
            if not referenced_table:
                errors.append(
                    f"{table.name}.{column.name} references unknown table {target_table_name!r}."
@@ -345,7 +364,7 @@
         enumeration_flag_col_header = header_map.get("enumerations")
         primary_key_col_header = header_map.get("primary_key")
         foreign_key_col_header = header_map.get("foreign_key_target")
-        description_col_header = header_map.get("description")
+        description_col_header = header_map.get("column_description")
        return (
            table_col_header,
            column_col_header,
@@ -392,13 +411,17 @@
                 f"{row_context}: missing required field(s): {', '.join(missing_fields)}."
             )
 
-        table_name = _normalise_name(str(row[table_col_header]))
-        column_name = _normalise_name(str(row[column_col_header]))
-        if table_name not in self.table_metadata:
+        table_name_raw = _strip(str(row[table_col_header]))
+        column_name = _strip(str(row[column_col_header]))
+
+        resolved_table_name = self._resolve_table_name(table_name_raw)
+        if resolved_table_name is None:
            raise DataDictionaryImportError(
-                f"{row_context}: Table '{table_name}' not present in Tables sheet."
+                f"{row_context}: Table '{table_name_raw}' not present in Tables sheet."
            )

+        table_name = resolved_table_name
+
        order_int = _parse_int(row[order_col_header], "Order", row_context)
        length_int = (
            _parse_int(row[length_col_header], "Length", row_context, required=False)
@@ -461,7 +484,7 @@
 
     def _make_column(self, inputs: _ColumnInputs) -> Column:
         enums_for_column = self.enumerations.get(
-            (inputs.table_name, inputs.column_name), {}
+            (_normalise(inputs.table_name), _normalise(inputs.column_name)), {}
        )
        return Column(
            name=inputs.column_name,