valediction 1.2.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -62,14 +62,14 @@ def infer_datetime_format(
62
62
 
63
63
 
64
64
  def get_date_type(datetime_format: str) -> DataType | None:
65
- """Identifies if a datetime format string corresponds to a Date or Datetime data
65
+ """Identifies if a datetime format string corresponds to a Date or Timestamp data
66
66
  type.
67
67
 
68
68
  Args:
69
69
  datetime_format (str): datetime format string
70
70
 
71
71
  Returns:
72
- DataType | None: DataType of Date, Datetime, or None if not found.
72
+ DataType | None: DataType of Date, Timestamp, or None if not found.
73
73
  """
74
74
  config = get_config()
75
75
  return config.date_formats.get(datetime_format)
@@ -8,7 +8,7 @@ class DataType(Enum):
8
8
  INTEGER = "Integer"
9
9
  FLOAT = "Float"
10
10
  DATE = "Date"
11
- DATETIME = "Datetime"
11
+ TIMESTAMP = "Timestamp"
12
12
  FILE = "File"
13
13
 
14
14
  def __str__(self) -> str:
@@ -32,9 +32,9 @@ class DataType(Enum):
32
32
  "number": cls.FLOAT,
33
33
  "numeric": cls.FLOAT,
34
34
  "date": cls.DATE,
35
- "datetime": cls.DATETIME,
36
- "datetime64": cls.DATETIME,
37
- "timestamp": cls.DATETIME,
35
+ "datetime": cls.TIMESTAMP,
36
+ "datetime64": cls.TIMESTAMP,
37
+ "timestamp": cls.TIMESTAMP,
38
38
  "file": cls.FILE,
39
39
  "blob": cls.FILE,
40
40
  "binary": cls.FILE,
@@ -49,10 +49,10 @@ class DataType(Enum):
49
49
  return self in {DataType.TEXT}
50
50
 
51
51
  def valid_for_primary_key(self) -> bool:
52
- """PKs can only be Text, Integer, Date, Datetime."""
52
+ """PKs can only be Text, Integer, Date, Timestamp."""
53
53
  return self in {
54
54
  DataType.TEXT,
55
55
  DataType.INTEGER,
56
56
  DataType.DATE,
57
- DataType.DATETIME,
57
+ DataType.TIMESTAMP,
58
58
  }
@@ -4,6 +4,7 @@ import re
4
4
  import warnings
5
5
 
6
6
  import pandas as pd
7
+ from pandas.api.types import is_object_dtype, is_string_dtype
7
8
 
8
9
  from valediction.data_types.data_type_helpers import infer_datetime_format
9
10
  from valediction.data_types.data_types import DataType
@@ -53,8 +54,8 @@ class ColumnState:
53
54
  return DataType.FLOAT, None
54
55
  if self.data_type == DataType.DATE:
55
56
  return DataType.DATE, None
56
- if self.data_type == DataType.DATETIME:
57
- return DataType.DATETIME, None
57
+ if self.data_type == DataType.TIMESTAMP:
58
+ return DataType.TIMESTAMP, None
58
59
 
59
60
  return DataType.TEXT, _len1()
60
61
 
@@ -123,7 +124,7 @@ class TypeInferer:
123
124
  _handling_function: callable = {
124
125
  DataType.TEXT: self._handle_state_text,
125
126
  DataType.DATE: self._handle_state_date,
126
- DataType.DATETIME: self._handle_state_datetime,
127
+ DataType.TIMESTAMP: self._handle_state_datetime,
127
128
  DataType.INTEGER: self._handle_state_integer,
128
129
  DataType.FLOAT: self._handle_state_float,
129
130
  }.get(state.data_type, self._handle_state_text)
@@ -141,20 +142,31 @@ class TypeInferer:
141
142
  self, s: pd.Series
142
143
  ) -> tuple[pd.Series, pd.Series, pd.Series, int | None]:
143
144
  self.__begin_step(step="Trimming whitespace")
144
- trimmed = s.str.strip()
145
+ is_text = is_string_dtype(s) or is_object_dtype(s)
146
+
147
+ if is_text:
148
+ trimmed = s.astype("string").str.strip()
149
+ else:
150
+ trimmed = s
145
151
  self.__complete_step()
146
152
 
147
153
  self.__begin_step(step="Checking nulls")
148
- nulls = trimmed.isna() | trimmed.str.lower().isin(self.null_tokens)
154
+ if is_text:
155
+ nulls = trimmed.isna() | trimmed.str.lower().isin(self.null_tokens)
156
+ else:
157
+ nulls = trimmed.isna()
149
158
  self.__complete_step()
150
159
 
151
160
  self.__begin_step(step="Checking max length")
152
- lengths = s.str.len()
153
- max_len = int(lengths.max(skipna=True)) if lengths.notna().any() else None
161
+ if is_text:
162
+ lengths = trimmed.str.len()
163
+ max_len = int(lengths.max(skipna=True)) if lengths.notna().any() else None
164
+ else:
165
+ max_len = None
154
166
  self.__complete_step()
155
167
 
156
168
  self.__begin_step(step="Setting non-null mask")
157
- nonnull_mask = (~nulls) & s.notna()
169
+ nonnull_mask = (~nulls) & trimmed.notna()
158
170
  self.__complete_step()
159
171
 
160
172
  return trimmed, nulls, nonnull_mask, max_len
@@ -193,7 +205,7 @@ class TypeInferer:
193
205
  if ok.all():
194
206
  self._transition(
195
207
  st,
196
- DataType.DATETIME if has_time.any() else DataType.DATE,
208
+ DataType.TIMESTAMP if has_time.any() else DataType.DATE,
197
209
  f"cached datetime format={st.cached_datetime_format!r}",
198
210
  )
199
211
  self.__complete_step()
@@ -210,7 +222,7 @@ class TypeInferer:
210
222
  st.cached_datetime_format = fmt
211
223
  self._transition(
212
224
  st,
213
- DataType.DATETIME if has_time.any() else DataType.DATE,
225
+ DataType.TIMESTAMP if has_time.any() else DataType.DATE,
214
226
  f"explicit datetime format={fmt!r}",
215
227
  )
216
228
  self.__complete_step()
@@ -276,7 +288,7 @@ class TypeInferer:
276
288
  st.lock_text_permanent = True
277
289
  self._transition(st, DataType.TEXT, "datetime parse failures")
278
290
  elif has_time.any():
279
- self._transition(st, DataType.DATETIME, "time component detected")
291
+ self._transition(st, DataType.TIMESTAMP, "time component detected")
280
292
 
281
293
  self.__complete_step()
282
294
 
@@ -334,7 +346,7 @@ class TypeInferer:
334
346
  if ok.all():
335
347
  self._transition(
336
348
  st,
337
- DataType.DATETIME if has_time.any() else DataType.DATE,
349
+ DataType.TIMESTAMP if has_time.any() else DataType.DATE,
338
350
  f"cached datetime format={st.cached_datetime_format!r}",
339
351
  )
340
352
  return True
@@ -377,7 +389,7 @@ class TypeInferer:
377
389
  if ok.all():
378
390
  self._transition(
379
391
  st,
380
- DataType.DATETIME if has_time.any() else DataType.DATE,
392
+ DataType.TIMESTAMP if has_time.any() else DataType.DATE,
381
393
  f"explicit datetime format={st.cached_datetime_format!r}",
382
394
  )
383
395
  return True
@@ -103,7 +103,7 @@ def demo_dictionary() -> Dictionary:
103
103
  foreign_key="DEMOGRAPHICS.PATIENT_HASH",
104
104
  ),
105
105
  Column(
106
- name="OBSERVATION_TIME", order=2, data_type="datetime", primary_key=2
106
+ name="OBSERVATION_TIME", order=2, data_type="timestamp", primary_key=2
107
107
  ),
108
108
  Column(
109
109
  name="OBSERVATION_TYPE",
@@ -257,7 +257,7 @@ class Generator:
257
257
  table.add_column(col)
258
258
 
259
259
  def _set_datetime_format(self, column_state: ColumnState, column: Column) -> None:
260
- if column.data_type in (DataType.DATE, DataType.DATETIME):
260
+ if column.data_type in (DataType.DATE, DataType.TIMESTAMP):
261
261
  datetime_format = getattr(column_state, "cached_datetime_format", None)
262
262
  if datetime_format and hasattr(column, "datetime_format"):
263
263
  column.datetime_format = datetime_format
@@ -106,7 +106,7 @@ def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str
106
106
  ):
107
107
  errors.append(
108
108
  f"invalid data type '{data_type.value}' for primary key column; "
109
- "primary keys must be Text, Integer, Date, or Datetime"
109
+ "primary keys must be Text, Integer, Date, or Timestamp"
110
110
  )
111
111
 
112
112
  return errors
@@ -364,7 +364,7 @@ class ExcelDataDictionary:
364
364
  enumeration_flag_col_header = header_map.get("enumerations")
365
365
  primary_key_col_header = header_map.get("primary_key")
366
366
  foreign_key_col_header = header_map.get("foreign_key_target")
367
- description_col_header = header_map.get("description")
367
+ description_col_header = header_map.get("column_description")
368
368
  return (
369
369
  table_col_header,
370
370
  column_col_header,
valediction/integrity.py CHANGED
@@ -77,20 +77,21 @@ class Config:
77
77
  "%d-%m-%Y": DataType.DATE,
78
78
  "%m/%d/%Y": DataType.DATE,
79
79
  "%m-%d-%Y": DataType.DATE,
80
- "%Y-%m-%d %H:%M:%S": DataType.DATETIME,
81
- "%Y-%m-%d %H:%M": DataType.DATETIME,
82
- "%d/%m/%Y %H:%M:%S": DataType.DATETIME,
83
- "%d/%m/%Y %H:%M": DataType.DATETIME,
84
- "%m/%d/%Y %H:%M:%S": DataType.DATETIME,
85
- "%Y-%m-%dT%H:%M:%S": DataType.DATETIME,
86
- "%Y-%m-%dT%H:%M:%S.%f": DataType.DATETIME,
87
- "%Y-%m-%dT%H:%M:%S%z": DataType.DATETIME,
88
- "%Y-%m-%dT%H:%M:%S.%f%z": DataType.DATETIME,
89
- "%Y-%m-%dT%H:%M:%SZ": DataType.DATETIME,
90
- "%Y-%m-%dT%H:%M:%S.%fZ": DataType.DATETIME,
80
+ "%Y-%m-%d %H:%M:%S": DataType.TIMESTAMP,
81
+ "%Y-%m-%d %H:%M": DataType.TIMESTAMP,
82
+ "%d/%m/%Y %H:%M:%S": DataType.TIMESTAMP,
83
+ "%d/%m/%Y %H:%M": DataType.TIMESTAMP,
84
+ "%m/%d/%Y %H:%M:%S": DataType.TIMESTAMP,
85
+ "%Y-%m-%dT%H:%M:%S": DataType.TIMESTAMP,
86
+ "%Y-%m-%dT%H:%M:%S.%f": DataType.TIMESTAMP,
87
+ "%Y-%m-%dT%H:%M:%S%z": DataType.TIMESTAMP,
88
+ "%Y-%m-%dT%H:%M:%S.%f%z": DataType.TIMESTAMP,
89
+ "%Y-%m-%dT%H:%M:%SZ": DataType.TIMESTAMP,
90
+ "%Y-%m-%dT%H:%M:%S.%fZ": DataType.TIMESTAMP,
91
91
  }
92
92
  self.enforce_no_null_columns: bool = True
93
93
  self.enforce_primary_keys: bool = True
94
+ self.allow_bigint: bool = True
94
95
  _apply_external_injections(self)
95
96
 
96
97
  def __repr__(self):
@@ -110,6 +111,7 @@ class Config:
110
111
  f" - default_null_values={self.null_values}\n"
111
112
  f" - forbidden_characters={self.forbidden_characters}\n"
112
113
  f" - date_formats=[{date_list}\n ]\n"
114
+ f" - allow_bigint={self.allow_bigint}\n"
113
115
  ")"
114
116
  )
115
117
 
@@ -241,7 +241,7 @@ def invalid_mask_datetime(column: Series, fmt: str | None) -> Series:
241
241
  ok = parsed.notna()
242
242
  return notnull & (~ok)
243
243
 
244
- allowed = _allowed_formats_for(DataType.DATETIME)
244
+ allowed = _allowed_formats_for(DataType.TIMESTAMP)
245
245
  ok_any = _parse_ok_any(column, allowed)
246
246
  return notnull & (~ok_any)
247
247
 
@@ -300,7 +300,7 @@ def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
300
300
  )
301
301
  df[col] = dtv.dt.normalize() # midnight
302
302
 
303
- elif data_type == DataType.DATETIME:
303
+ elif data_type == DataType.TIMESTAMP:
304
304
  df[col] = to_datetime(
305
305
  df[col], format=datetime_format, errors="raise", utc=False
306
306
  )
@@ -310,3 +310,62 @@ def apply_data_types(df: DataFrame, table_dictionary: Table) -> DataFrame:
310
310
  df[col] = df[col].astype("string")
311
311
 
312
312
  return df
313
+
314
+
315
+ # Bigint Checks
316
+ _PG_INT4_MIN_STR_ABS = "2147483648" # abs(-2147483648)
317
+ _PG_INT4_MAX_STR_ABS = "2147483647"
318
+ _PG_INT4_MIN_LEN = len(_PG_INT4_MIN_STR_ABS)
319
+ _PG_INT4_MAX_LEN = len(_PG_INT4_MAX_STR_ABS)
320
+
321
+
322
+ def invalid_mask_integer_out_of_range(
323
+ series: Series,
324
+ invalid_integer_mask: Series | None = None,
325
+ ) -> Series:
326
+ """
327
+ Returns a boolean mask for values that:
328
+ - are integer-like under Valediction's integer rules, AND
329
+ - fall outside PostgreSQL INTEGER (int4) range.
330
+ """
331
+
332
+ # Start with all-False mask
333
+ out = series.isna() & False
334
+
335
+ # Use caller-provided invalid mask to avoid recomputing if available
336
+ if invalid_integer_mask is None:
337
+ from valediction.validation.helpers import invalid_mask_integer # avoid cycles
338
+
339
+ invalid_integer_mask = invalid_mask_integer(series)
340
+
341
+ # We only check range for values that already pass integer validation
342
+ valid = (~invalid_integer_mask) & series.notna()
343
+ if not valid.any():
344
+ return out
345
+
346
+ # String-normalise for safe compare (works for object/int dtype)
347
+ s = series[valid].astype("string", copy=False).str.strip()
348
+
349
+ # Sign handling
350
+ neg = s.str.startswith("-")
351
+ abs_str = s.str.lstrip("+-")
352
+
353
+ # Lengths
354
+ abs_len = abs_str.str.len()
355
+
356
+ # Positive overflow:
357
+ # abs_len > 10 OR (abs_len == 10 AND abs_str > 2147483647)
358
+ pos = ~neg
359
+ pos_over = (abs_len > _PG_INT4_MAX_LEN) | (
360
+ (abs_len == _PG_INT4_MAX_LEN) & (abs_str > _PG_INT4_MAX_STR_ABS)
361
+ )
362
+
363
+ # Negative overflow (too small):
364
+ # abs_len > 10 OR (abs_len == 10 AND abs_str > 2147483648)
365
+ neg_over = (abs_len > _PG_INT4_MIN_LEN) | (
366
+ (abs_len == _PG_INT4_MIN_LEN) & (abs_str > _PG_INT4_MIN_STR_ABS)
367
+ )
368
+
369
+ # Combine back into the full index
370
+ out.loc[valid] = (pos & pos_over) | (neg & neg_over)
371
+ return out
@@ -26,6 +26,7 @@ class IssueType(Enum):
26
26
  TYPE_MISMATCH = "TypeMismatch"
27
27
  TEXT_TOO_LONG = "TextTooLong"
28
28
  FORBIDDEN_CHARACTER = "ForbiddenCharacter"
29
+ INTEGER_OUT_OF_RANGE = "IntegerOutOfRange"
29
30
 
30
31
 
31
32
  # Settings
@@ -14,6 +14,7 @@ from valediction.data_types.data_types import DataType
14
14
  from valediction.datasets.datasets_helpers import DataLike, DatasetItemLike
15
15
  from valediction.dictionary.model import Table
16
16
  from valediction.exceptions import DataDictionaryImportError, DataIntegrityError
17
+ from valediction.integrity import get_config
17
18
  from valediction.io.csv_readers import (
18
19
  CsvReadConfig,
19
20
  FrameChunk,
@@ -29,6 +30,7 @@ from valediction.validation.helpers import (
29
30
  invalid_mask_datetime,
30
31
  invalid_mask_float,
31
32
  invalid_mask_integer,
33
+ invalid_mask_integer_out_of_range,
32
34
  invalid_mask_text_forbidden_characters,
33
35
  invalid_mask_text_too_long,
34
36
  mask_to_ranges,
@@ -151,7 +153,7 @@ class Validator:
151
153
  datetime_format = column.datetime_format
152
154
  data_type = column.data_type
153
155
 
154
- if data_type in (DataType.DATE, DataType.DATETIME):
156
+ if data_type in (DataType.DATE, DataType.TIMESTAMP):
155
157
  self._dt_format_cache[name] = datetime_format
156
158
 
157
159
  if not datetime_format:
@@ -298,12 +300,7 @@ class Validator:
298
300
  if missing_keys:
299
301
  for name in dict_names:
300
302
  if _normalise(name) in missing_keys:
301
- self.issues.add(
302
- issue_type=IssueType.MISSING_COLUMN,
303
- table=self.table_name,
304
- column=name,
305
- parent=self.dataset_item,
306
- )
303
+ self._save_issues(IssueType.MISSING_COLUMN, name)
307
304
 
308
305
  self.__complete_step()
309
306
 
@@ -320,12 +317,7 @@ class Validator:
320
317
  if extra_keys:
321
318
  for col in df_cols:
322
319
  if _normalise(col) in extra_keys:
323
- self.issues.add(
324
- issue_type=IssueType.EXTRA_COLUMN,
325
- table=self.table_name,
326
- column=col, # report the actual df label
327
- parent=self.dataset_item,
328
- )
320
+ self._save_issues(IssueType.EXTRA_COLUMN, col)
329
321
 
330
322
  self.__complete_step()
331
323
 
@@ -363,13 +355,7 @@ class Validator:
363
355
  pk_cols_text_df = self._resolve_df_cols(df, pk_cols_text)
364
356
  space_mask = pk_contains_whitespace_mask(df[pk_cols_text_df])
365
357
  if space_mask.any():
366
- self.issues.add(
367
- issue_type=IssueType.PK_WHITESPACE,
368
- table=self.table_name,
369
- column=None,
370
- ranges=mask_to_ranges(space_mask, start_row),
371
- parent=self.dataset_item,
372
- )
358
+ self._save_issues(IssueType.PK_WHITESPACE, None, space_mask, start_row)
373
359
  self.__complete_step()
374
360
 
375
361
  def _check_primary_key_integrity(self, df, start_row: int) -> None:
@@ -391,13 +377,7 @@ class Validator:
391
377
  pk_hashes_non_null = pk_hashes[non_null]
392
378
 
393
379
  if null.any():
394
- self.issues.add(
395
- IssueType.PK_NULL,
396
- table=self.table_name,
397
- column=None,
398
- ranges=mask_to_ranges(null, start_row),
399
- parent=self.dataset_item,
400
- )
380
+ self._save_issues(IssueType.PK_NULL, None, null, start_row)
401
381
  self.__complete_step()
402
382
 
403
383
  # 2) In-chunk collisions
@@ -427,22 +407,14 @@ class Validator:
427
407
 
428
408
  # 7) Emit in-chunk collisions Issues
429
409
  if in_chunk_collision.any():
430
- self.issues.add(
431
- IssueType.PK_COLLISION,
432
- table=self.table_name,
433
- column=None,
434
- ranges=mask_to_ranges(in_chunk_collision, start_row),
435
- parent=self.dataset_item,
410
+ self._save_issues(
411
+ IssueType.PK_COLLISION, None, in_chunk_collision, start_row
436
412
  )
437
413
 
438
414
  # 7) Emit cross-chunk collisions Issues
439
415
  if cross_chunk_collision.any():
440
- self.issues.add(
441
- IssueType.PK_COLLISION,
442
- table=self.table_name,
443
- column=None,
444
- ranges=mask_to_ranges(cross_chunk_collision, start_row),
445
- parent=self.dataset_item,
416
+ self._save_issues(
417
+ IssueType.PK_COLLISION, None, cross_chunk_collision, start_row
446
418
  )
447
419
 
448
420
  # Add the original PK row as a collision
@@ -453,7 +425,7 @@ class Validator:
453
425
  IssueType.PK_COLLISION,
454
426
  table=self.table_name,
455
427
  column=None,
456
- ranges=[Range(first_row, first_row)],
428
+ ranges=[Range(first_row, first_row)], # add directly
457
429
  parent=self.dataset_item,
458
430
  )
459
431
  self.tracker_pk_reported_first.add(int(h))
@@ -515,11 +487,50 @@ class Validator:
515
487
 
516
488
  self.__complete_step()
517
489
 
490
+ def _save_issues(
491
+ self,
492
+ issue_type: IssueType,
493
+ column: str | None = None,
494
+ invalid: Series | None = None,
495
+ start_row: int | None = None,
496
+ ) -> None:
497
+ if invalid is not None and start_row is None:
498
+ raise ValueError(
499
+ "'start_row' must be provided when 'invalid' mask is provided"
500
+ )
501
+
502
+ ranges = None if invalid is None else mask_to_ranges(invalid, start_row)
503
+ self.issues.add(
504
+ issue_type=issue_type,
505
+ table=self.table_name,
506
+ column=column,
507
+ ranges=ranges,
508
+ parent=self.dataset_item,
509
+ )
510
+
511
+ def _check_column_types_integer(
512
+ self, col: str, series: Series, start_row: int, allow_bigint: bool
513
+ ) -> None:
514
+ invalid = invalid_mask_integer(series)
515
+ if invalid.any():
516
+ self._save_issues(IssueType.TYPE_MISMATCH, col, invalid, start_row)
517
+
518
+ # Check for out of range integers
519
+ if allow_bigint is False:
520
+ out_of_range = invalid_mask_integer_out_of_range(
521
+ series, invalid_integer_mask=invalid
522
+ )
523
+ if out_of_range.any():
524
+ self._save_issues(
525
+ IssueType.INTEGER_OUT_OF_RANGE, col, out_of_range, start_row
526
+ )
527
+
518
528
  def _check_column_types(self, df: DataFrame, start_row: int) -> None:
519
529
  self.__begin_step(step="Checking column types")
520
530
  present = [
521
531
  col for col in df.columns if _normalise(str(col)) in self._column_names
522
532
  ]
533
+ config = get_config()
523
534
  for col in present:
524
535
  dtype = self._find_data_type(col)
525
536
  if dtype == DataType.TEXT:
@@ -527,26 +538,24 @@ class Validator:
527
538
 
528
539
  series = df[col]
529
540
  if dtype == DataType.INTEGER:
530
- invalid = invalid_mask_integer(series)
531
- elif dtype == DataType.FLOAT:
541
+ self._check_column_types_integer(
542
+ col, series, start_row, config.allow_bigint
543
+ )
544
+ continue
545
+
546
+ if dtype == DataType.FLOAT:
532
547
  invalid = invalid_mask_float(series)
533
548
  elif dtype == DataType.DATE:
534
549
  fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
535
550
  invalid = invalid_mask_date(series, fmt)
536
- elif dtype == DataType.DATETIME:
551
+ elif dtype == DataType.TIMESTAMP:
537
552
  fmt = self._dt_format_cache.get(col) or self._find_datetime_format(col)
538
553
  invalid = invalid_mask_datetime(series, fmt)
539
554
  else:
540
555
  continue
541
556
 
542
557
  if invalid.any():
543
- self.issues.add(
544
- IssueType.TYPE_MISMATCH,
545
- table=self.table_name,
546
- column=col,
547
- ranges=mask_to_ranges(invalid, start_row),
548
- parent=self.dataset_item,
549
- )
558
+ self._save_issues(IssueType.TYPE_MISMATCH, col, invalid, start_row)
550
559
  self.__complete_step()
551
560
 
552
561
  def _check_text_lengths(self, df: DataFrame, start_row: int) -> None:
@@ -560,13 +569,7 @@ class Validator:
560
569
  max_len = self._find_max_length(col)
561
570
  invalid = invalid_mask_text_too_long(df[col], max_len)
562
571
  if invalid.any():
563
- self.issues.add(
564
- IssueType.TEXT_TOO_LONG,
565
- table=self.table_name,
566
- column=col,
567
- ranges=mask_to_ranges(invalid, start_row),
568
- parent=self.dataset_item,
569
- )
572
+ self._save_issues(IssueType.TEXT_TOO_LONG, col, invalid, start_row)
570
573
  self.__complete_step()
571
574
 
572
575
  def _check_text_forbidden_chars(self, df: DataFrame, start_row: int) -> None:
@@ -579,13 +582,7 @@ class Validator:
579
582
  continue
580
583
  mask = invalid_mask_text_forbidden_characters(df[col])
581
584
  if mask.any():
582
- self.issues.add(
583
- IssueType.FORBIDDEN_CHARACTER,
584
- table=self.table_name,
585
- column=col,
586
- ranges=mask_to_ranges(mask, start_row),
587
- parent=self.dataset_item,
588
- )
585
+ self._save_issues(IssueType.FORBIDDEN_CHARACTER, col, mask, start_row)
589
586
  self.__complete_step()
590
587
 
591
588
  # Validation: Final Helpers
@@ -593,12 +590,7 @@ class Validator:
593
590
  self.__begin_step(step="Checking for fully null columns")
594
591
  for column, seen in self.tracker_seen_non_nulls.items():
595
592
  if not seen:
596
- self.issues.add(
597
- issue_type=IssueType.FULLY_NULL_COLUMN,
598
- table=self.table_name,
599
- column=column,
600
- parent=self.dataset_item,
601
- )
593
+ self._save_issues(IssueType.FULLY_NULL_COLUMN, column)
602
594
  self.__complete_step()
603
595
 
604
596
  # Progress Helpers
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: valediction
3
- Version: 1.2.0
3
+ Version: 1.5.0
4
4
  Summary: Valediction is a convenience data validation package that allows generation, import, and constraint enforcement of user-defined data dictionaries against datasets.
5
5
  Author-email: Cai Davis <Cai.Davis@uhs.nhs.uk>
6
6
  Requires-Python: <4.0,>=3.11
@@ -1,38 +1,38 @@
1
1
  valediction/__init__.py,sha256=HJy57qHyaeENZ0xGf5-jkkal-P92n242UU6vIqtsnaw,511
2
2
  valediction/convenience.py,sha256=gDSNcI_T9VKO3Lk1Van4YQCt6hp_fqPyJnUJD8QNP_o,1438
3
3
  valediction/exceptions.py,sha256=OtAq_ShVCZeoNx0hWCzJVlVdl3Gm55l72IP1KrKUMR0,748
4
- valediction/integrity.py,sha256=2x1xpz1J3dmXCPRSGHPpnbLEvdlJDUuQ9B1y0baZ-mk,5151
4
+ valediction/integrity.py,sha256=P18v_5BaUNZTTDfp0OLB9N-eM0IoqzxSVjhmEhNOQKY,5254
5
5
  valediction/progress.py,sha256=fXld7BRhp8kk7xPCG50PbRPXvF8RV7Br2hENHuOUlbo,5974
6
6
  valediction/support.py,sha256=dhKwhtL6dgG709T6fkGaLZDvjYtnxIO9cMmgz477m-I,2207
7
7
  valediction/data_types/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- valediction/data_types/data_type_helpers.py,sha256=iqcpSPBoFZybkTMHBmxrlv56ZRg8PbqSLtVsuJXC2G0,2282
9
- valediction/data_types/data_types.py,sha256=MJv_io_hvbLo0G0N38kwj71goXlAo0isPFyS3TU05II,1605
10
- valediction/data_types/type_inference.py,sha256=11SGYgpvfAfwrDwyOewVIwvfA6pQtDo6i94_xfebYM8,19952
8
+ valediction/data_types/data_type_helpers.py,sha256=EyADhEHaLwKL7JaLsp2EgIsqo9O5r34WqiawWZKWVHI,2284
9
+ valediction/data_types/data_types.py,sha256=kluHvBdSwAjB5eBEM1xYH-SjiruhDaXiRMI2Bl0Lw7Y,1612
10
+ valediction/data_types/type_inference.py,sha256=a6er1jFZrhwAU0z2h1FsrmooPXUmwu6qkI2cHaciOb4,20315
11
11
  valediction/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
12
  valediction/datasets/datasets.py,sha256=uweSdVkk5-zDBzL8M-cHnC6LETZXnubakajFi0J6L_c,30089
13
13
  valediction/datasets/datasets_helpers.py,sha256=AdB3ws5MYFpiXTmHXmSsdm2wZVwDXkXDOtYLvSYhs4I,1159
14
- valediction/demo/DEMO - Data Dictionary.xlsx,sha256=wj1JG8dHgdALVwV0zSSYnyWMomMTzrHxGFRm491wM_A,45308
14
+ valediction/demo/DEMO - Data Dictionary.xlsx,sha256=upbwe14U5haXNarJndSW7ur6rd7RAssyEV2x2D7avao,45271
15
15
  valediction/demo/DEMOGRAPHICS.csv,sha256=ochj8tiHdRByvK2YbZTw5UR3UxbjYxA2xVbA1BfUlbU,3437
16
16
  valediction/demo/DIAGNOSES.csv,sha256=tJYtjeyLEbLvgulsbLA2MQg07AN_vGDSRGUn33xdiwM,19784
17
17
  valediction/demo/LAB_TESTS.csv,sha256=ii1tdQWmm2JiG6MdywB8in_R8vXBJzalD1TEvxG3abw,80401
18
18
  valediction/demo/VITALS.csv,sha256=npyaJAP5pLNndz3P3gQ4VbwI5_KwvCrXjSXjNLVJ1g0,46643
19
19
  valediction/demo/__init__.py,sha256=trPaz5YRP_HCUJCVZhHdU8uQaaNjAISpFWCRx89ewBg,200
20
- valediction/demo/demo_dictionary.py,sha256=OQcmKpKuRmLQuidYr2KIVF3_78crki5HU8E6RuFm01s,4410
20
+ valediction/demo/demo_dictionary.py,sha256=etTvXk3r9OkPOL9C8Ws9RGZ3DgPsGT-JWt4gRmDgLfg,4411
21
21
  valediction/dictionary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  valediction/dictionary/exporting.py,sha256=uRcnVzY5Uj0Yfju4y8sWsjTSP7ywATV9CQDYZMmnws0,18617
23
23
  valediction/dictionary/exporting_helpers.py,sha256=O2pOMAHsRIyhLzxrSFrHIHJEf0wEteJguSI2JK9Rxcw,12434
24
- valediction/dictionary/generation.py,sha256=CsmQcW6joFGQe-PkhzOXj0uv97hiq7rN3kZKliqox9A,12305
25
- valediction/dictionary/helpers.py,sha256=DtEbtn__CSs9LWi0oVZD3DtZonRjHeoQA9de-xQ1z3I,5288
26
- valediction/dictionary/importing.py,sha256=PbE5gLv-y29MAl540ClBCzoTGT8U4Ss0xbzG_GJOpzo,19557
24
+ valediction/dictionary/generation.py,sha256=XVGabsZgoFFC6nyr8vRrypdXnyfajAUTWWLs3miamm8,12306
25
+ valediction/dictionary/helpers.py,sha256=C8LM5FjQmWiOzLLDck08ihGbuVSuXQwCgkVWPKaOoOE,5289
26
+ valediction/dictionary/importing.py,sha256=e_jfVzW2tyfD1eR78_FL0MsFkA17RUDP6n9taX2Bxqk,19564
27
27
  valediction/dictionary/integrity.py,sha256=k0RLRyNs8dsHyOivl2WCS6jxlhPsW9wfXB48owyokfs,787
28
28
  valediction/dictionary/model.py,sha256=WtTGb5gZAtg7JiurvaWuD1K4DnNkygU-PoEVTZIgBCc,21617
29
- valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=ZsWmJsSBHvh3ADfrntmeVMWI9Vp_q7zqrTgp7rGd-AI,41721
29
+ valediction/dictionary/template/PROJECT - Data Dictionary.xltx,sha256=1boUdgNE1zYUu3pko9QREEi-hn29gp85PQqVwISyS2U,41691
30
30
  valediction/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  valediction/io/csv_readers.py,sha256=sKYP_xtOuxwm6ce2eDrphQ_wagxP0RYMXiMlEtkybBg,9812
32
32
  valediction/validation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
- valediction/validation/helpers.py,sha256=rJl_t0XCwt_X3LZAEz5pMihsAKoAIn-qkANNcm4lLf0,10195
34
- valediction/validation/issues.py,sha256=fBeGjbGGmwGg5XfENU4FtsYhvFfwdqAFSYGNF3UBEI8,9327
35
- valediction/validation/validation.py,sha256=XgYnRslQTCZCpAHi_AYUkZw4mXM7yjwBw6-iYJXhcao,22961
36
- valediction-1.2.0.dist-info/METADATA,sha256=vDocxg062EAic20YXBd3rQmfk95hkybDdnpLDQvV0W0,612
37
- valediction-1.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
- valediction-1.2.0.dist-info/RECORD,,
33
+ valediction/validation/helpers.py,sha256=RZsFeRGJqHJlxtSbXItV093-4SuXoI74X9539ZyqswY,12034
34
+ valediction/validation/issues.py,sha256=UO5bFxCOfT76rwi3SWi7wdn6Ux2wxxPC02RGvl0dL9Y,9374
35
+ valediction/validation/validation.py,sha256=ezWr4-RRq3w5P-IT3nC9g-KIz0FWXkFdNY-_ouOJxzo,22688
36
+ valediction-1.5.0.dist-info/METADATA,sha256=7OF5xMdNJnOVhEaz1ffd1lJqz61qJqje_T_r9PkKa7o,612
37
+ valediction-1.5.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
38
+ valediction-1.5.0.dist-info/RECORD,,