valediction 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. valediction/__init__.py +8 -0
  2. valediction/convenience.py +50 -0
  3. valediction/data_types/__init__.py +0 -0
  4. valediction/data_types/data_type_helpers.py +75 -0
  5. valediction/data_types/data_types.py +58 -0
  6. valediction/data_types/type_inference.py +541 -0
  7. valediction/datasets/__init__.py +0 -0
  8. valediction/datasets/datasets.py +870 -0
  9. valediction/datasets/datasets_helpers.py +46 -0
  10. valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  11. valediction/demo/DEMOGRAPHICS.csv +101 -0
  12. valediction/demo/DIAGNOSES.csv +650 -0
  13. valediction/demo/LAB_TESTS.csv +1001 -0
  14. valediction/demo/VITALS.csv +1001 -0
  15. valediction/demo/__init__.py +6 -0
  16. valediction/demo/demo_dictionary.py +129 -0
  17. valediction/dictionary/__init__.py +0 -0
  18. valediction/dictionary/exporting.py +501 -0
  19. valediction/dictionary/exporting_helpers.py +371 -0
  20. valediction/dictionary/generation.py +357 -0
  21. valediction/dictionary/helpers.py +174 -0
  22. valediction/dictionary/importing.py +494 -0
  23. valediction/dictionary/integrity.py +37 -0
  24. valediction/dictionary/model.py +582 -0
  25. valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  26. valediction/exceptions.py +22 -0
  27. valediction/integrity.py +97 -0
  28. valediction/io/__init__.py +0 -0
  29. valediction/io/csv_readers.py +307 -0
  30. valediction/progress.py +206 -0
  31. valediction/support.py +72 -0
  32. valediction/validation/__init__.py +0 -0
  33. valediction/validation/helpers.py +315 -0
  34. valediction/validation/issues.py +280 -0
  35. valediction/validation/validation.py +598 -0
  36. valediction-1.0.0.dist-info/METADATA +15 -0
  37. valediction-1.0.0.dist-info/RECORD +38 -0
  38. valediction-1.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,357 @@ valediction/dictionary/generation.py
+ # valediction/dictionary/generation.py
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from datetime import timedelta
+ from pathlib import Path
+ from typing import Iterable
+
+ import pandas as pd
+ from pandas import DataFrame
+
+ from valediction.data_types.data_types import DataType
+ from valediction.data_types.type_inference import (
+     COLUMN_STEPS,
+     ColumnState,
+     TypeInferer,
+ )
+ from valediction.datasets.datasets_helpers import DatasetItemLike
+ from valediction.dictionary.model import Column, Dictionary, Table
+ from valediction.io.csv_readers import (
+     CsvReadConfig,
+     iter_csv_chunks,
+     read_csv_headers,
+     read_csv_sample,
+ )
+ from valediction.progress import Progress
+ from valediction.support import _normalise_name, calculate_runtime
+
+ IMPORTING_DATA = "Importing data"
+ CHUNK_STEPS = 1
+ COLUMN_STEPS = COLUMN_STEPS  # re-exported so callers can import it from this module
+
+
+ @dataclass(slots=True)
+ class GeneratorConfig:
+     chunk_size: int = 10_000_000
+     sample_rows: int | None = None
+     dayfirst: bool = True
+     infer_types: bool = True
+     infer_max_length: bool = True
+
+     def set_variables(
+         self,
+         chunk_size: int | None = None,
+         sample_rows: int | None = None,
+     ) -> None:
+         # Apply user overrides; keep the dataclass defaults when a value
+         # is not supplied, rather than overwriting them with None
+         if chunk_size is not None:
+             self.chunk_size = int(chunk_size)
+         if sample_rows is not None:
+             self.sample_rows = int(sample_rows)
+
+
+ class Generator:
+     """
+     Summary:
+         Generator class for creating dictionaries from datasets.
+
+     Arguments:
+         feedback (bool): Provide user feedback on progress (default: True)
+         debug (bool): Enable debug mode, providing a full log of data type
+             inference and reasoning (default: False)
+         chunk_size (int | None): Size of chunks for reading data to optimise
+             RAM usage, if reading from CSV (default: 10_000_000)
+         sample_rows (int | None): Number of rows to sample for data type
+             inference. Note: this overrides `chunk_size` and reads a single
+             chunk (default: None)
+
+     Raises:
+         DataDictionaryError: If there is an issue with the data dictionary
+     """
+
+     def __init__(
+         self,
+         feedback: bool = True,
+         debug: bool = False,
+         chunk_size: int | None = 10_000_000,
+         sample_rows: int | None = None,
+     ) -> None:
+         # User Config (set_variables coerces and applies non-None overrides)
+         self.config = GeneratorConfig()
+         self.config.set_variables(sample_rows=sample_rows, chunk_size=chunk_size)
+         self.feedback: bool = feedback
+         self.debug: bool = debug
+         self.csv_cfg: CsvReadConfig = CsvReadConfig()
+
+         # Progress
+         self.progress: Progress | None = None
+
+     def __say(
+         self,
+         *values: object,
+         sep: str | None = " ",
+         end: str | None = "\n",
+     ) -> None:
+         if self.feedback:
+             print(*values, sep=sep, end=end)
+
+     def generate_dictionary(
+         self,
+         items: Iterable[DatasetItemLike],
+         dictionary_name: str | None = None,
+         primary_keys: dict[str, list[str]] | None = None,
+     ) -> Dictionary:
+         """
+         Summary:
+             Generate a dictionary from a Dataset.
+
+         Arguments:
+             items (Iterable[DatasetItemLike]): A sized collection (e.g. a list)
+                 of DatasetItems to generate the dictionary from.
+             dictionary_name (str | None): The name of the dictionary to generate.
+                 If None, will not be set.
+             primary_keys (dict[str, list[str]] | None): A dictionary of primary keys
+                 to set on the generated dictionary. If None, will not be set.
+
+         Returns:
+             Dictionary: The generated dictionary.
+         """
+         dictionary = Dictionary(name=dictionary_name, imported=True)
+
+         self.__say(f"Generating dictionary for {len(items)} tables")
+         for item in items:
+             self.__progress_init(item)
+             table = Table(name=_normalise_name(item.name))
+             dictionary.add_table(table)
+
+             if item.is_path:
+                 self._infer_from_csv_into_table(item, table)
+             else:
+                 self._infer_from_dataframe_into_table(item.data, table)
+
+             item._dictionary_runtimes = self.__finish_generation_for_table()
+
+         dictionary.set_primary_keys(primary_keys or {})
+         self.__say("\n", end="")
+         return dictionary
+
+     # Generation Helpers
+     def _infer_from_csv_into_table(self, item: DatasetItemLike, table: Table) -> None:
+         self.__begin_step(step=IMPORTING_DATA)
+         csv_path = item.data
+         inferer = TypeInferer(
+             debug=self.debug,
+             dayfirst=self.config.dayfirst,
+             progress=self.progress,
+         )
+
+         # Read a single sample
+         if self.config.sample_rows is not None:
+             self.__begin_step(step=IMPORTING_DATA)
+             df = read_csv_sample(
+                 csv_path,
+                 nrows=self.config.sample_rows,
+                 cfg=self.csv_cfg,
+             ).df
+             self.__complete_step()
+
+             inferer.update_with_chunk(df)
+             self._create_or_update_columns(table, inferer)
+             return
+
+         # Read in chunks
+         first_chunk = True
+         columns_by_name: dict[str, Column] = {}
+         column_count = item.column_count
+         iterator = iter_csv_chunks(
+             path=Path(csv_path), chunk_size=self.config.chunk_size, cfg=self.csv_cfg
+         )
+
+         for chunk in iterator:
+             # Import chunk
+             est_chunk_count = chunk.estimate_chunk_count()
+             self.__progress_retarget_total(
+                 est_chunk_count=est_chunk_count, column_count=column_count
+             )
+             self.__complete_step()
+
+             inferer.update_with_chunk(chunk.df)
+
+             self.__begin_step(step="Saving chunk data types")
+             if first_chunk:
+                 ordered = list(inferer.states.keys())
+                 for idx, col_name in enumerate(ordered, start=1):
+                     col_state = inferer.states[col_name]
+                     data_type, length = col_state.final_data_type_and_length()
+                     col = Column(
+                         name=_normalise_name(col_name),
+                         order=idx,
+                         data_type=data_type,
+                         length=length if data_type == DataType.TEXT else None,
+                         vocabulary=None,
+                         primary_key=None,
+                         foreign_key=None,
+                         description=None,
+                         enumerations=None,
+                     )
+
+                     self._set_datetime_format(column_state=col_state, column=col)
+                     table.add_column(col)
+                     columns_by_name[col_name] = col
+                 first_chunk = False
+
+             else:
+                 self._apply_state_to_existing_columns(table, inferer, columns_by_name)
+
+         # Empty file: no chunks were produced, so infer from the headers alone
+         if first_chunk:
+             empty = read_csv_headers(
+                 csv_path,
+                 cfg=self.csv_cfg,
+             )
+             inferer.update_with_chunk(empty)
+             self._create_or_update_columns(table, inferer)
+
+     def _infer_from_dataframe_into_table(self, df: pd.DataFrame, table: Table) -> None:
+         self.__begin_step(step=IMPORTING_DATA)
+         inferer = TypeInferer(
+             debug=self.debug,
+             dayfirst=self.config.dayfirst,
+             progress=self.progress,
+         )
+         self.__complete_step()
+
+         inferer.update_with_chunk(df)
+         self._create_or_update_columns(table, inferer)
+
+     # Emit/Update Helpers
+     def _create_or_update_columns(self, table: Table, inferer: TypeInferer) -> None:
+         if len(table):
+             # Snapshot the columns first: removing entries while iterating
+             # over the live table would skip columns
+             for existing in list(table):
+                 table.remove_column(existing.name)
+
+         ordered = list(inferer.states.keys())
+         for idx, col_name in enumerate(ordered, start=1):
+             col_state = inferer.states[col_name]
+             data_type, length = col_state.final_data_type_and_length()
+             col = Column(
+                 name=_normalise_name(col_name),
+                 order=idx,
+                 data_type=data_type,
+                 length=length if data_type == DataType.TEXT else None,
+                 vocabulary=None,
+                 primary_key=None,
+                 foreign_key=None,
+                 description=None,
+                 enumerations=None,
+             )
+             self._set_datetime_format(column_state=col_state, column=col)
+
+             table.add_column(col)
+
+     def _set_datetime_format(self, column_state: ColumnState, column: Column) -> None:
+         if column.data_type in (DataType.DATE, DataType.DATETIME):
+             datetime_format = getattr(column_state, "cached_datetime_format", None)
+             if datetime_format and hasattr(column, "datetime_format"):
+                 column.datetime_format = datetime_format
+
+         elif hasattr(column, "datetime_format"):
+             column.datetime_format = None
+
+     def _apply_state_to_existing_columns(
+         self,
+         table: Table,
+         inferer: TypeInferer,
+         columns_by_name: dict[str, Column],
+     ) -> None:
+         for col_name, col_state in inferer.states.items():
+             if col_name not in columns_by_name:
+                 next_order = max((c.order or 0 for c in table), default=0) + 1
+                 data_type, length = col_state.final_data_type_and_length()
+                 new_col = Column(
+                     name=_normalise_name(col_name),
+                     order=next_order,
+                     data_type=data_type,
+                     length=length if data_type == DataType.TEXT else None,
+                     vocabulary=None,
+                     primary_key=None,
+                     foreign_key=None,
+                     description=None,
+                     enumerations=None,
+                 )
+                 self._set_datetime_format(column_state=col_state, column=new_col)
+                 table.add_column(new_col)
+                 columns_by_name[col_name] = new_col
+                 continue
+
+             col = columns_by_name[col_name]
+             data_type, length = col_state.final_data_type_and_length()
+
+             if col.data_type != data_type:
+                 col.data_type = data_type
+
+             if data_type == DataType.TEXT:
+                 if length is not None and (col.length or 0) < length:
+                     col.length = int(length)
+             else:
+                 col.length = None
+
+             self._set_datetime_format(column_state=col_state, column=col)
+
+     # Progress
+     def __progress_init(self, item: DatasetItemLike) -> None:
+         # Progress bars are disabled in debug mode (the inference log is
+         # printed instead) and in silent mode
+         if self.debug or not self.feedback:
+             self.progress = Progress(enabled=False)
+             return
+
+         # Progress bars on
+         total_steps = (
+             (CHUNK_STEPS + (COLUMN_STEPS * item.column_count))
+             if (isinstance(item.data, DataFrame) or self.config.sample_rows)
+             else None
+         )
+         pad = " " * item._padding if item._padding else ""
+
+         self.progress = Progress(
+             desc=f"Generating {item.name}: {pad}",
+             starting_step=IMPORTING_DATA,
+             est_total=total_steps,
+             smoothing_steps=(COLUMN_STEPS * item.column_count),
+         )
+
+     def __progress_retarget_total(
+         self, est_chunk_count: int, column_count: int
+     ) -> None:
+         new_total = (CHUNK_STEPS * est_chunk_count) + (
+             COLUMN_STEPS * est_chunk_count * column_count
+         )
+         self.progress.retarget_total(new_total=new_total)
+
+     def __begin_step(self, step: str | None = None) -> None:
+         self.progress.begin_step(step=step)
+
+     def __complete_step(self) -> None:
+         self.progress.complete_step()
+
+     def __finish_generation_for_table(self) -> dict[str, timedelta]:
+         step = (
+             f"Completed ({calculate_runtime(start=self.progress.full_start).message})"
+         )
+         save_as = "Total"
+         self.progress.finish(postfix=step, save_as=save_as, good=True)
+         self.progress.close()
+         return self.progress.runtimes
@@ -0,0 +1,174 @@ valediction/dictionary/helpers.py
+ import re
+ from typing import Any, Literal
+
+ from pandas import Series
+ from pandas import isna as _pd_isna
+
+ from valediction.data_types.data_types import DataType
+ from valediction.exceptions import DataDictionaryImportError
+ from valediction.integrity import get_config
+
+
+ def _check_name(name: str, entity: Literal["table", "column"]) -> list[str]:
+     if entity not in ["table", "column"]:
+         raise ValueError("entity must be either 'table' or 'column'")
+
+     errors: list = []
+     config = get_config()
+     invalid_chars = (
+         config.invalid_name_pattern
+         if isinstance(config.invalid_name_pattern, re.Pattern)
+         else re.compile(config.invalid_name_pattern)
+     )
+     max_name_length = (
+         config.max_table_name_length
+         if entity == "table"
+         else config.max_column_name_length
+     )
+
+     if name != name.upper():  # name must be uppercase
+         errors.append("must be uppercase")
+
+     if invalid_chars.search(name):  # check invalid characters
+         bad = set(invalid_chars.findall(name))
+         errors.append(
+             f"invalid characters: '{''.join(sorted(bad))}'; "
+             "only A-Z, 0-9, and underscores are allowed with no whitespace"
+         )
+
+     if len(name) > max_name_length:  # configured maximum length
+         errors.append(f"exceeds max length of {max_name_length}")
+
+     if not name or not name[0].isalpha():  # must start with a letter
+         errors.append("must start with a letter")
+
+     if name.endswith("_"):  # cannot end with an underscore
+         errors.append("cannot end with '_'")
+
+     if "__" in name:  # cannot contain double underscores
+         errors.append("cannot contain double underscores '__'")
+
+     return errors
+
+
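+ # Illustrative outcomes (not part of the released file), derived from the
+ # rules above:
+ #
+ #     _check_name("PATIENT_ID", "column")  # -> []
+ #     _check_name("patient id", "column")  # -> ["must be uppercase",
+ #                                          #     "invalid characters: ...", ...]
+ #     _check_name("ROW_", "column")        # -> ["cannot end with '_'"]
+
+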
+ def _check_order(order: int | None) -> list[str]:
+     errors: list = []
+     if order is None:  # presence
+         errors.append("order is required and must be an integer ≥ 1")
+         return errors
+
+     if not isinstance(order, int):  # must be an integer
+         errors.append("order must be an integer ≥ 1")
+         return errors
+
+     if order < 1:  # must be ≥ 1
+         errors.append("order must be ≥ 1")
+
+     return errors
+
+
+ def _check_data_type(data_type: DataType, length: int | None) -> list[str]:
+     errors: list = []
+     if not isinstance(data_type, DataType):  # ensure it is a DataType
+         errors.append("data type is invalid; must be a DataType object")
+
+     if length is not None:  # length rules
+         if not isinstance(length, int):
+             errors.append("length must be a positive integer if provided")
+         elif length <= 0:  # must be positive
+             errors.append("length must be a positive integer if provided")
+
+     if data_type == DataType.TEXT:  # required for DataType.TEXT
+         if length is None:
+             errors.append("length is required for TEXT columns")
+     elif length is not None:  # length not applicable
+         errors.append(f"length is not applicable to {data_type.value} columns")
+
+     return errors
+
+
+ def _check_primary_key(primary_key: int | None, data_type: DataType) -> list[str]:
+     errors: list = []
+     if primary_key is None:
+         return errors
+
+     max_primary_keys = get_config().max_primary_keys
+     if (
+         not isinstance(primary_key, int)
+         or primary_key < 1
+         or primary_key > max_primary_keys
+     ):
+         errors.append(
+             f"primary key order must be an integer between 1 and "
+             f"{max_primary_keys} if provided"
+         )
+
+     if (
+         hasattr(data_type, "valid_for_primary_key")
+         and not data_type.valid_for_primary_key()
+     ):
+         errors.append(
+             f"invalid data type '{data_type.value}' for primary key column; "
+             "primary keys must be Text, Integer, Date, or Datetime"
+         )
+
+     return errors
+
+
+ def _normalise_name(name: str) -> str:
+     return name.upper().strip()
+
+
+ def _norm_header_map(columns: list) -> dict:
+     mapping: dict = {}
+     for c in columns:
+         k = str(c).strip().lower().replace(" ", "_").replace("-", "_")
+         if k in mapping:  # collision
+             raise DataDictionaryImportError(
+                 f"Ambiguous headers after normalisation: "
+                 f"{mapping[k]!r} and {c!r} both map to {k!r}"
+             )
+         mapping[k] = c
+     return mapping
+
+
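+ # Illustrative mapping (not part of the released file): headers are stripped,
+ # lowered, and spaces/hyphens become underscores, while the values keep the
+ # original header text:
+ #
+ #     _norm_header_map(["Data Type", " Field-Name "])
+ #     # -> {"data_type": "Data Type", "field_name": " Field-Name "}
+
+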
+ def _get_required_header(header_map: dict[str, str], key: str) -> str:
+     if key not in header_map:
+         raise DataDictionaryImportError(
+             f"Required Data Dictionary column '{key}' not found. "
+             f"Available: {list(header_map.keys())}"
+         )
+     return header_map[key]
+
+
+ def _is_missing(val: Any) -> bool:
+     return _pd_isna(val) or (isinstance(val, str) and val.strip() == "")
+
+
+ def _parse_truthy(val: Any) -> bool:
+     if isinstance(val, str):
+         return val.strip().lower() in {"y", "yes", "true", "1"}
+     if isinstance(val, (int, float)):
+         try:
+             return int(val) == 1
+         except Exception:
+             return False
+     return False
+
+
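+ # Illustrative behaviour (not part of the released file):
+ #
+ #     _parse_truthy("Yes")  # -> True
+ #     _parse_truthy(1.0)    # -> True
+ #     _parse_truthy("N")    # -> False
+ #     _parse_truthy(None)   # -> False (not a str, int, or float)
+
+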
+ def _row_is_blank(row: Series, keys: tuple[str, str]) -> bool:
+     a, b = keys
+     return _is_missing(row[a]) and _is_missing(row[b])
+
+
+ def _parse_int(
+     value: Any, label: str, row_ctx: str, *, required: bool = True
+ ) -> int | None:
+     if _is_missing(value):
+         if required:
+             raise DataDictionaryImportError(f"{row_ctx}: {label} is required.")
+         return None
+     try:
+         return int(value)
+     except Exception as e:
+         raise DataDictionaryImportError(
+             f"{row_ctx}: {label} must be an integer (got {value!r})."
+         ) from e
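+
+
+ # Illustrative behaviour (not part of the released file); "Order" and "Row 3"
+ # are hypothetical label/context strings:
+ #
+ #     _parse_int("12", "Order", "Row 3")                # -> 12
+ #     _parse_int("", "Order", "Row 3", required=False)  # -> None
+ #     _parse_int("x", "Order", "Row 3")                 # raises
+ #     # DataDictionaryImportError("Row 3: Order must be an integer (got 'x').")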