valediction 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. valediction/__init__.py +8 -0
  2. valediction/convenience.py +50 -0
  3. valediction/data_types/__init__.py +0 -0
  4. valediction/data_types/data_type_helpers.py +75 -0
  5. valediction/data_types/data_types.py +58 -0
  6. valediction/data_types/type_inference.py +541 -0
  7. valediction/datasets/__init__.py +0 -0
  8. valediction/datasets/datasets.py +870 -0
  9. valediction/datasets/datasets_helpers.py +46 -0
  10. valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  11. valediction/demo/DEMOGRAPHICS.csv +101 -0
  12. valediction/demo/DIAGNOSES.csv +650 -0
  13. valediction/demo/LAB_TESTS.csv +1001 -0
  14. valediction/demo/VITALS.csv +1001 -0
  15. valediction/demo/__init__.py +6 -0
  16. valediction/demo/demo_dictionary.py +129 -0
  17. valediction/dictionary/__init__.py +0 -0
  18. valediction/dictionary/exporting.py +501 -0
  19. valediction/dictionary/exporting_helpers.py +371 -0
  20. valediction/dictionary/generation.py +357 -0
  21. valediction/dictionary/helpers.py +174 -0
  22. valediction/dictionary/importing.py +494 -0
  23. valediction/dictionary/integrity.py +37 -0
  24. valediction/dictionary/model.py +582 -0
  25. valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  26. valediction/exceptions.py +22 -0
  27. valediction/integrity.py +97 -0
  28. valediction/io/__init__.py +0 -0
  29. valediction/io/csv_readers.py +307 -0
  30. valediction/progress.py +206 -0
  31. valediction/support.py +72 -0
  32. valediction/validation/__init__.py +0 -0
  33. valediction/validation/helpers.py +315 -0
  34. valediction/validation/issues.py +280 -0
  35. valediction/validation/validation.py +598 -0
  36. valediction-1.0.0.dist-info/METADATA +15 -0
  37. valediction-1.0.0.dist-info/RECORD +38 -0
  38. valediction-1.0.0.dist-info/WHEEL +4 -0
valediction/io/csv_readers.py ADDED
@@ -0,0 +1,307 @@
+ # valediction/io/csv_readers.py
+ from __future__ import annotations
+
+ import os
+ from dataclasses import dataclass, replace
+ from math import ceil
+ from pathlib import Path
+ from typing import Iterator, NamedTuple
+
+ import pandas as pd
+ from pandas import DataFrame
+ from pandas.errors import ParserError
+
+ from valediction.support import _normalise_name
+
+
+ class FrameChunk(NamedTuple):
+     """A chunk of rows + I/O metadata.
+
+     - start/end are 0-based inclusive row numbers in the full dataset.
+     - file_pos/total_size/bytes_read are None when not reading from disk.
+     """
+
+     df: DataFrame
+     start: int  # 0-based, inclusive
+     end: int  # 0-based, inclusive
+     total_size: int | None  # bytes of the whole file
+     file_pos: int | None  # f.tell() after producing this chunk
+     bytes_read: int | None  # bytes consumed to produce this chunk
+     chunk_index: int | None  # 1-based index of this chunk
+
+     # Cumulative Totals
+     total_bytes_read: int | None
+     total_chunks_seen: int | None
+
+     def estimate_chunk_count(self) -> int:
+         # Buffers (accounting for CSV tails/byte-count inaccuracy)
+         EPS_ABS = 4096  # Fixed
+         EPS_REL = 0.05  # 5% tail buffer
+
+         bytes_seen = int(self.total_bytes_read)
+         chunks_seen = max(1, int(self.total_chunks_seen))
+         average = max(1.0, bytes_seen / float(chunks_seen))
+
+         remaining = max(0, int(self.total_size) - bytes_seen)
+
+         # Treat a small remaining tail as already complete
+         tail_thresh = max(EPS_ABS, int(EPS_REL * average))
+         if remaining <= tail_thresh:
+             remaining = 0
+
+         return chunks_seen + (0 if remaining == 0 else int(ceil(remaining / average)))
+
+     def update_df(self, df: DataFrame) -> FrameChunk:
+         return self._replace(df=df)
+
+
+ @dataclass(slots=True)
+ class CsvReadConfig:
+     """Canonical CSV reading defaults for the overall project.
+
+     Notes:
+     - dtype="string" always reads columns as string, permitting downstream inference/validation.
+     - keep_default_na=False and na_values=[] prevent pandas from coercing tokens like "NA".
+     - We normalise headers and strip string values post-read (vectorised).
+     """
+
+     dtype: str = "string"
+     keep_default_na: bool = False
+     na_values: list[str] | None = None
+     encoding: str = "utf-8"
+     normalise_headers: bool = True
+     strip_values: bool = True
+     usecols: list[str] | None = None
+
+     def __post_init__(self) -> None:
+         if self.na_values is None:
+             self.na_values = []
+
+
+ def _kwargs(cfg: CsvReadConfig | None = None) -> dict:
+     cfg = cfg or CsvReadConfig()
+     return dict(
+         dtype=cfg.dtype,
+         keep_default_na=cfg.keep_default_na,
+         na_values=cfg.na_values,
+         encoding=cfg.encoding,
+         usecols=cfg.usecols,
+     )
+
+
+ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
+     """Apply header normalisation and vectorised value stripping after reading."""
+     cfg = cfg or CsvReadConfig()
+     if cfg.normalise_headers:
+         df = df.rename(columns={c: _normalise_name(c) for c in df.columns})
+     if cfg.strip_values:
+         str_cols = df.select_dtypes(include=["string"]).columns
+         if len(str_cols) > 0:
+             df[str_cols] = df[str_cols].apply(lambda s: s.str.strip())
+     return df
+
+
+ def read_csv_headers(path: str | Path, cfg: CsvReadConfig | None = None) -> DataFrame:
+     """Read headers only (nrows=0) with canonical settings; returns a DataFrame."""
+     # Copy rather than mutate the caller's config when disabling value stripping
+     cfg = replace(cfg, strip_values=False) if cfg else CsvReadConfig(strip_values=False)
+
+     try:
+         header = pd.read_csv(path, nrows=0, **_kwargs(cfg))
+         return _post_read_processing(header, cfg)
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading header from '{path}': {e}"
+         ) from e
+
+
+ def read_csv_all(path: str | Path, cfg: CsvReadConfig | None = None) -> FrameChunk:
+     """Read the entire CSV with canonical settings; returns a single FrameChunk."""
+     cfg = cfg or CsvReadConfig()
+     try:
+         file_size = os.path.getsize(path)
+
+         with open(path, "rb") as file:
+             start_pos = file.tell()
+             df = pd.read_csv(file, **_kwargs(cfg))
+             end_pos = file.tell()
+         df = _post_read_processing(df, cfg)
+         n = len(df)
+
+         return FrameChunk(
+             df=df,
+             start=0,
+             end=n - 1,
+             total_size=file_size,
+             file_pos=end_pos,
+             bytes_read=end_pos - start_pos,
+             chunk_index=1,
+             total_bytes_read=file_size,
+             total_chunks_seen=1,
+         )
+     except ParserError as e:
+         raise ParserError(f"Malformed CSV while reading '{path}': {e}") from e
+
+
+ def read_csv_sample(
+     path: str | Path, nrows: int, cfg: CsvReadConfig | None = None
+ ) -> FrameChunk:
+     """Read the first `nrows` with canonical settings; returns a FrameChunk with I/O
+     metadata."""
+     cfg = cfg or CsvReadConfig()
+
+     try:
+         file_size = os.path.getsize(path)
+         with open(path, "rb") as file:
+             start_pos = file.tell()
+             df = pd.read_csv(file, nrows=nrows, **_kwargs(cfg))
+             end_pos = file.tell()
+
+         df = _post_read_processing(df, cfg)
+         n = len(df)
+
+         bytes_read = (end_pos - start_pos) if end_pos > 0 else None
+         file_pos = end_pos if end_pos > 0 else None
+
+         return FrameChunk(
+             df=df,
+             start=0,
+             end=n - 1,
+             total_size=file_size,
+             file_pos=file_pos,
+             bytes_read=bytes_read,
+             chunk_index=1,
+             total_bytes_read=bytes_read or 0,
+             total_chunks_seen=1,
+         )
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading sample from '{path}': {e}"
+         ) from e
+
+
+ def iter_csv_chunks(
+     path: str | Path, chunk_size: int | None, cfg: CsvReadConfig | None = None
+ ) -> Iterator[FrameChunk]:
+     """Yield FrameChunks with canonical settings.
+
+     Behaviour:
+     - If chunk_size is None or <= 0: yields a single chunk for the entire file.
+     - Else: yields multiple chunks, each with populated bytes/position metadata.
+     """
+     cfg = cfg or CsvReadConfig()
+     try:
+         file_size = os.path.getsize(path)
+
+         # No chunking: one full-file chunk with metadata
+         if not chunk_size or (isinstance(chunk_size, int) and chunk_size <= 0):
+             with open(path, "rb") as file:
+                 start_pos = file.tell()
+                 df = pd.read_csv(file, **_kwargs(cfg))
+                 end_pos = file.tell()
+             df = _post_read_processing(df, cfg)
+             n = len(df)
+             if n == 0:
+                 return
+             yield FrameChunk(
+                 df=df,
+                 start=0,
+                 end=n - 1,
+                 total_size=file_size,
+                 file_pos=end_pos,
+                 bytes_read=end_pos - start_pos,
+                 chunk_index=1,
+                 total_bytes_read=end_pos - start_pos,
+                 total_chunks_seen=1,
+             )
+             return
+
+         # Chunking: stream with bytes/pos metadata
+         with open(path, "rb") as file:
+             reader = pd.read_csv(file, chunksize=chunk_size, **_kwargs(cfg))
+             prev_pos = file.tell()
+             offset = 0
+             idx = 0
+             cumulative_bytes = 0
+             for raw in reader:
+                 idx += 1
+                 curr_pos = file.tell()
+                 bytes_read = max(0, curr_pos - prev_pos)
+                 prev_pos = curr_pos
+                 cumulative_bytes += bytes_read
+
+                 df = _post_read_processing(raw, cfg)
+                 n = len(df)
+                 if n == 0:
+                     continue
+
+                 start = offset
+                 end = offset + n - 1
+                 offset += n
+
+                 yield FrameChunk(
+                     df=df,
+                     start=start,
+                     end=end,
+                     total_size=file_size,
+                     file_pos=curr_pos,
+                     bytes_read=bytes_read,
+                     chunk_index=idx,
+                     total_bytes_read=cumulative_bytes,
+                     total_chunks_seen=idx,
+                 )
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading chunks from '{path}': {e}"
+         ) from e
+
+
+ # Reading specific ranges
+ def _intersect_local_spans(
+     ranges: list[tuple[int, int]],
+     chunk_start: int,
+     chunk_end: int,
+ ) -> list[tuple[int, int]]:
+     out: list[tuple[int, int]] = []
+     for r_start, r_end in ranges:
+         lo = max(r_start, chunk_start)
+         hi = min(r_end, chunk_end)
+         if hi >= lo:
+             out.append((lo - chunk_start, hi - chunk_start))
+     return out
+
+
+ def read_csv_ranges(
+     path: str | Path,
+     ranges: list[tuple[int, int]],
+     cfg: CsvReadConfig | None = None,
+     chunk_size: int | None = 1_000_000,
+ ) -> pd.DataFrame:
+     """Read only the rows covered by `ranges` (global 0-based inclusive pairs).
+
+     Respects CsvReadConfig (including usecols for column pruning). Returns a
+     concatenated DataFrame (may be empty).
+     """
+     if not ranges:
+         # Honour columns if specified
+         cols = cfg.usecols if (cfg and cfg.usecols) else None
+         return pd.DataFrame(columns=cols) if cols else pd.DataFrame()
+
+     parts: list[pd.DataFrame] = []
+     for chunk in iter_csv_chunks(path, chunk_size=chunk_size, cfg=cfg):
+         local_spans = _intersect_local_spans(ranges, chunk.start, chunk.end)
+         if not local_spans:
+             continue
+
+         for lo, hi in local_spans:
+             part = chunk.df.iloc[lo : hi + 1]
+             parts.append(part)
+
+     if not parts:
+         cols = cfg.usecols if (cfg and cfg.usecols) else None
+         return pd.DataFrame(columns=cols) if cols else pd.DataFrame()
+
+     return pd.concat(parts, axis=0, ignore_index=False)
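
A minimal usage sketch for this module (not part of the package; the file name, chunk size, and row ranges below are hypothetical):

from valediction.io.csv_readers import CsvReadConfig, iter_csv_chunks, read_csv_ranges

cfg = CsvReadConfig()  # canonical defaults: string dtype, no NA coercion

# Stream a hypothetical demo.csv in 50,000-row chunks, using each FrameChunk's
# byte metadata to estimate the total number of chunks as we go
for chunk in iter_csv_chunks("demo.csv", chunk_size=50_000, cfg=cfg):
    print(f"rows {chunk.start}-{chunk.end}; ~{chunk.estimate_chunk_count()} chunks expected")

# Re-read only rows 0-9 and 100-199 (global, 0-based, inclusive)
subset = read_csv_ranges("demo.csv", ranges=[(0, 9), (100, 199)], cfg=cfg)
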
valediction/progress.py ADDED
@@ -0,0 +1,206 @@
+ # valediction/progress.py
+ from __future__ import annotations
+
+ from datetime import datetime, timedelta
+
+ from tqdm import tqdm
+
+ from valediction.support import BOLD_GREEN, BOLD_RED, RESET, calculate_runtime
+
+ FORMAT_KNOWN_TOTAL = (
+     "{desc} {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
+     "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+ )
+
+ FORMAT_UNKNOWN_TOTAL = (
+     "{desc} {percentage:3.0f}%|{bar}| ?/? [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+ )
+
+
+ class Progress:
+     def __init__(
+         self,
+         desc: str = "",
+         est_total: int | None = 1,
+         smoothing_steps: int = 0,
+         unit: str = "step",
+         starting_step: str | None = None,
+         enabled: bool = True,
+     ) -> None:
+         """Progress bar (tqdm) with manual control.
+
+         Args:
+             desc (str): label shown to the left of the bar. Defaults to "".
+             est_total (int, optional): initial total number of steps (can grow/shrink
+                 later). Defaults to 1.
+             smoothing_steps (int, optional): window length of previous steps used to
+                 approximate the ETA. Use 0 for a global average. Defaults to 0.
+             unit (str, optional): display unit. Defaults to "step".
+             starting_step (str, optional): initial step and starting postfix,
+                 e.g. "Importing Data". Defaults to None.
+             enabled (bool, optional): allows switching the bar off, avoiding
+                 duplication of upstream checks. Defaults to True.
+         """
+         self.enabled: bool = enabled
+         self.desc: str = desc
+         self.est_total: int | None = est_total
+         self.smoothing_steps: int = max(0, int(smoothing_steps or 0))
+         self.unit: str = unit
+         self.postfix: str = ""
+
+         # Bar
+         self.bar: tqdm | None = None
+         self.total_steps: int | None = self.est_total
+         self.completed_steps: int = 0
+
+         # Runtimes
+         self.full_start: datetime | None = None
+         self.step_start: datetime | None = None
+         self.current_step = starting_step or ""
+         self.runtimes: dict[str, timedelta] = {}
+
+         self.__init_progress_bar()
+
+     # Context
+     def __enter__(self) -> Progress:
+         return self
+
+     def __exit__(self, exc_type, exc, tb) -> None:
+         self.close()
+
+     # Initialisation
+     def __init_progress_bar(self) -> None:
+         now = datetime.now()
+         self.full_start = now
+         self.step_start = now
+
+         if not self.enabled:
+             return
+
+         smoothing = (
+             0.0 if self.smoothing_steps == 0 else 2.0 / (self.smoothing_steps + 1)
+         )
+
+         self.bar = tqdm(
+             total=self.total_steps,
+             unit=self.unit,
+             desc=self.desc,
+             smoothing=smoothing,
+         )
+         self.__set_bar_format()
+         if self.current_step:
+             self.bar.set_postfix_str(self.current_step)
+
+     def __set_bar_format(self) -> None:
+         if self.est_total:
+             self.bar.bar_format = FORMAT_KNOWN_TOTAL
+         else:
+             self.bar.bar_format = FORMAT_UNKNOWN_TOTAL
+
+     # Management
+     def retarget_total(self, new_total: int) -> None:
+         if not self.enabled:
+             return
+
+         new_total = max(1, int(new_total))
+         self.total_steps = new_total
+         self.est_total = new_total
+         self.__set_bar_format()
+
+         if self.bar is None:
+             return
+
+         if int(self.bar.total or 0) == new_total:
+             return
+
+         self.bar.total = new_total
+         self._refresh()
+
+     def begin_step(self, step: str, alt_postfix: str | None = None) -> None:
+         self.step_start = datetime.now()
+         self.current_step = step
+         postfix = alt_postfix or self.current_step
+
+         if self.enabled:
+             self._set_postfix(postfix)
+             self._refresh()
+
+     def complete_step(
+         self, n: int = 1, from_time: datetime | None = None, save_as: str | None = None
+     ) -> None:
+         step = save_as or self.current_step
+         runtime = calculate_runtime(start=from_time or self.step_start)
+         if self.runtimes.get(step) is None:
+             self.runtimes[step] = runtime.timedelta
+         else:
+             self.runtimes[step] += runtime.timedelta
+
+         if self.enabled:
+             self._tick(n=n)
+
+     def finish(
+         self,
+         postfix: str | None = "Completed",
+         save_as: str = "Total",
+         good: bool | None = None,
+     ) -> None:
+         self.complete_step(n=0, from_time=self.full_start, save_as=save_as)
+
+         if not self.enabled:
+             return
+
+         postfix = (
+             f"{BOLD_GREEN if good else BOLD_RED if good is False else ''}"
+             + (postfix or "")
+             + f"{'' if good is None else RESET}"
+         )
+         self._set_postfix(postfix)
+         completed_steps = int(getattr(self.bar, "n", 0))
+         if completed_steps <= 0:
+             self.bar.total = 1
+             self.bar.update(1)
+             self.completed_steps = 1
+
+         else:
+             self.bar.total = completed_steps
+             if self.bar.n < completed_steps:
+                 self.bar.update(completed_steps - self.bar.n)
+             self.completed_steps = completed_steps
+         self._refresh()
+
+     def close(self) -> None:
+         if not self.enabled:
+             return
+
+         if self.bar:
+             try:
+                 self.bar.close()
+             finally:
+                 self.bar = None
+
+     # Helpers
+     def _refresh(self) -> None:
+         if not self.enabled:
+             return
+
+         self.bar.refresh()
+
+     def _tick(self, n: int = 1) -> None:
+         self.completed_steps += n
+         if not self.enabled:
+             return
+
+         if n:
+             self.bar.update(n)
+         self._refresh()
+
+     def _set_postfix(self, postfix: str) -> None:
+         if not self.enabled:
+             return
+
+         postfix = postfix or ""
+         self.postfix = postfix
+         self.bar.set_postfix_str(postfix)
+         self._refresh()
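
A minimal usage sketch for Progress (not part of the package; the step names and workload are illustrative only):

from valediction.progress import Progress

with Progress(desc="Pipeline", est_total=2, unit="step") as progress:
    progress.begin_step("Importing Data")
    ...  # do the import work here
    progress.complete_step()

    progress.begin_step("Validating")
    ...  # do the validation work here
    progress.complete_step()

    # Clamp the bar to the steps actually completed and show a green postfix
    progress.finish(postfix="Completed", good=True)
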
valediction/support.py ADDED
@@ -0,0 +1,72 @@
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta
+ from math import trunc
+
+ BOLD_RED = "\033[1;31m"
+ BOLD_GREEN = "\033[1;92m"
+ RED = "\033[31m"
+ GREEN = "\033[92m"
+ RESET = "\033[0m"
+
+
+ @dataclass
+ class Runtime:
+     message: str
+     timedelta: timedelta
+
+
+ def print_bold_red(message: str, end: str | None = "\n") -> None:
+     print(f"{BOLD_RED}{message}{RESET}", end=end)
+
+
+ def print_bold_green(message: str, end: str | None = "\n") -> None:
+     print(f"{BOLD_GREEN}{message}{RESET}", end=end)
+
+
+ def print_green(message: str, end: str | None = "\n") -> None:
+     print(f"{GREEN}{message}{RESET}", end=end)
+
+
+ def print_red(message: str, end: str | None = "\n") -> None:
+     print(f"{RED}{message}{RESET}", end=end)
+
+
+ def list_as_bullets(elements: list, bullet: str = "\n - ") -> str:
+     return bullet + bullet.join(elements)
+
+
+ def _normalise_name(name: str) -> str:
+     return name.strip().upper()
+
+
+ def _get_runtime_string(runtime: timedelta) -> str:
+     total_seconds = runtime.total_seconds()
+     hours = trunc(total_seconds / 3600)
+     minutes = trunc((total_seconds - (hours * 3600)) / 60)
+     seconds = trunc((total_seconds - (hours * 3600) - (minutes * 60)) * 10) / 10
+     runtime_string = (
+         (f"{hours}h " if hours else "")
+         + (f"{minutes}m " if minutes else "")
+         + (f"{seconds}s" if not hours and not minutes else f"{trunc(seconds)}s")
+     )
+     return runtime_string
+
+
+ def calculate_runtime(start: datetime, stop: datetime | None = None) -> Runtime:
+     """
+     Summary:
+         - Takes two datetimes and calculates the difference.
+         - Returns a Runtime dataclass holding a formatted message and the raw
+           timedelta, accessible as .message and .timedelta.
+
+     Args:
+         start (datetime): Start time for the calculation.
+         stop (datetime, optional): Stop time for the calculation. Defaults to now.
+
+     Returns:
+         Runtime: dataclass with .message (str) and .timedelta (timedelta).
+     """
+     stop = stop if stop else datetime.now()
+     runtime = stop - start
+     runtime_string = _get_runtime_string(runtime)
+
+     return Runtime(message=runtime_string, timedelta=runtime)
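
A small sketch of calculate_runtime (the one-hour offset below is illustrative):

from datetime import datetime, timedelta

from valediction.support import calculate_runtime

start = datetime.now() - timedelta(hours=1, minutes=5, seconds=30)
runtime = calculate_runtime(start)  # stop defaults to now
print(runtime.message)    # "1h 5m 30s"
print(runtime.timedelta)  # raw timedelta, e.g. for summing step runtimes
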