valediction 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. valediction/__init__.py +8 -0
  2. valediction/convenience.py +50 -0
  3. valediction/data_types/__init__.py +0 -0
  4. valediction/data_types/data_type_helpers.py +75 -0
  5. valediction/data_types/data_types.py +58 -0
  6. valediction/data_types/type_inference.py +541 -0
  7. valediction/datasets/__init__.py +0 -0
  8. valediction/datasets/datasets.py +870 -0
  9. valediction/datasets/datasets_helpers.py +46 -0
  10. valediction/demo/DEMO - Data Dictionary.xlsx +0 -0
  11. valediction/demo/DEMOGRAPHICS.csv +101 -0
  12. valediction/demo/DIAGNOSES.csv +650 -0
  13. valediction/demo/LAB_TESTS.csv +1001 -0
  14. valediction/demo/VITALS.csv +1001 -0
  15. valediction/demo/__init__.py +6 -0
  16. valediction/demo/demo_dictionary.py +129 -0
  17. valediction/dictionary/__init__.py +0 -0
  18. valediction/dictionary/exporting.py +501 -0
  19. valediction/dictionary/exporting_helpers.py +371 -0
  20. valediction/dictionary/generation.py +357 -0
  21. valediction/dictionary/helpers.py +174 -0
  22. valediction/dictionary/importing.py +494 -0
  23. valediction/dictionary/integrity.py +37 -0
  24. valediction/dictionary/model.py +582 -0
  25. valediction/dictionary/template/PROJECT - Data Dictionary.xltx +0 -0
  26. valediction/exceptions.py +22 -0
  27. valediction/integrity.py +97 -0
  28. valediction/io/__init__.py +0 -0
  29. valediction/io/csv_readers.py +307 -0
  30. valediction/progress.py +206 -0
  31. valediction/support.py +72 -0
  32. valediction/validation/__init__.py +0 -0
  33. valediction/validation/helpers.py +315 -0
  34. valediction/validation/issues.py +280 -0
  35. valediction/validation/validation.py +598 -0
  36. valediction-1.0.0.dist-info/METADATA +15 -0
  37. valediction-1.0.0.dist-info/RECORD +38 -0
  38. valediction-1.0.0.dist-info/WHEEL +4 -0
valediction/io/csv_readers.py ADDED
@@ -0,0 +1,307 @@
+ # valediction/io/csv_readers.py
+ from __future__ import annotations
+
+ import os
+ from dataclasses import dataclass, replace
+ from math import ceil
+ from pathlib import Path
+ from typing import Iterator, NamedTuple
+
+ import pandas as pd
+ from pandas import DataFrame
+ from pandas.errors import ParserError
+
+ from valediction.support import _normalise_name
+
+
+ class FrameChunk(NamedTuple):
+     """A chunk of rows + I/O metadata.
+
+     - start/end are 0-based inclusive row numbers in the full dataset.
+     - file_pos/total_size/bytes_read are None when not reading from disk.
+     """
+
+     df: DataFrame
+     start: int  # 0-based, inclusive
+     end: int  # 0-based, inclusive
+     total_size: int | None  # bytes of the whole file
+     file_pos: int | None  # f.tell() after producing this chunk
+     bytes_read: int | None  # bytes consumed to produce this chunk
+     chunk_index: int | None  # 1-based index of this chunk
+
+     # Cumulative Totals
+     total_bytes_read: int | None
+     total_chunks_seen: int | None
+
+     def estimate_chunk_count(self) -> int:
+         # Buffers (accounting for CSV tails/byte-count inaccuracy)
+         EPS_ABS = 4096  # Fixed
+         EPS_REL = 0.05  # 5% tail buffer
+
+         bytes_seen = int(self.total_bytes_read)
+         chunks_seen = max(1, int(self.total_chunks_seen))
+         average = max(1.0, bytes_seen / float(chunks_seen))
+
+         remaining = max(0, int(self.total_size) - bytes_seen)
+
+         # Treat a small remaining tail as already complete
+         tail_thresh = max(EPS_ABS, int(EPS_REL * average))
+         if remaining <= tail_thresh:
+             remaining = 0
+
+         return chunks_seen + (0 if remaining == 0 else int(ceil(remaining / average)))
+
+     def update_df(self, df: DataFrame) -> FrameChunk:
+         return self._replace(df=df)
+
+
+ @dataclass(slots=True)
+ class CsvReadConfig:
+     """Canonical CSV reading defaults for the overall project.
+
+     Notes:
+     - dtype="string" always reads columns as string, permitting downstream inference/validation.
+     - keep_default_na=False and na_values=[] prevent pandas from coercing tokens like "NA".
+     - We normalise headers and strip string values post-read (vectorised).
+     """
+
+     dtype: str = "string"
+     keep_default_na: bool = False
+     na_values: list[str] | None = None
+     encoding: str = "utf-8"
+     normalise_headers: bool = True
+     strip_values: bool = True
+     usecols: list[str] | None = None
+
+     def __post_init__(self) -> None:
+         if self.na_values is None:
+             self.na_values = []
+
+
+ def _kwargs(cfg: CsvReadConfig | None = None) -> dict:
+     cfg = cfg or CsvReadConfig()
+     return dict(
+         dtype=cfg.dtype,
+         keep_default_na=cfg.keep_default_na,
+         na_values=cfg.na_values,
+         encoding=cfg.encoding,
+         usecols=cfg.usecols,
+     )
+
+
+ def _post_read_processing(df: DataFrame, cfg: CsvReadConfig) -> DataFrame:
+     """Apply header normalisation and vectorised value stripping after reading."""
+     cfg = cfg or CsvReadConfig()
+     if cfg.normalise_headers:
+         df = df.rename(columns={c: _normalise_name(c) for c in df.columns})
+     if cfg.strip_values:
+         str_cols = df.select_dtypes(include=["string"]).columns
+         if len(str_cols) > 0:
+             df[str_cols] = df[str_cols].apply(lambda s: s.str.strip())
+     return df
+
+
+ def read_csv_headers(path: str | Path, cfg: CsvReadConfig | None = None) -> DataFrame:
+     """Read headers only (nrows=0) with canonical settings; returns a DataFrame."""
+     # Copy rather than mutate the caller's config when disabling value stripping
+     cfg = replace(cfg, strip_values=False) if cfg else CsvReadConfig(strip_values=False)
+
+     try:
+         header = pd.read_csv(path, nrows=0, **_kwargs(cfg))
+         return _post_read_processing(header, cfg)
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading header from '{path}': {e}"
+         ) from e
+
+
+ def read_csv_all(path: str | Path, cfg: CsvReadConfig | None = None) -> FrameChunk:
+     """Read the entire CSV with canonical settings; returns a single FrameChunk."""
+     cfg = cfg or CsvReadConfig()
+     try:
+         file_size = os.path.getsize(path)
+
+         with open(path, "rb") as file:
+             start_pos = file.tell()
+             df = pd.read_csv(file, **_kwargs(cfg))
+             end_pos = file.tell()
+         df = _post_read_processing(df, cfg)
+         n = len(df)
+
+         return FrameChunk(
+             df=df,
+             start=0,
+             end=n - 1,
+             total_size=file_size,
+             file_pos=end_pos,
+             bytes_read=end_pos - start_pos,
+             chunk_index=1,
+             total_bytes_read=file_size,
+             total_chunks_seen=1,
+         )
+     except ParserError as e:
+         raise ParserError(f"Malformed CSV while reading '{path}': {e}") from e
+
+
+ def read_csv_sample(
+     path: str | Path, nrows: int, cfg: CsvReadConfig | None = None
+ ) -> FrameChunk:
+     """Read the first `nrows` with canonical settings; returns a FrameChunk with I/O
+     metadata."""
+     cfg = cfg or CsvReadConfig()
+
+     try:
+         file_size = os.path.getsize(path)
+         with open(path, "rb") as file:
+             start_pos = file.tell()
+             df = pd.read_csv(file, nrows=nrows, **_kwargs(cfg))
+             end_pos = file.tell()
+
+         df = _post_read_processing(df, cfg)
+         n = len(df)
+
+         bytes_read = (end_pos - start_pos) if end_pos > 0 else None
+         file_pos = end_pos if end_pos > 0 else None
+
+         return FrameChunk(
+             df=df,
+             start=0,
+             end=n - 1,
+             total_size=file_size,
+             file_pos=file_pos,
+             bytes_read=bytes_read,
+             chunk_index=1,
+             total_bytes_read=bytes_read or 0,
+             total_chunks_seen=1,
+         )
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading sample from '{path}': {e}"
+         ) from e
+
+
+ def iter_csv_chunks(
+     path: str | Path, chunk_size: int | None, cfg: CsvReadConfig | None = None
+ ) -> Iterator[FrameChunk]:
+     """Yield FrameChunks with canonical settings.
+
+     Behaviour:
+     - If chunk_size is None or <= 0: yields a single chunk for the entire file.
+     - Else: yields multiple chunks, each with populated bytes/position metadata.
+     """
+     cfg = cfg or CsvReadConfig()
+     try:
+         file_size = os.path.getsize(path)
+
+         # No chunking: one full-file chunk with metadata
+         if not chunk_size or (isinstance(chunk_size, int) and chunk_size <= 0):
+             with open(path, "rb") as file:
+                 start_pos = file.tell()
+                 df = pd.read_csv(file, **_kwargs(cfg))
+                 end_pos = file.tell()
+             df = _post_read_processing(df, cfg)
+             n = len(df)
+             if n == 0:
+                 return
+             yield FrameChunk(
+                 df=df,
+                 start=0,
+                 end=n - 1,
+                 total_size=file_size,
+                 file_pos=end_pos,
+                 bytes_read=end_pos - start_pos,
+                 chunk_index=1,
+                 total_bytes_read=end_pos - start_pos,
+                 total_chunks_seen=1,
+             )
+             return
+
+         # Chunking: stream with bytes/pos metadata
+         with open(path, "rb") as file:
+             reader = pd.read_csv(file, chunksize=chunk_size, **_kwargs(cfg))
+             prev_pos = file.tell()
+             offset = 0
+             idx = 0
+             cumulative_bytes = 0
+             for raw in reader:
+                 idx += 1
+                 curr_pos = file.tell()
+                 bytes_read = max(0, curr_pos - prev_pos)
+                 prev_pos = curr_pos
+                 cumulative_bytes += bytes_read
+
+                 df = _post_read_processing(raw, cfg)
+                 n = len(df)
+                 if n == 0:
+                     continue
+
+                 start = offset
+                 end = offset + n - 1
+                 offset += n
+
+                 yield FrameChunk(
+                     df=df,
+                     start=start,
+                     end=end,
+                     total_size=file_size,
+                     file_pos=curr_pos,
+                     bytes_read=bytes_read,
+                     chunk_index=idx,
+                     total_bytes_read=cumulative_bytes,
+                     total_chunks_seen=idx,
+                 )
+
+     except ParserError as e:
+         raise ParserError(
+             f"Malformed CSV while reading chunks from '{path}': {e}"
+         ) from e
+
+
+ # Reading specific ranges
+ def _intersect_local_spans(
+     ranges: list[tuple[int, int]],
+     chunk_start: int,
+     chunk_end: int,
+ ) -> list[tuple[int, int]]:
+     out: list[tuple[int, int]] = []
+     for r_start, r_end in ranges:
+         lo = max(r_start, chunk_start)
+         hi = min(r_end, chunk_end)
+         if hi >= lo:
+             out.append((lo - chunk_start, hi - chunk_start))
+     return out
+
+
+ def read_csv_ranges(
+     path: str | Path,
+     ranges: list[tuple[int, int]],
+     cfg: CsvReadConfig | None = None,
+     chunk_size: int | None = 1_000_000,
+ ) -> pd.DataFrame:
+     """Read only the rows covered by `ranges` (global 0-based inclusive pairs).
+
+     Respects CsvReadConfig (including usecols for column pruning). Returns a
+     concatenated DataFrame (may be empty).
+     """
+     if not ranges:
+         # Honour columns if specified
+         cols = cfg.usecols if (cfg and cfg.usecols) else None
+         return pd.DataFrame(columns=cols) if cols else pd.DataFrame()
+
+     parts: list[pd.DataFrame] = []
+     for chunk in iter_csv_chunks(path, chunk_size=chunk_size, cfg=cfg):
+         local_spans = _intersect_local_spans(ranges, chunk.start, chunk.end)
+         if not local_spans:
+             continue
+
+         for lo, hi in local_spans:
+             part = chunk.df.iloc[lo : hi + 1]
+             parts.append(part)
+
+     if not parts:
+         cols = cfg.usecols if (cfg and cfg.usecols) else None
+         return pd.DataFrame(columns=cols) if cols else pd.DataFrame()
+
+     return pd.concat(parts, axis=0, ignore_index=False)
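
A minimal usage sketch for this module (not part of the package; the file name, chunk size, and row ranges below are hypothetical):

from valediction.io.csv_readers import CsvReadConfig, iter_csv_chunks, read_csv_ranges

cfg = CsvReadConfig()  # canonical defaults: string dtype, no NA coercion

# Stream a hypothetical demo.csv in 50,000-row chunks, using each FrameChunk's
# byte metadata to estimate the total number of chunks as we go
for chunk in iter_csv_chunks("demo.csv", chunk_size=50_000, cfg=cfg):
    print(f"rows {chunk.start}-{chunk.end}; ~{chunk.estimate_chunk_count()} chunks expected")

# Re-read only rows 0-9 and 100-199 (global, 0-based, inclusive)
subset = read_csv_ranges("demo.csv", ranges=[(0, 9), (100, 199)], cfg=cfg)
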
valediction/progress.py ADDED
@@ -0,0 +1,206 @@
+ # valediction/progress.py
+ from __future__ import annotations
+
+ from datetime import datetime, timedelta
+
+ from tqdm import tqdm
+
+ from valediction.support import BOLD_GREEN, BOLD_RED, RESET, calculate_runtime
+
+ FORMAT_KNOWN_TOTAL = (
+     "{desc} {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
+     "[{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+ )
+
+ FORMAT_UNKNOWN_TOTAL = (
+     "{desc} {percentage:3.0f}%|{bar}| ?/? [{elapsed}<{remaining}, {rate_fmt}{postfix}]"
+ )
+
+
+ class Progress:
+     def __init__(
+         self,
+         desc: str = "",
+         est_total: int | None = 1,
+         smoothing_steps: int = 0,
+         unit: str = "step",
+         starting_step: str | None = None,
+         enabled: bool = True,
+     ) -> None:
+         """Progress bar (tqdm) with manual control.
+
+         Args:
+             desc (str): label shown to the left of the bar. Defaults to "".
+             est_total (int, optional): initial total number of steps (can grow/shrink
+                 later). Defaults to 1.
+             smoothing_steps (int, optional): window length of previous steps used to
+                 approximate the ETA. Use 0 for a global average. Defaults to 0.
+             unit (str, optional): display unit. Defaults to "step".
+             starting_step (str, optional): initial step and starting postfix,
+                 e.g. "Importing Data". Defaults to None.
+             enabled (bool, optional): allows switching the bar off, avoiding
+                 duplication of upstream checks. Defaults to True.
+         """
+         self.enabled: bool = enabled
+         self.desc: str = desc
+         self.est_total: int | None = est_total
+         self.smoothing_steps: int = max(0, int(smoothing_steps or 0))
+         self.unit: str = unit
+         self.postfix: str = ""
+
+         # Bar
+         self.bar: tqdm | None = None
+         self.total_steps: int | None = self.est_total
+         self.completed_steps: int = 0
+
+         # Runtimes
+         self.full_start: datetime | None = None
+         self.step_start: datetime | None = None
+         self.current_step = starting_step or ""
+         self.runtimes: dict[str, timedelta] = {}
+
+         self.__init_progress_bar()
+
+     # Context
+     def __enter__(self) -> Progress:
+         return self
+
+     def __exit__(self, exc_type, exc, tb) -> None:
+         self.close()
+
+     # Initialisation
+     def __init_progress_bar(self) -> None:
+         now = datetime.now()
+         self.full_start = now
+         self.step_start = now
+
+         if not self.enabled:
+             return
+
+         smoothing = (
+             0.0 if self.smoothing_steps == 0 else 2.0 / (self.smoothing_steps + 1)
+         )
+
+         self.bar = tqdm(
+             total=self.total_steps,
+             unit=self.unit,
+             desc=self.desc,
+             smoothing=smoothing,
+         )
+         self.__set_bar_format()
+         if self.current_step:
+             self.bar.set_postfix_str(self.current_step)
+
+     def __set_bar_format(self) -> None:
+         if self.est_total:
+             self.bar.bar_format = FORMAT_KNOWN_TOTAL
+         else:
+             self.bar.bar_format = FORMAT_UNKNOWN_TOTAL
+
+     # Management
+     def retarget_total(self, new_total: int) -> None:
+         if not self.enabled:
+             return
+
+         new_total = max(1, int(new_total))
+         self.total_steps = new_total
+         self.est_total = new_total
+         self.__set_bar_format()
+
+         if self.bar is None:
+             return
+
+         if int(self.bar.total or 0) == new_total:
+             return
+
+         self.bar.total = new_total
+         self._refresh()
+
+     def begin_step(self, step: str, alt_postfix: str | None = None) -> None:
+         self.step_start = datetime.now()
+         self.current_step = step
+         postfix = alt_postfix or self.current_step
+
+         if self.enabled:
+             self._set_postfix(postfix)
+             self._refresh()
+
+     def complete_step(
+         self, n: int = 1, from_time: datetime | None = None, save_as: str | None = None
+     ) -> None:
+         step = save_as or self.current_step
+         runtime = calculate_runtime(start=from_time or self.step_start)
+         if self.runtimes.get(step) is None:
+             self.runtimes[step] = runtime.timedelta
+         else:
+             self.runtimes[step] += runtime.timedelta
+
+         if self.enabled:
+             self._tick(n=n)
+
+     def finish(
+         self,
+         postfix: str | None = "Completed",
+         save_as: str = "Total",
+         good: bool | None = None,
+     ) -> None:
+         self.complete_step(n=0, from_time=self.full_start, save_as=save_as)
+
+         if not self.enabled:
+             return
+
+         postfix = (
+             f"{BOLD_GREEN if good else BOLD_RED if good is False else ''}"
+             + (postfix or "")
+             + f"{'' if good is None else RESET}"
+         )
+         self._set_postfix(postfix)
+         completed_steps = int(getattr(self.bar, "n", 0))
+         if completed_steps <= 0:
+             self.bar.total = 1
+             self.bar.update(1)
+             self.completed_steps = 1
+
+         else:
+             self.bar.total = completed_steps
+             if self.bar.n < completed_steps:
+                 self.bar.update(completed_steps - self.bar.n)
+             self.completed_steps = completed_steps
+         self._refresh()
+
+     def close(self) -> None:
+         if not self.enabled:
+             return
+
+         if self.bar:
+             try:
+                 self.bar.close()
+             finally:
+                 self.bar = None
+
+     # Helpers
+     def _refresh(self) -> None:
+         if not self.enabled:
+             return
+
+         self.bar.refresh()
+
+     def _tick(self, n: int = 1) -> None:
+         self.completed_steps += n
+         if not self.enabled:
+             return
+
+         if n:
+             self.bar.update(n)
+         self._refresh()
+
+     def _set_postfix(self, postfix: str) -> None:
+         if not self.enabled:
+             return
+
+         postfix = postfix or ""
+         self.postfix = postfix
+         self.bar.set_postfix_str(postfix)
+         self._refresh()
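
A minimal usage sketch for Progress (not part of the package; the step names and workload are illustrative only):

from valediction.progress import Progress

with Progress(desc="Pipeline", est_total=2, unit="step") as progress:
    progress.begin_step("Importing Data")
    ...  # do the import work here
    progress.complete_step()

    progress.begin_step("Validating")
    ...  # do the validation work here
    progress.complete_step()

    # Clamp the bar to the steps actually completed and show a green postfix
    progress.finish(postfix="Completed", good=True)
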
valediction/support.py ADDED
@@ -0,0 +1,72 @@
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta
+ from math import trunc
+
+ BOLD_RED = "\033[1;31m"
+ BOLD_GREEN = "\033[1;92m"
+ RED = "\033[31m"
+ GREEN = "\033[92m"
+ RESET = "\033[0m"
+
+
+ @dataclass
+ class Runtime:
+     message: str
+     timedelta: timedelta
+
+
+ def print_bold_red(message: str, end: str | None = "\n") -> None:
+     print(f"{BOLD_RED}{message}{RESET}", end=end)
+
+
+ def print_bold_green(message: str, end: str | None = "\n") -> None:
+     print(f"{BOLD_GREEN}{message}{RESET}", end=end)
+
+
+ def print_green(message: str, end: str | None = "\n") -> None:
+     print(f"{GREEN}{message}{RESET}", end=end)
+
+
+ def print_red(message: str, end: str | None = "\n") -> None:
+     print(f"{RED}{message}{RESET}", end=end)
+
+
+ def list_as_bullets(elements: list, bullet: str = "\n - ") -> str:
+     return bullet + bullet.join(elements)
+
+
+ def _normalise_name(name: str) -> str:
+     return name.strip().upper()
+
+
+ def _get_runtime_string(runtime: timedelta) -> str:
+     total_seconds = runtime.total_seconds()
+     hours = trunc(total_seconds / 3600)
+     minutes = trunc((total_seconds - (hours * 3600)) / 60)
+     seconds = trunc((total_seconds - (hours * 3600) - (minutes * 60)) * 10) / 10
+     runtime_string = (
+         (f"{hours}h " if hours else "")
+         + (f"{minutes}m " if minutes else "")
+         + (f"{seconds}s" if not hours and not minutes else f"{trunc(seconds)}s")
+     )
+     return runtime_string
+
+
+ def calculate_runtime(start: datetime, stop: datetime | None = None) -> Runtime:
+     """
+     Summary:
+         - Takes two datetimes and calculates the difference.
+         - Returns a Runtime dataclass holding a formatted message and the raw
+           timedelta, accessible as .message and .timedelta.
+
+     Args:
+         start (datetime): Start time for the calculation.
+         stop (datetime, optional): Stop time for the calculation. Defaults to now.
+
+     Returns:
+         Runtime: dataclass with .message (str) and .timedelta (timedelta).
+     """
+     stop = stop if stop else datetime.now()
+     runtime = stop - start
+     runtime_string = _get_runtime_string(runtime)
+
+     return Runtime(message=runtime_string, timedelta=runtime)
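
A small sketch of calculate_runtime (the one-hour offset below is illustrative):

from datetime import datetime, timedelta

from valediction.support import calculate_runtime

start = datetime.now() - timedelta(hours=1, minutes=5, seconds=30)
runtime = calculate_runtime(start)  # stop defaults to now
print(runtime.message)    # "1h 5m 30s"
print(runtime.timedelta)  # raw timedelta, e.g. for summing step runtimes
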