PyPI - markinp - Versions diffs - 0.1.0__py3-none-any.whl - Mend

markinp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

markinp/__init__.py +47 -0
markinp/build.py +314 -0
markinp/cli.py +233 -0
markinp/diagnostics.py +226 -0
markinp/model.py +115 -0
markinp/parse.py +257 -0
markinp/py.typed +0 -0
markinp/report.py +173 -0
markinp/tokens.py +48 -0
markinp/validate.py +191 -0
markinp/write.py +40 -0
markinp-0.1.0.dist-info/METADATA +242 -0
markinp-0.1.0.dist-info/RECORD +16 -0
markinp-0.1.0.dist-info/WHEEL +4 -0
markinp-0.1.0.dist-info/entry_points.txt +2 -0
markinp-0.1.0.dist-info/licenses/LICENSE +21 -0

markinp/__init__.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""markinp — read, validate, and build Program MARK encounter-history files.
+markinp is an independent, unofficial utility. It is not affiliated with,
+endorsed by, or maintained by the authors of Program MARK or RMark. "MARK" is
+referenced only to describe the file format it interoperates with.
+Public API (library-first; the CLI is a thin wrapper over these):
+    >>> from markinp import parse_text, validate
+    >>> result = parse_text("1001 1;\\n0101 2;\\n")
+    >>> diagnostics = validate(result.dataset)
+"""
+from __future__ import annotations
+from .build import BuildOptions, BuildResult, build_dataset, build_file
+from .model import (
+    Dataset,
+    DataType,
+    Diagnostic,
+    EncounterHistory,
+    Severity,
+)
+from .parse import ParseResult, parse_file, parse_text
+from .validate import validate
+from .write import write_file, write_text
+# Package version string, also exported through ``__all__`` below.
+__version__ = "0.1.0"
+# Public names re-exported from the submodules imported above; this list is
+# what ``from markinp import *`` exposes.
+__all__ = [
+    "BuildOptions",
+    "BuildResult",
+    "Dataset",
+    "DataType",
+    "Diagnostic",
+    "EncounterHistory",
+    "ParseResult",
+    "Severity",
+    "__version__",
+    "build_dataset",
+    "build_file",
+    "parse_file",
+    "parse_text",
+    "validate",
+    "write_file",
+    "write_text",
+]

markinp/build.py ADDED Viewed

@@ -0,0 +1,314 @@
+"""Build a :class:`~markinp.model.Dataset` from a tidy capture table (CSV).
+Two layouts are supported:
+* **long** — one row per (individual x occasion) with a 0/1 detection flag.
+* **wide** — one row per individual, either as occasion columns or a single
+  prebuilt ``history`` column.
+The builder produces a deterministic dataset: individuals are collapsed by
+identical (history, covariates) when requested, groups and covariates are read
+in a stable order, and the caller is expected to run :mod:`markinp.validate` on
+the result before writing (the CLI does this and refuses to write a bad file).
+"""
+from __future__ import annotations
+import csv
+from dataclasses import dataclass, field
+from io import StringIO
+from pathlib import Path
+from . import diagnostics as dx
+from .model import Dataset, DataType, Diagnostic, EncounterHistory, Severity
+from .tokens import is_float_token, is_missing_marker
+# Cell spellings read as a detection ('1'). Cells are stripped and lower-cased
+# before lookup, so every entry here must be lower-case.
+_TRUE = {"1", "y", "yes", "true", "t", "detected", "seen"}
+# Cell spellings read as a non-detection ('0'); includes the empty cell and ".".
+_FALSE = {"0", "n", "no", "false", "f", "", "."}
+@dataclass
+class BuildOptions:
+    """Column mappings and toggles for :func:`build_dataset`."""
+    # Requested layout; anything other than "long"/"wide" lets _detect_format
+    # choose from the column options below.
+    fmt: str = "auto"  # "long" | "wide" | "auto"
+    # Column identifying the individual; required by the long reader.
+    id_col: str | None = None
+    # Column whose value orders an individual's rows (long reader).
+    occasion_col: str | None = None
+    # Column holding the detection flag for each row (long reader).
+    detect_col: str | None = None
+    # Column holding a prebuilt history string (wide reader); when set, occasion
+    # columns are not inferred.
+    history_col: str | None = None
+    # Column whose distinct non-empty values become the groups.
+    group_col: str | None = None
+    # Covariate columns, in output order.
+    covariate_cols: list[str] = field(default_factory=list)
+    # Column copied into each record's comment.
+    comment_col: str | None = None
+    # When True, individuals with identical (history, covariates) are merged
+    # into one record with summed frequencies.
+    collapse: bool = True
+@dataclass
+class _Individual:
+    """One individual before frequency vectors and collapsing are applied."""
+    # Encounter history string: either '0'/'1' flags joined in occasion order,
+    # or the raw stripped cell of a prebuilt history column.
+    history: str
+    # Stripped group label, or None when no group column is set or the cell is empty.
+    group: str | None
+    # One string per configured covariate column: canonicalized when numeric,
+    # otherwise the raw stripped cell.
+    covariates: list[str]
+    # Stripped comment cell, or None when no comment column is set or the cell is empty.
+    comment: str | None
+@dataclass
+class BuildResult:
+    """The built dataset (or ``None`` on hard failure) plus diagnostics."""
+    # None when the CSV contained no data rows (see build_dataset).
+    dataset: Dataset | None
+    # Diagnostics collected while reading the table; validation is run separately by the caller.
+    diagnostics: list[Diagnostic]
+    # Number of CSV data rows read (header excluded).
+    n_rows: int = 0
+def _detect_format(header: list[str], opts: BuildOptions) -> str:
+    """Choose long vs. wide when ``fmt == "auto"``."""
+    if opts.fmt in {"long", "wide"}:
+        return opts.fmt
+    if opts.history_col:
+        return "wide"
+    if opts.occasion_col or opts.detect_col:
+        return "long"
+    return "wide"
+def _normalize_detection(value: str, line: int) -> tuple[str, Diagnostic | None]:
+    """Map a detection cell to '0'/'1', or report an illegal value."""
+    token = value.strip().lower()
+    if token in _TRUE:
+        return "1", None
+    if token in _FALSE:
+        return "0", None
+    return "0", dx.mk005_illegal_history_char(line, value.strip())
+def _reserved_columns(opts: BuildOptions) -> set[str]:
+    reserved = set(opts.covariate_cols)
+    for col in (opts.id_col, opts.group_col, opts.comment_col, opts.history_col):
+        if col:
+            reserved.add(col)
+    return reserved
+def _covariates_for(
+    row: dict[str, str], opts: BuildOptions, line: int
+) -> tuple[list[str], list[Diagnostic]]:
+    """Extract and validate the covariate cells for one row."""
+    diagnostics: list[Diagnostic] = []
+    values: list[str] = []
+    for col in opts.covariate_cols:
+        raw = (row.get(col) or "").strip()
+        if is_missing_marker(raw) or not is_float_token(raw):
+            diagnostics.append(dx.mk007_missing_covariate(line, raw))
+            values.append(raw)
+        else:
+            values.append(_format_number(raw))
+    return values, diagnostics
+def _format_number(raw: str) -> str:
+    """Canonicalize a numeric string so equal values collapse identically."""
+    value = float(raw)
+    if value == int(value):
+        return str(int(value))
+    return repr(value)
+def _is_detection_token(value: str) -> bool:
+    return value.strip().lower() in _TRUE or value.strip().lower() in _FALSE
+def _occasion_columns(
+    rows: list[dict[str, str]], header: list[str], reserved: set[str]
+) -> list[str]:
+    """Pick occasion columns: unreserved columns whose cells are all 0/1 flags.
+    This lets ``id``/label columns coexist with occasion columns in wide format
+    without the user having to enumerate which is which — an id column holds
+    non-detection values and is skipped automatically.
+    """
+    occasion_cols: list[str] = []
+    for col in header:
+        if col in reserved:
+            continue
+        if all(_is_detection_token(row.get(col) or "") for row in rows):
+            occasion_cols.append(col)
+    return occasion_cols
+def _read_wide(
+    rows: list[dict[str, str]], header: list[str], opts: BuildOptions
+) -> tuple[list[_Individual], list[Diagnostic]]:
+    diagnostics: list[Diagnostic] = []
+    individuals: list[_Individual] = []
+    reserved = _reserved_columns(opts)
+    occasion_cols = [] if opts.history_col else _occasion_columns(rows, header, reserved)
+    for i, row in enumerate(rows):
+        line = i + 2  # +1 header, +1 to 1-based
+        if opts.history_col:
+            history = (row.get(opts.history_col) or "").strip()
+        else:
+            chars: list[str] = []
+            for col in occasion_cols:
+                char, diag = _normalize_detection(row.get(col) or "", line)
+                if diag:
+                    diagnostics.append(diag)
+                chars.append(char)
+            history = "".join(chars)
+        covs, cov_diags = _covariates_for(row, opts, line)
+        diagnostics.extend(cov_diags)
+        group = (row.get(opts.group_col) or "").strip() if opts.group_col else None
+        comment = (row.get(opts.comment_col) or "").strip() if opts.comment_col else None
+        individuals.append(_Individual(history, group or None, covs, comment or None))
+    return individuals, diagnostics
+def _read_long(
+    rows: list[dict[str, str]], opts: BuildOptions
+) -> tuple[list[_Individual], list[Diagnostic]]:
+    diagnostics: list[Diagnostic] = []
+    if not opts.id_col or not opts.occasion_col or not opts.detect_col:
+        return [], [_missing_long_columns()]
+    id_col = opts.id_col
+    occ_col = opts.occasion_col
+    det_col = opts.detect_col
+    # Preserve first-seen order of individuals.
+    order: list[str] = []
+    by_id: dict[str, list[tuple[str, dict[str, str], int]]] = {}
+    for i, row in enumerate(rows):
+        line = i + 2
+        ident = (row.get(id_col) or "").strip()
+        if ident not in by_id:
+            by_id[ident] = []
+            order.append(ident)
+        by_id[ident].append(((row.get(occ_col) or "").strip(), row, line))
+    individuals: list[_Individual] = []
+    for ident in order:
+        entries = by_id[ident]
+        entries.sort(key=lambda e: _occasion_key(e[0]))
+        chars: list[str] = []
+        for _occ_value, row, line in entries:
+            char, diag = _normalize_detection(row.get(det_col) or "", line)
+            if diag:
+                diagnostics.append(diag)
+            chars.append(char)
+        first_row = entries[0][1]
+        first_line = entries[0][2]
+        covs, cov_diags = _covariates_for(first_row, opts, first_line)
+        diagnostics.extend(cov_diags)
+        group = (first_row.get(opts.group_col) or "").strip() if opts.group_col else None
+        comment = (first_row.get(opts.comment_col) or "").strip() if opts.comment_col else None
+        individuals.append(_Individual("".join(chars), group or None, covs, comment or None))
+    return individuals, diagnostics
+def _missing_long_columns() -> Diagnostic:
+    return Diagnostic(
+        code="MK008",
+        severity=Severity.ERROR,
+        message="long format needs --id-col, --occasion-col, and --detect-col",
+        hint="Name the individual, occasion, and detection columns, or use --format wide",
+        line=None,
+    )
+def _occasion_key(value: str) -> tuple[int, float | str]:
+    """Sort occasions numerically when possible, else lexically."""
+    try:
+        return (0, float(value))
+    except ValueError:
+        return (1, value)
+@dataclass
+class _Bucket:
+    """Accumulator for one output record while collapsing individuals."""
+    # History string shared by every individual merged into this bucket.
+    history: str
+    # Covariate strings shared by every individual merged into this bucket.
+    covariates: list[str]
+    # Per-group counts of the individuals merged so far (one slot per group).
+    frequencies: list[int]
+    # Comment of the first individual; reset to None once a merged individual's
+    # comment differs.
+    comment: str | None
+    # Number of individuals folded into this bucket.
+    merged: int = 1
+def _assemble(individuals: list[_Individual], opts: BuildOptions) -> Dataset:
+    """Turn individuals into a Dataset with frequency vectors and optional collapse."""
+    groups = sorted({ind.group for ind in individuals if ind.group is not None})
+    group_labels = groups if groups else None
+    n_groups = len(groups) if groups else 1
+    cov_labels = list(opts.covariate_cols) if opts.covariate_cols else None
+    n_covariates = len(opts.covariate_cols)
+    def group_index(group: str | None) -> int:
+        return groups.index(group) if (groups and group is not None) else 0
+    buckets: list[_Bucket] = []
+    by_key: dict[tuple[str, tuple[str, ...]], _Bucket] = {}
+    for ind in individuals:
+        vector = [0] * n_groups
+        vector[group_index(ind.group)] += 1
+        key = (ind.history, tuple(ind.covariates))
+        existing = by_key.get(key) if opts.collapse else None
+        if existing is None:
+            bucket = _Bucket(ind.history, ind.covariates, vector, ind.comment)
+            buckets.append(bucket)
+            if opts.collapse:
+                by_key[key] = bucket
+        else:
+            for i, freq in enumerate(vector):
+                existing.frequencies[i] += freq
+            existing.merged += 1
+            # A collapsed group of individuals loses its per-individual label.
+            if existing.comment != ind.comment:
+                existing.comment = None
+    records: list[EncounterHistory] = []
+    for bucket in buckets:
+        raw_values = [str(f) for f in bucket.frequencies] + bucket.covariates
+        records.append(
+            EncounterHistory(
+                history=bucket.history,
+                frequencies=list(bucket.frequencies),
+                covariates=[float(c) for c in bucket.covariates if is_float_token(c)],
+                comment=bucket.comment,
+                line=0,
+                raw_values=raw_values,
+            )
+        )
+    return Dataset(
+        n_occasions=len(records[0].history) if records else 0,
+        n_groups=n_groups,
+        n_covariates=n_covariates,
+        group_labels=group_labels,
+        cov_labels=cov_labels,
+        data_type=DataType.LIVE_RECAPTURE,
+        records=records,
+    )
+def build_dataset(text: str, opts: BuildOptions) -> BuildResult:
+    """Build a dataset from CSV ``text`` according to ``opts``."""
+    reader = csv.DictReader(StringIO(text))
+    header = list(reader.fieldnames or [])
+    rows = list(reader)
+    if not rows:
+        return BuildResult(None, [dx.mk008_no_records()], 0)
+    fmt = _detect_format(header, opts)
+    if fmt == "long":
+        individuals, diagnostics = _read_long(rows, opts)
+    else:
+        individuals, diagnostics = _read_wide(rows, header, opts)
+    dataset = _assemble(individuals, opts)
+    return BuildResult(dataset, diagnostics, len(rows))
+def build_file(path: str | Path, opts: BuildOptions) -> BuildResult:
+    """Read a CSV file and build a dataset from it."""
+    text = Path(path).read_text(encoding="utf-8-sig")
+    return build_dataset(text, opts)

markinp/cli.py ADDED Viewed

@@ -0,0 +1,233 @@
+"""Command-line interface for markinp.
+This module contains no domain logic. It parses arguments, calls library
+functions, hands results to :mod:`markinp.report`, and sets the exit code.
+Everything it does is doable in a few lines of Python via the library.
+"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Annotated
+import typer
+from . import __version__, report
+from .build import BuildOptions, build_file
+from .model import DataType, Diagnostic
+from .parse import parse_file
+from .validate import validate
+from .write import write_file
+# The Typer application object; the functions below register on it through
+# @app.callback() and @app.command(), and the __main__ guard invokes it.
+app = typer.Typer(
+    add_completion=False,
+    no_args_is_help=True,
+    help="Read, validate, and build Program MARK encounter-history (.inp) files.",
+)
+def _parse_data_type(value: str | None) -> DataType | None:
+    if value is None:
+        return None
+    try:
+        return DataType(value.lower())
+    except ValueError as exc:
+        allowed = ", ".join(dt.value for dt in DataType)
+        raise typer.BadParameter(f"unknown data type; choose one of: {allowed}") from exc
+def _version_callback(show: bool) -> None:
+    if show:
+        typer.echo(f"markinp {__version__}")
+        raise typer.Exit()
+# Root callback of the Typer app: it only declares the global --version flag.
+# The flag's work happens in _version_callback, which echoes the version and
+# raises typer.Exit when the flag is set; the function body itself is empty.
+@app.callback()
+def main(
+    version: Annotated[
+        bool,
+        typer.Option(
+            "--version", callback=_version_callback, is_eager=True, help="Show version and exit."
+        ),
+    ] = False,
+) -> None:
+    """markinp — a friendly linter and builder for MARK .inp files."""
+@app.command(name="validate")
+def validate_cmd(
+    files: Annotated[
+        list[Path],
+        typer.Argument(help="One or more .inp files to validate.", exists=True, dir_okay=False),
+    ],
+    groups: Annotated[
+        int | None, typer.Option("--groups", help="Assert the expected number of groups.")
+    ] = None,
+    occasions: Annotated[
+        int | None, typer.Option("--occasions", help="Assert the expected history length.")
+    ] = None,
+    covariates: Annotated[
+        int | None, typer.Option("--covariates", help="Assert the expected covariate count.")
+    ] = None,
+    data_type: Annotated[
+        str | None, typer.Option("--data-type", help="Hint the data type for stricter checks.")
+    ] = None,
+    strict: Annotated[bool, typer.Option("--strict", help="Treat warnings as errors.")] = False,
+    as_json: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False,
+) -> None:
+    """Validate one or more .inp files and report precise, actionable diagnostics.
+    Exits non-zero if any file has an error, so it drops straight into CI and
+    pre-commit. With several files, ``--json`` emits an array of per-file objects.
+    """
+    dtype = _parse_data_type(data_type)
+    any_errors = False
+    payloads: list[dict[str, object]] = []
+    blocks: list[str] = []
+    for file in files:
+        result = parse_file(file)
+        diagnostics = result.diagnostics + validate(
+            result.dataset,
+            groups=groups,
+            occasions=occasions,
+            covariates=covariates,
+            data_type=dtype,
+        )
+        diagnostics.sort(key=lambda d: (d.line if d.line is not None else -1, d.code))
+        if report.has_errors(diagnostics, strict):
+            any_errors = True
+        path = str(file)
+        if as_json:
+            payloads.append(
+                report.validate_payload(result.dataset, diagnostics, strict=strict, path=path)
+            )
+        else:
+            blocks.append(
+                report.render_validate_human(result.dataset, diagnostics, strict=strict, path=path)
+            )
+    if as_json:
+        output: object = payloads[0] if len(payloads) == 1 else payloads
+        typer.echo(json.dumps(output, indent=2))
+    else:
+        typer.echo("\n\n".join(blocks))
+    if any_errors:
+        raise typer.Exit(1)
+@app.command()
+def inspect(
+    file: Annotated[
+        Path, typer.Argument(help="The .inp file to inspect.", exists=True, dir_okay=False)
+    ],
+    as_json: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False,
+) -> None:
+    """Summarize the inferred structure of an .inp file (read-only)."""
+    result = parse_file(file)
+    diagnostics = result.diagnostics + validate(result.dataset)
+    path = str(file)
+    if as_json:
+        typer.echo(report.render_inspect_json(result.dataset, diagnostics, path=path))
+    else:
+        typer.echo(report.render_inspect_human(result.dataset, diagnostics, path=path))
+@app.command()
+def build(
+    input_csv: Annotated[
+        Path, typer.Argument(help="Tidy capture table (CSV).", exists=True, dir_okay=False)
+    ],
+    output: Annotated[Path, typer.Option("-o", "--output", help="Path to write the .inp file.")],
+    fmt: Annotated[str, typer.Option("--format", help="Layout: long, wide, or auto.")] = "auto",
+    id_col: Annotated[
+        str | None, typer.Option("--id-col", help="Individual id column (long).")
+    ] = None,
+    occasion_col: Annotated[
+        str | None, typer.Option("--occasion-col", help="Occasion column (long).")
+    ] = None,
+    detect_col: Annotated[
+        str | None, typer.Option("--detect-col", help="0/1 detection column (long).")
+    ] = None,
+    history_col: Annotated[
+        str | None, typer.Option("--history-col", help="Prebuilt history column (wide).")
+    ] = None,
+    group_col: Annotated[
+        str | None, typer.Option("--group-col", help="Column defining groups.")
+    ] = None,
+    covariate_cols: Annotated[
+        str | None, typer.Option("--covariate-cols", help="Comma-separated covariate columns.")
+    ] = None,
+    comment_col: Annotated[
+        str | None, typer.Option("--comment-col", help="Column to write as /* comment */.")
+    ] = None,
+    collapse: Annotated[
+        bool, typer.Option("--collapse/--no-collapse", help="Aggregate identical histories.")
+    ] = True,
+    as_json: Annotated[
+        bool, typer.Option("--json", help="Emit a machine-readable build report.")
+    ] = False,
+) -> None:
+    """Build a valid, deterministic .inp file from a tidy capture table."""
+    opts = BuildOptions(
+        fmt=fmt,
+        id_col=id_col,
+        occasion_col=occasion_col,
+        detect_col=detect_col,
+        history_col=history_col,
+        group_col=group_col,
+        covariate_cols=[c.strip() for c in covariate_cols.split(",")] if covariate_cols else [],
+        comment_col=comment_col,
+        collapse=collapse,
+    )
+    result = build_file(input_csv, opts)
+    diagnostics: list[Diagnostic] = list(result.diagnostics)
+    if result.dataset is not None:
+        diagnostics += validate(result.dataset)
+    diagnostics.sort(key=lambda d: (d.line if d.line is not None else -1, d.code))
+    wrote = False
+    if result.dataset is not None and not report.has_errors(diagnostics):
+        write_file(result.dataset, output)
+        wrote = True
+    if as_json:
+        typer.echo(_build_json(result, diagnostics, str(output), wrote))
+    else:
+        typer.echo(_build_human(result, diagnostics, str(output), wrote))
+    if not wrote:
+        raise typer.Exit(1)
+def _build_human(result, diagnostics, output: str, wrote: bool) -> str:  # type: ignore[no-untyped-def]
+    lines = [f"markinp build -> {output}", ""]
+    n_records = len(result.dataset.records) if result.dataset else 0
+    lines.append(f"Read {result.n_rows} row(s); produced {n_records} record(s).")
+    if diagnostics:
+        lines.append("")
+        for diag in diagnostics:
+            lines.append(report._format_diagnostic(diag, strict=False))
+    lines.append("")
+    lines.append(f"Wrote {output}" if wrote else "Refused to write: fix the errors above.")
+    return "\n".join(lines)
+def _build_json(result, diagnostics, output: str, wrote: bool) -> str:  # type: ignore[no-untyped-def]
+    payload = {
+        "schema_version": report.SCHEMA_VERSION,
+        "command": "build",
+        "output": output,
+        "written": wrote,
+        "rows_read": result.n_rows,
+        "records": len(result.dataset.records) if result.dataset else 0,
+        "diagnostics": [report._diag_to_dict(d, strict=False) for d in diagnostics],
+    }
+    return json.dumps(payload, indent=2)
+# Run the Typer app when this module is executed directly rather than imported.
+if __name__ == "__main__":
+    app()