markinp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
markinp/__init__.py ADDED
@@ -0,0 +1,47 @@
1
+ """markinp — read, validate, and build Program MARK encounter-history files.
2
+
3
+ markinp is an independent, unofficial utility. It is not affiliated with,
4
+ endorsed by, or maintained by the authors of Program MARK or RMark. "MARK" is
5
+ referenced only to describe the file format it interoperates with.
6
+
7
+ Public API (library-first; the CLI is a thin wrapper over these):
8
+
9
+ >>> from markinp import parse_text, validate
10
+ >>> result = parse_text("1001 1;\\n0101 2;\\n")
11
+ >>> diagnostics = validate(result.dataset)
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from .build import BuildOptions, BuildResult, build_dataset, build_file
17
+ from .model import (
18
+ Dataset,
19
+ DataType,
20
+ Diagnostic,
21
+ EncounterHistory,
22
+ Severity,
23
+ )
24
+ from .parse import ParseResult, parse_file, parse_text
25
+ from .validate import validate
26
+ from .write import write_file, write_text
27
+
28
+ __version__ = "0.1.0"
29
+
30
+ __all__ = [
31
+ "BuildOptions",
32
+ "BuildResult",
33
+ "Dataset",
34
+ "DataType",
35
+ "Diagnostic",
36
+ "EncounterHistory",
37
+ "ParseResult",
38
+ "Severity",
39
+ "__version__",
40
+ "build_dataset",
41
+ "build_file",
42
+ "parse_file",
43
+ "parse_text",
44
+ "validate",
45
+ "write_file",
46
+ "write_text",
47
+ ]
markinp/build.py ADDED
@@ -0,0 +1,314 @@
1
+ """Build a :class:`~markinp.model.Dataset` from a tidy capture table (CSV).
2
+
3
+ Two layouts are supported:
4
+
5
+ * **long** — one row per (individual x occasion) with a 0/1 detection flag.
6
+ * **wide** — one row per individual, either as occasion columns or a single
7
+ prebuilt ``history`` column.
8
+
9
+ The builder produces a deterministic dataset: individuals are collapsed by
10
+ identical (history, covariates) when requested, groups and covariates are read
11
+ in a stable order, and the caller is expected to run :mod:`markinp.validate` on
12
+ the result before writing (the CLI does this and refuses to write a bad file).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import csv
18
+ from dataclasses import dataclass, field
19
+ from io import StringIO
20
+ from pathlib import Path
21
+
22
+ from . import diagnostics as dx
23
+ from .model import Dataset, DataType, Diagnostic, EncounterHistory, Severity
24
+ from .tokens import is_float_token, is_missing_marker
25
+
26
+ _TRUE = {"1", "y", "yes", "true", "t", "detected", "seen"}
27
+ _FALSE = {"0", "n", "no", "false", "f", "", "."}
28
+
29
+
30
@dataclass
class BuildOptions:
    """Caller-supplied column names and switches consumed by :func:`build_dataset`.

    Every column option defaults to ``None`` (not supplied); ``covariate_cols``
    defaults to a fresh empty list per instance.
    """

    # Layout selector: "long", "wide", or "auto" (choose from the other options).
    fmt: str = "auto"
    # Column identifying the individual; required by the long-format reader.
    id_col: str | None = None
    # Column holding the occasion label; required by the long-format reader.
    occasion_col: str | None = None
    # Column holding the detection flag; required by the long-format reader.
    detect_col: str | None = None
    # Column holding a ready-made history string (wide format).
    history_col: str | None = None
    # Column whose distinct values define the groups.
    group_col: str | None = None
    # Covariate column names, in the order they are emitted.
    covariate_cols: list[str] = field(default_factory=list)
    # Column carried through as the per-record comment.
    comment_col: str | None = None
    # Merge individuals that share an identical (history, covariates) pair.
    collapse: bool = True
43
+
44
+
45
+ @dataclass
46
+ class _Individual:
47
+ """One individual before frequency vectors and collapsing are applied."""
48
+
49
+ history: str
50
+ group: str | None
51
+ covariates: list[str]
52
+ comment: str | None
53
+
54
+
55
@dataclass
class BuildResult:
    """Outcome of a build: the dataset (``None`` on hard failure) and its diagnostics."""

    # The assembled dataset; None when nothing could be built.
    dataset: Dataset | None
    # Every diagnostic raised while reading the table.
    diagnostics: list[Diagnostic]
    # Number of CSV data rows that were read.
    n_rows: int = 0
62
+
63
+
64
+ def _detect_format(header: list[str], opts: BuildOptions) -> str:
65
+ """Choose long vs. wide when ``fmt == "auto"``."""
66
+ if opts.fmt in {"long", "wide"}:
67
+ return opts.fmt
68
+ if opts.history_col:
69
+ return "wide"
70
+ if opts.occasion_col or opts.detect_col:
71
+ return "long"
72
+ return "wide"
73
+
74
+
75
def _normalize_detection(value: str, line: int) -> tuple[str, Diagnostic | None]:
    """Translate one detection cell into '1' or '0'.

    Returns ``(char, None)`` for a recognised spelling. An unrecognised cell
    yields ``'0'`` together with an illegal-history-character diagnostic that
    carries ``line`` and the stripped (case-preserved) cell text.
    """
    stripped = value.strip()
    folded = stripped.lower()
    if folded in _TRUE:
        return "1", None
    if folded not in _FALSE:
        return "0", dx.mk005_illegal_history_char(line, stripped)
    return "0", None
83
+
84
+
85
+ def _reserved_columns(opts: BuildOptions) -> set[str]:
86
+ reserved = set(opts.covariate_cols)
87
+ for col in (opts.id_col, opts.group_col, opts.comment_col, opts.history_col):
88
+ if col:
89
+ reserved.add(col)
90
+ return reserved
91
+
92
+
93
def _covariates_for(
    row: dict[str, str], opts: BuildOptions, line: int
) -> tuple[list[str], list[Diagnostic]]:
    """Collect one row's covariate cells, in ``opts.covariate_cols`` order.

    Numeric cells are canonicalized with :func:`_format_number`. A cell that is
    a missing marker or is not a float token is kept as its stripped text and
    reported with a missing-covariate diagnostic for ``line``. An absent column
    is read as an empty cell.
    """
    cells: list[str] = []
    problems: list[Diagnostic] = []
    for name in opts.covariate_cols:
        cell = (row.get(name) or "").strip()
        usable = not is_missing_marker(cell) and is_float_token(cell)
        if usable:
            cells.append(_format_number(cell))
            continue
        # Keep the offending text so the output stays aligned with the labels.
        problems.append(dx.mk007_missing_covariate(line, cell))
        cells.append(cell)
    return cells, problems
107
+
108
+
109
+ def _format_number(raw: str) -> str:
110
+ """Canonicalize a numeric string so equal values collapse identically."""
111
+ value = float(raw)
112
+ if value == int(value):
113
+ return str(int(value))
114
+ return repr(value)
115
+
116
+
117
def _is_detection_token(value: str) -> bool:
    """Tell whether ``value`` is a recognised detection / non-detection spelling."""
    token = value.strip().lower()
    return token in _TRUE or token in _FALSE
119
+
120
+
121
def _occasion_columns(
    rows: list[dict[str, str]], header: list[str], reserved: set[str]
) -> list[str]:
    """Select the occasion columns of a wide table, in header order.

    A column qualifies when it is not in ``reserved`` and every row's cell in
    it is a recognised detection token (a missing cell is read as empty). This
    is what lets an id or label column sit next to the occasion columns without
    being listed explicitly: it contains non-detection values and is skipped.
    """
    return [
        name
        for name in header
        if name not in reserved
        and all(_is_detection_token(record.get(name) or "") for record in rows)
    ]
137
+
138
+
139
def _read_wide(
    rows: list[dict[str, str]], header: list[str], opts: BuildOptions
) -> tuple[list[_Individual], list[Diagnostic]]:
    """Read a wide table: one row per individual.

    The history comes from ``opts.history_col`` when it is set; otherwise it is
    assembled from the auto-detected occasion columns. Diagnostics are reported
    against the CSV line of the row (the header is line 1).
    """
    problems: list[Diagnostic] = []
    people: list[_Individual] = []
    history_col = opts.history_col
    if history_col:
        occasion_cols: list[str] = []
    else:
        occasion_cols = _occasion_columns(rows, header, _reserved_columns(opts))

    for line, row in enumerate(rows, start=2):  # data starts on CSV line 2
        if history_col:
            history = (row.get(history_col) or "").strip()
        else:
            flags: list[str] = []
            for name in occasion_cols:
                flag, problem = _normalize_detection(row.get(name) or "", line)
                if problem:
                    problems.append(problem)
                flags.append(flag)
            history = "".join(flags)

        cov_values, cov_problems = _covariates_for(row, opts, line)
        problems.extend(cov_problems)

        # Blank group/comment cells are normalised to None.
        group = None
        if opts.group_col:
            group = (row.get(opts.group_col) or "").strip() or None
        comment = None
        if opts.comment_col:
            comment = (row.get(opts.comment_col) or "").strip() or None

        people.append(_Individual(history, group, cov_values, comment))
    return people, problems
165
+
166
+
167
def _read_long(
    rows: list[dict[str, str]], opts: BuildOptions
) -> tuple[list[_Individual], list[Diagnostic]]:
    """Read a long table: one row per (individual, occasion).

    Rows are grouped by the id column in first-seen order, each individual's
    rows are ordered by occasion, and the detection flags are concatenated into
    the history. Covariates, group, and comment are taken from the individual's
    first row after that ordering.

    Note that the history gets exactly one character per row of the individual,
    so individuals with differing numbers of rows yield histories of differing
    lengths. If any of the id/occasion/detect columns is not named, no
    individuals are returned, only a single error diagnostic.
    """
    id_col, occ_col, det_col = opts.id_col, opts.occasion_col, opts.detect_col
    if not (id_col and occ_col and det_col):
        return [], [_missing_long_columns()]

    # dict insertion order doubles as the first-seen order of individuals.
    grouped: dict[str, list[tuple[str, dict[str, str], int]]] = {}
    for line, row in enumerate(rows, start=2):  # data starts on CSV line 2
        ident = (row.get(id_col) or "").strip()
        occasion = (row.get(occ_col) or "").strip()
        grouped.setdefault(ident, []).append((occasion, row, line))

    problems: list[Diagnostic] = []
    people: list[_Individual] = []
    for entries in grouped.values():
        ordered = sorted(entries, key=lambda entry: _occasion_key(entry[0]))

        flags: list[str] = []
        for _occasion, row, line in ordered:
            flag, problem = _normalize_detection(row.get(det_col) or "", line)
            if problem:
                problems.append(problem)
            flags.append(flag)

        _, first_row, first_line = ordered[0]
        cov_values, cov_problems = _covariates_for(first_row, opts, first_line)
        problems.extend(cov_problems)

        # Blank group/comment cells are normalised to None.
        group = None
        if opts.group_col:
            group = (first_row.get(opts.group_col) or "").strip() or None
        comment = None
        if opts.comment_col:
            comment = (first_row.get(opts.comment_col) or "").strip() or None

        people.append(_Individual("".join(flags), group, cov_values, comment))
    return people, problems
206
+
207
+
208
def _missing_long_columns() -> Diagnostic:
    """Build the file-level error raised when a long-format column is not named."""
    message = "long format needs --id-col, --occasion-col, and --detect-col"
    hint = "Name the individual, occasion, and detection columns, or use --format wide"
    # line=None: the problem is with the options, not with any CSV row.
    return Diagnostic(
        code="MK008",
        severity=Severity.ERROR,
        message=message,
        hint=hint,
        line=None,
    )
216
+
217
+
218
+ def _occasion_key(value: str) -> tuple[int, float | str]:
219
+ """Sort occasions numerically when possible, else lexically."""
220
+ try:
221
+ return (0, float(value))
222
+ except ValueError:
223
+ return (1, value)
224
+
225
+
226
+ @dataclass
227
+ class _Bucket:
228
+ """Accumulator for one output record while collapsing individuals."""
229
+
230
+ history: str
231
+ covariates: list[str]
232
+ frequencies: list[int]
233
+ comment: str | None
234
+ merged: int = 1
235
+
236
+
237
def _assemble(individuals: list[_Individual], opts: BuildOptions) -> Dataset:
    """Turn individuals into a Dataset with frequency vectors and optional collapse.

    Groups are the sorted distinct non-``None`` group labels; when there are
    none the dataset has a single unnamed group. Each individual contributes a
    count of 1 in its group's column (column 0 if it has no group). With
    ``opts.collapse`` set, individuals sharing the same (history, covariates)
    pair are merged into one record, in first-seen order, by summing counts.

    The number of occasions is taken from the first record's history (0 when
    there are no records). Covariate strings that are not float tokens are left
    out of the numeric ``covariates`` list but remain in ``raw_values``.
    """
    groups = sorted({ind.group for ind in individuals if ind.group is not None})
    # Label -> column, built once so the per-individual lookup is O(1) rather
    # than a list scan for every individual.
    column_of = {label: position for position, label in enumerate(groups)}
    n_groups = len(groups) if groups else 1

    buckets: list[_Bucket] = []
    by_key: dict[tuple[str, tuple[str, ...]], _Bucket] = {}
    for ind in individuals:
        column = column_of[ind.group] if ind.group is not None else 0
        key = (ind.history, tuple(ind.covariates))
        existing = by_key.get(key) if opts.collapse else None
        if existing is None:
            vector = [0] * n_groups
            vector[column] = 1
            bucket = _Bucket(ind.history, ind.covariates, vector, ind.comment)
            buckets.append(bucket)
            if opts.collapse:
                by_key[key] = bucket
            continue
        existing.frequencies[column] += 1
        existing.merged += 1
        # A collapsed group of individuals loses its per-individual label.
        if existing.comment != ind.comment:
            existing.comment = None

    records: list[EncounterHistory] = [
        EncounterHistory(
            history=bucket.history,
            frequencies=list(bucket.frequencies),
            covariates=[float(c) for c in bucket.covariates if is_float_token(c)],
            comment=bucket.comment,
            line=0,
            raw_values=[str(f) for f in bucket.frequencies] + bucket.covariates,
        )
        for bucket in buckets
    ]

    return Dataset(
        n_occasions=len(records[0].history) if records else 0,
        n_groups=n_groups,
        n_covariates=len(opts.covariate_cols),
        group_labels=groups if groups else None,
        cov_labels=list(opts.covariate_cols) if opts.covariate_cols else None,
        data_type=DataType.LIVE_RECAPTURE,
        records=records,
    )
291
+
292
+
293
def build_dataset(text: str, opts: BuildOptions) -> BuildResult:
    """Parse CSV ``text`` and build a dataset as directed by ``opts``.

    A table without any data rows yields a result whose dataset is ``None``
    and whose only diagnostic is the no-records error. Otherwise the rows are
    read with the long or wide reader and assembled; the reader's diagnostics
    are returned alongside the dataset and the number of data rows.
    """
    reader = csv.DictReader(StringIO(text))
    header = list(reader.fieldnames or [])
    rows = list(reader)
    if not rows:
        return BuildResult(dataset=None, diagnostics=[dx.mk008_no_records()], n_rows=0)

    if _detect_format(header, opts) == "long":
        individuals, diagnostics = _read_long(rows, opts)
    else:
        individuals, diagnostics = _read_wide(rows, header, opts)

    return BuildResult(
        dataset=_assemble(individuals, opts),
        diagnostics=diagnostics,
        n_rows=len(rows),
    )
309
+
310
+
311
def build_file(path: str | Path, opts: BuildOptions) -> BuildResult:
    """Load the CSV at ``path`` and delegate to :func:`build_dataset`."""
    source = Path(path)
    # "utf-8-sig" drops a leading byte-order mark if the file has one.
    contents = source.read_text(encoding="utf-8-sig")
    return build_dataset(contents, opts)
markinp/cli.py ADDED
@@ -0,0 +1,233 @@
1
+ """Command-line interface for markinp.
2
+
3
+ This module contains no domain logic. It parses arguments, calls library
4
+ functions, hands results to :mod:`markinp.report`, and sets the exit code.
5
+ Everything it does is doable in a few lines of Python via the library.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from pathlib import Path
12
+ from typing import Annotated
13
+
14
+ import typer
15
+
16
+ from . import __version__, report
17
+ from .build import BuildOptions, build_file
18
+ from .model import DataType, Diagnostic
19
+ from .parse import parse_file
20
+ from .validate import validate
21
+ from .write import write_file
22
+
23
# Root Typer application. The commands and the callback defined below register
# themselves on it through the ``@app.command`` / ``@app.callback`` decorators.
app = typer.Typer(
    add_completion=False,
    no_args_is_help=True,
    help="Read, validate, and build Program MARK encounter-history (.inp) files.",
)
28
+
29
+
30
def _parse_data_type(value: str | None) -> DataType | None:
    """Convert the ``--data-type`` text into a :class:`DataType`.

    ``None`` passes straight through. The lookup is done on the lower-cased
    text; an unknown value raises ``typer.BadParameter`` listing every allowed
    value.
    """
    if value is None:
        return None
    try:
        parsed = DataType(value.lower())
    except ValueError as exc:
        allowed = ", ".join(member.value for member in DataType)
        raise typer.BadParameter(f"unknown data type; choose one of: {allowed}") from exc
    else:
        return parsed
38
+
39
+
40
def _version_callback(show: bool) -> None:
    """Print the package version and stop, but only when ``--version`` was given."""
    if not show:
        return
    typer.echo(f"markinp {__version__}")
    raise typer.Exit()
44
+
45
+
46
@app.callback()
def main(
    version: Annotated[
        bool,
        typer.Option(
            "--version", callback=_version_callback, is_eager=True, help="Show version and exit."
        ),
    ] = False,
) -> None:
    """markinp — a friendly linter and builder for MARK .inp files."""
    # Intentionally empty: the only option declared here, --version, is handled
    # by its eager callback (_version_callback), not by this function body.
56
+
57
+
58
@app.command(name="validate")
def validate_cmd(
    files: Annotated[
        list[Path],
        typer.Argument(help="One or more .inp files to validate.", exists=True, dir_okay=False),
    ],
    groups: Annotated[
        int | None, typer.Option("--groups", help="Assert the expected number of groups.")
    ] = None,
    occasions: Annotated[
        int | None, typer.Option("--occasions", help="Assert the expected history length.")
    ] = None,
    covariates: Annotated[
        int | None, typer.Option("--covariates", help="Assert the expected covariate count.")
    ] = None,
    data_type: Annotated[
        str | None, typer.Option("--data-type", help="Hint the data type for stricter checks.")
    ] = None,
    strict: Annotated[bool, typer.Option("--strict", help="Treat warnings as errors.")] = False,
    as_json: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False,
) -> None:
    """Validate one or more .inp files and report precise, actionable diagnostics.

    Exits non-zero if any file has an error, so it drops straight into CI and
    pre-commit. With several files, ``--json`` emits an array of per-file objects.
    """
    dtype = _parse_data_type(data_type)
    failed = False
    json_items: list[dict[str, object]] = []
    text_blocks: list[str] = []

    for source in files:
        parsed = parse_file(source)
        checks = validate(
            parsed.dataset,
            groups=groups,
            occasions=occasions,
            covariates=covariates,
            data_type=dtype,
        )
        # File-level diagnostics (no line) sort ahead of line-anchored ones.
        found = sorted(
            parsed.diagnostics + checks,
            key=lambda d: (-1 if d.line is None else d.line, d.code),
        )
        if report.has_errors(found, strict):
            failed = True

        label = str(source)
        if as_json:
            json_items.append(
                report.validate_payload(parsed.dataset, found, strict=strict, path=label)
            )
        else:
            text_blocks.append(
                report.render_validate_human(parsed.dataset, found, strict=strict, path=label)
            )

    if not as_json:
        typer.echo("\n\n".join(text_blocks))
    else:
        # A single file is emitted as a bare object, several as an array.
        document: object = json_items[0] if len(json_items) == 1 else json_items
        typer.echo(json.dumps(document, indent=2))

    if failed:
        raise typer.Exit(1)
120
+
121
+
122
@app.command()
def inspect(
    file: Annotated[
        Path, typer.Argument(help="The .inp file to inspect.", exists=True, dir_okay=False)
    ],
    as_json: Annotated[bool, typer.Option("--json", help="Emit machine-readable JSON.")] = False,
) -> None:
    """Summarize the inferred structure of an .inp file (read-only)."""
    parsed = parse_file(file)
    # Parse diagnostics first, then whatever default validation adds.
    found = parsed.diagnostics + validate(parsed.dataset)
    render = report.render_inspect_json if as_json else report.render_inspect_human
    typer.echo(render(parsed.dataset, found, path=str(file)))
137
+
138
+
139
@app.command()
def build(
    input_csv: Annotated[
        Path, typer.Argument(help="Tidy capture table (CSV).", exists=True, dir_okay=False)
    ],
    output: Annotated[Path, typer.Option("-o", "--output", help="Path to write the .inp file.")],
    fmt: Annotated[str, typer.Option("--format", help="Layout: long, wide, or auto.")] = "auto",
    id_col: Annotated[
        str | None, typer.Option("--id-col", help="Individual id column (long).")
    ] = None,
    occasion_col: Annotated[
        str | None, typer.Option("--occasion-col", help="Occasion column (long).")
    ] = None,
    detect_col: Annotated[
        str | None, typer.Option("--detect-col", help="0/1 detection column (long).")
    ] = None,
    history_col: Annotated[
        str | None, typer.Option("--history-col", help="Prebuilt history column (wide).")
    ] = None,
    group_col: Annotated[
        str | None, typer.Option("--group-col", help="Column defining groups.")
    ] = None,
    covariate_cols: Annotated[
        str | None, typer.Option("--covariate-cols", help="Comma-separated covariate columns.")
    ] = None,
    comment_col: Annotated[
        str | None, typer.Option("--comment-col", help="Column to write as /* comment */.")
    ] = None,
    collapse: Annotated[
        bool, typer.Option("--collapse/--no-collapse", help="Aggregate identical histories.")
    ] = True,
    as_json: Annotated[
        bool, typer.Option("--json", help="Emit a machine-readable build report.")
    ] = False,
) -> None:
    """Build a valid, deterministic .inp file from a tidy capture table."""
    # Split the comma-separated list and drop blank entries, so a trailing or
    # doubled comma ("mass,", "mass,,age") does not create a covariate with an
    # empty name that would then be reported as missing on every row.
    covariate_names = [
        name.strip() for name in (covariate_cols or "").split(",") if name.strip()
    ]
    opts = BuildOptions(
        fmt=fmt,
        id_col=id_col,
        occasion_col=occasion_col,
        detect_col=detect_col,
        history_col=history_col,
        group_col=group_col,
        covariate_cols=covariate_names,
        comment_col=comment_col,
        collapse=collapse,
    )
    result = build_file(input_csv, opts)

    # Builder diagnostics plus validation of whatever was built; file-level
    # diagnostics (no line) sort ahead of line-anchored ones.
    diagnostics: list[Diagnostic] = list(result.diagnostics)
    if result.dataset is not None:
        diagnostics += validate(result.dataset)
    diagnostics.sort(key=lambda d: (d.line if d.line is not None else -1, d.code))

    # The output file is only written when a dataset exists and nothing
    # counts as an error.
    wrote = False
    if result.dataset is not None and not report.has_errors(diagnostics):
        write_file(result.dataset, output)
        wrote = True

    if as_json:
        typer.echo(_build_json(result, diagnostics, str(output), wrote))
    else:
        typer.echo(_build_human(result, diagnostics, str(output), wrote))

    if not wrote:
        raise typer.Exit(1)
204
+
205
+
206
+ def _build_human(result, diagnostics, output: str, wrote: bool) -> str: # type: ignore[no-untyped-def]
207
+ lines = [f"markinp build -> {output}", ""]
208
+ n_records = len(result.dataset.records) if result.dataset else 0
209
+ lines.append(f"Read {result.n_rows} row(s); produced {n_records} record(s).")
210
+ if diagnostics:
211
+ lines.append("")
212
+ for diag in diagnostics:
213
+ lines.append(report._format_diagnostic(diag, strict=False))
214
+ lines.append("")
215
+ lines.append(f"Wrote {output}" if wrote else "Refused to write: fix the errors above.")
216
+ return "\n".join(lines)
217
+
218
+
219
def _build_json(result, diagnostics, output: str, wrote: bool) -> str:  # type: ignore[no-untyped-def]
    """Render the machine-readable report for the ``build`` command.

    Returns a JSON object (indented by two spaces) with the schema version, the
    command name, the output path, whether it was written, the row and record
    counts, and the serialized diagnostics.
    """
    produced = len(result.dataset.records) if result.dataset else 0
    serialized = [report._diag_to_dict(item, strict=False) for item in diagnostics]
    # Key order is the order the fields appear in the emitted JSON.
    payload = {
        "schema_version": report.SCHEMA_VERSION,
        "command": "build",
        "output": output,
        "written": wrote,
        "rows_read": result.n_rows,
        "records": produced,
        "diagnostics": serialized,
    }
    return json.dumps(payload, indent=2)
230
+
231
+
232
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()