fairscape-wizard 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ """Helpers for the FAIRSCAPE RO-Crate wizard skill bundle."""
2
+
3
+ from fairscape_wizard.ids import generate_guid, slugify, NAAN
4
+
5
+ __all__ = ["generate_guid", "slugify", "NAAN"]
@@ -0,0 +1,36 @@
1
+ """ARK GUID generation for the FAIRSCAPE RO-Crate wizard.
2
+
3
+ Used by emitted build_rocrate.py scripts. Produces GUIDs of the form
4
+ ``ark:59853/<prefix>-<slug>-<squid>`` (e.g. ``ark:59853/dataset-raw-images-1a2b3c4d5e``).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import datetime
10
+ import random
11
+ import re
12
+
13
+ NAAN = "59853"
14
+
15
+
16
+ def _squid() -> str:
17
+ ts = int(datetime.datetime.now(datetime.timezone.utc).timestamp())
18
+ rand = random.randint(0, 99999)
19
+ return f"{ts:x}{rand:04x}"
20
+
21
+
22
def slugify(name: str) -> str:
    """Turn a free-form name into a lowercase ``a-z0-9-`` slug.

    Whitespace runs, ``.`` and ``/`` become hyphens; every other character
    outside ``a-z0-9-`` is removed; repeated hyphens are collapsed and
    leading/trailing hyphens are stripped. Accented letters are decomposed
    first so the base letter is kept (``"Café"`` -> ``"cafe"``) instead of
    being deleted along with its accent.

    Returns ``"entity"`` when nothing usable remains.
    """
    # Function-scope import: the module's import block does not include it.
    import unicodedata

    # NFKD splits "é" into "e" + a combining mark; the mark is then removed by
    # the character filter below. Pure-ASCII input is unaffected.
    s = unicodedata.normalize("NFKD", name).lower().strip()
    s = re.sub(r"\s+", "-", s)
    s = s.replace(".", "-").replace("/", "-")
    s = re.sub(r"[^a-z0-9-]", "", s)
    s = re.sub(r"-+", "-", s).strip("-")
    return s or "entity"
29
+
30
+
31
def generate_guid(prefix: str, name: str = "") -> str:
    """Build an ARK identifier under the module's NAAN.

    The result is ``ark:<NAAN>/<prefix>-<slug>-<squid>``. When ``name`` is
    empty the ``<slug>-`` segment is left out entirely.
    """
    suffix = _squid()
    slug_segment = f"{slugify(name)}-" if name else ""
    return f"ark:{NAAN}/{prefix}-{slug_segment}{suffix}"
@@ -0,0 +1,122 @@
1
+ """Range-GET a remote file to a local path, capped at N bytes.
2
+
3
+ Used by the remote-schema-infer skill to sample tabular files (CSV/TSV) from
4
+ Dataverse / PhysioNet without downloading the entire dataset. If the server
5
+ honors HTTP Range, only the first ``max-bytes`` are transferred. If not, the
6
+ stream is read and truncated client-side.
7
+
8
+ For text formats (default), the trailing partial line is dropped after
9
+ truncation so schema-infer never sees a half-written row.
10
+
11
+ For binary formats like Parquet/HDF5 — where the schema lives in the
12
+ file footer — pass ``--no-trim-tail`` and ``--max-bytes 0`` (unlimited)
13
+ to download the full file. Range-truncating a parquet file makes the
14
+ footer magic bytes unreadable and ``schema infer`` will error out.
15
+
16
+ CLI:
17
+ python -m fairscape_wizard.remote_fetch <url> <out> [--max-bytes N] [--no-trim-tail]
18
+
19
+ ``--max-bytes 0`` means no cap (download the whole file).
20
+
21
+ Exits 0 on success, prints a one-line JSON summary to stdout:
22
+ {"path": "...", "bytes_written": N, "raw_bytes": M, "used_range": true|false,
23
+ "status": 200|206, "truncated_tail_line": true|false}
24
+ """
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import json
29
+ import sys
30
+ import urllib.error
31
+ import urllib.request
32
+ from pathlib import Path
33
+
34
+ DEFAULT_MAX_BYTES = 5 * 1024 * 1024 # 5 MiB
35
+ CHUNK = 64 * 1024
36
+
37
+
38
def fetch_sample(url: str, out: Path, max_bytes: int, trim_tail: bool = True) -> dict:
    """Download ``url`` to ``out``, keeping at most ``max_bytes`` bytes.

    A ``Range: bytes=0-<max_bytes - 1>`` header is sent only when
    ``max_bytes`` is positive; the body is also cut off client-side once
    ``max_bytes`` bytes have been written. A non-positive ``max_bytes``
    downloads everything. Parent directories of ``out`` are created.

    When ``trim_tail`` is true the trailing partial line is removed from the
    written file afterwards.

    Returns a summary dict with ``path``, ``bytes_written`` (file size after
    any trimming), ``raw_bytes`` (bytes written before trimming),
    ``used_range`` (whether the response status was 206), ``status`` and
    ``truncated_tail_line``.

    Raises ``SystemExit`` with a one-line message on ``HTTPError`` or
    ``URLError``.
    """
    out.parent.mkdir(parents=True, exist_ok=True)

    capped = max_bytes > 0
    request_headers = {
        "User-Agent": "fairscape-wizard/remote_fetch",
        "Accept": "*/*",
    }
    if capped:
        request_headers["Range"] = f"bytes=0-{max_bytes - 1}"
    request = urllib.request.Request(url, headers=request_headers)

    written = 0
    status = None
    used_range = False
    try:
        with urllib.request.urlopen(request, timeout=120) as resp:
            status = resp.status
            used_range = status == 206
            with out.open("wb") as sink:
                # The server may ignore Range, so the cap is enforced here too.
                while not capped or written < max_bytes:
                    want = min(CHUNK, max_bytes - written) if capped else CHUNK
                    piece = resp.read(want)
                    if not piece:
                        break
                    sink.write(piece)
                    written += len(piece)
    except urllib.error.HTTPError as e:
        raise SystemExit(f"HTTP {e.code} fetching {url}: {e.reason}")
    except urllib.error.URLError as e:
        raise SystemExit(f"network error fetching {url}: {e.reason}")

    if trim_tail:
        truncated_tail = _trim_incomplete_last_line(out)
    else:
        truncated_tail = False

    summary = {
        "path": str(out),
        "bytes_written": out.stat().st_size,
        "raw_bytes": written,
        "used_range": used_range,
        "status": status,
        "truncated_tail_line": truncated_tail,
    }
    return summary
78
+
79
+
80
+ def _trim_incomplete_last_line(out: Path) -> bool:
81
+ """If the file doesn't end in a newline, drop everything after the last \\n.
82
+
83
+ Returns True if any bytes were dropped.
84
+ """
85
+ size = out.stat().st_size
86
+ if size == 0:
87
+ return False
88
+ with out.open("rb") as f:
89
+ f.seek(-1, 2)
90
+ last = f.read(1)
91
+ if last == b"\n":
92
+ return False
93
+ with out.open("rb") as f:
94
+ data = f.read()
95
+ idx = data.rfind(b"\n")
96
+ if idx < 0:
97
+ return False
98
+ with out.open("wb") as f:
99
+ f.write(data[: idx + 1])
100
+ return True
101
+
102
+
103
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: fetch a URL to disk and print a JSON summary.

    Parses ``url``, ``out``, ``--max-bytes`` and ``--no-trim-tail`` from
    ``argv`` (argparse falls back to ``sys.argv`` when it is ``None``),
    runs the download and prints the resulting summary as one JSON line on
    stdout. Returns 0.
    """
    parser = argparse.ArgumentParser(
        description="Range-GET a URL to disk, optionally capped at N bytes.",
    )
    parser.add_argument("url")
    parser.add_argument("out", type=Path)
    parser.add_argument(
        "--max-bytes",
        type=int,
        default=DEFAULT_MAX_BYTES,
        help="cap on bytes downloaded; 0 disables the cap (default: %(default)s)",
    )
    parser.add_argument(
        "--no-trim-tail",
        action="store_true",
        help="don't drop a trailing partial line — use for binary formats (parquet, hdf5)",
    )
    opts = parser.parse_args(argv)

    keep_tail = opts.no_trim_tail
    result = fetch_sample(opts.url, opts.out, opts.max_bytes, trim_tail=not keep_tail)
    print(json.dumps(result))
    return 0
119
+
120
+
121
+ if __name__ == "__main__":
122
+ sys.exit(main())
@@ -0,0 +1,224 @@
1
+ """Evidence dump and score aggregation for agentic RO-Crate rubric scoring.
2
+
3
+ This is a thin wrapper around ``fairscape-agent/rubrics/ai-ready/extract.py``
4
+ that splits the grading flow into two halves:
5
+
6
+ * ``extract-evidence`` — load a crate, run all 28 ``RubricExtractor`` classes,
7
+ write ``<out_dir>/<rubric_id>-<slug>/{rubric.yaml,evidence.json}``. This is
8
+ the deterministic half — no LLM involved.
9
+
10
+ * ``aggregate`` — scan ``<out_dir>/*/score.json`` (written by the wizard one
11
+ rubric at a time, in-conversation) and emit ``<out_dir>/aggregated_score.json``
12
+ with totals grouped by criterion (id[0]). Match the shape that
13
+ ``grade.py:_aggregate`` produces so downstream tooling can consume either.
14
+
15
+ The agentic scoring itself lives in the ``agentic-rescore`` SKILL — Claude reads
16
+ ``rubric.yaml`` + ``evidence.json`` and writes ``score.json`` per rubric. We never
17
+ import or call ``grade.py``.
18
+
19
+ CLI:
20
+ python -m fairscape_wizard.rubric_eval extract-evidence <crate.json> <out_dir>
21
+ python -m fairscape_wizard.rubric_eval aggregate <out_dir>
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import argparse
26
+ import json
27
+ import shutil
28
+ import sys
29
+ from collections import defaultdict
30
+ from pathlib import Path
31
+
32
# Locate the rubric sources relative to this file and make them importable.
# NOTE(review): parents[2] is expected to be a fairscape-agent checkout; when
# this module is installed from a wheel that layout may not hold — confirm.
HERE = Path(__file__).resolve()
AGENT_ROOT = HERE.parents[2]  # .../fairscape-agent
RUBRIC_SRC_DIR = AGENT_ROOT / "rubrics" / "ai-ready"

if str(RUBRIC_SRC_DIR) not in sys.path:
    sys.path.insert(0, str(RUBRIC_SRC_DIR))

# fairscape_models lives as a sibling repo when not pip-installed.
_MODELS_DIR = AGENT_ROOT.parent / "fairscape_models"
if _MODELS_DIR.exists() and str(_MODELS_DIR) not in sys.path:
    sys.path.insert(0, str(_MODELS_DIR))

try:
    from extract import ALL_EXTRACTORS, ExtractContext, ReleaseBundle, root_summary  # noqa: E402
except ImportError as exc:
    # A bare "No module named 'extract'" gives no hint about where the module
    # was expected; name the directory that was added to sys.path.
    raise ImportError(
        f"could not import the rubric extractors from {RUBRIC_SRC_DIR}: {exc}"
    ) from exc
45
+
46
+
47
# Display names for the rubric criteria, keyed by the single-character
# criterion id ("0" through "6") used as the first character of a rubric id.
CRITERION_NAMES = {
    str(index): label
    for index, label in enumerate(
        (
            "FAIRness",
            "Provenance",
            "Characterization",
            "Pre-model Explainability",
            "Ethics",
            "Sustainability",
            "Computability",
        )
    )
}
56
+
57
+
58
def cmd_extract_evidence(crate_path: Path, out_dir: Path) -> int:
    """Run every extractor over a crate and write the evidence to ``out_dir``.

    Writes ``<out_dir>/summary.json`` and, for each class in
    ``ALL_EXTRACTORS``, a ``<rubric_id>-<rubric_slug>/`` directory holding a
    copy of the rubric YAML (``rubric.yaml``) and the extractor's output
    (``evidence.json``). Progress goes to stderr; a one-line JSON summary is
    printed on stdout. Returns 0.

    Raises ``SystemExit`` if ``crate_path`` does not exist or a rubric YAML
    file is missing from ``RUBRIC_SRC_DIR``.
    """
    if not crate_path.exists():
        raise SystemExit(f"crate not found: {crate_path}")
    out_dir.mkdir(parents=True, exist_ok=True)

    def _dump(payload) -> str:
        # default=str stringifies values json cannot encode natively.
        return json.dumps(payload, indent=2, sort_keys=True, default=str) + "\n"

    print(f"[rubric_eval] loading {crate_path}", file=sys.stderr)
    bundle = ReleaseBundle.load(crate_path)
    entity_count = len(bundle.entities)
    sub_crate_count = len(bundle.sub_crates)
    print(
        f"[rubric_eval] loaded {entity_count} entities ({sub_crate_count} sub-crates)",
        file=sys.stderr,
    )

    ctx = ExtractContext(bundle)

    summary_path = out_dir / "summary.json"
    summary_path.write_text(
        _dump(
            {
                "target": str(crate_path),
                "root_summary": root_summary(bundle),
                "stats": ctx.stats,
                "rubric_ids": [extractor.rubric_id for extractor in ALL_EXTRACTORS],
            }
        )
    )

    for extractor in ALL_EXTRACTORS:
        rubric_name = f"{extractor.rubric_id}-{extractor.rubric_slug}"
        rubric_dir = out_dir / rubric_name
        rubric_dir.mkdir(parents=True, exist_ok=True)

        yaml_source = RUBRIC_SRC_DIR / f"{rubric_name}.yaml"
        if not yaml_source.exists():
            raise SystemExit(f"rubric YAML missing: {yaml_source}")
        shutil.copy(yaml_source, rubric_dir / "rubric.yaml")

        evidence = extractor().extract(ctx)
        (rubric_dir / "evidence.json").write_text(_dump(evidence))
        print(f" [{extractor.rubric_id}] {extractor.rubric_slug}", file=sys.stderr)

    print(json.dumps({"out_dir": str(out_dir), "rubrics": len(ALL_EXTRACTORS)}))
    return 0
100
+
101
+
102
def cmd_aggregate(out_dir: Path, model: str = "agentic:claude-code") -> int:
    """Combine per-rubric ``score.json`` files into ``aggregated_score.json``.

    Every immediate subdirectory of ``out_dir`` that contains a
    ``score.json`` contributes one entry. The rubric id and slug are taken
    from the directory name, split at its first ``-``. The combined result is
    written to ``<out_dir>/aggregated_score.json``, a human-readable total
    goes to stderr and a one-line JSON summary to stdout. Returns 0.

    A ``score.json`` that is not valid UTF-8 JSON, or whose top level is not
    an object, is recorded as an entry with no score and an ``error``
    message, so one bad file does not abort the whole aggregation.

    Raises ``SystemExit`` if ``out_dir`` does not exist or no ``score.json``
    is found.
    """
    if not out_dir.exists():
        raise SystemExit(f"output dir not found: {out_dir}")

    per_rubric: list[dict] = []
    for slug_dir in sorted(out_dir.iterdir()):
        if not slug_dir.is_dir():
            continue
        score_path = slug_dir / "score.json"
        if not score_path.exists():
            continue
        rubric_yaml = slug_dir / "rubric.yaml"
        rubric_id, _, slug = slug_dir.name.partition("-")
        try:
            # JSON is read as UTF-8 explicitly so the result does not depend
            # on the platform's locale encoding.
            score = json.loads(score_path.read_text(encoding="utf-8"))
        except ValueError as exc:  # covers JSONDecodeError and UnicodeDecodeError
            score = {"error": f"unreadable score.json: {exc}"}
        if not isinstance(score, dict):
            score = {
                "error": f"score.json must contain a JSON object, got {type(score).__name__}",
            }
        per_rubric.append({
            "id": rubric_id,
            "slug": slug,
            "score": score.get("score"),
            "rationale": score.get("rationale"),
            "evidence": score.get("evidence", []),
            "gaps": score.get("gaps", []),
            "error": score.get("error"),
            "rubric_yaml_path": str(rubric_yaml) if rubric_yaml.exists() else None,
        })

    if not per_rubric:
        raise SystemExit(f"no score.json files found under {out_dir}")

    aggregate = _aggregate(per_rubric, model)
    aggregate_path = out_dir / "aggregated_score.json"
    aggregate_path.write_text(
        json.dumps(aggregate, indent=2, default=str) + "\n",
        encoding="utf-8",
    )

    counts = aggregate["counts"]
    print(
        f"[rubric_eval] {aggregate['total_score']}/{aggregate['max_score']} "
        f"= {aggregate['percentage']}% (substantive={counts['substantive']}, "
        f"partial={counts['partial']}, absent={counts['absent']}, "
        f"error={counts['error']})",
        file=sys.stderr,
    )
    print(json.dumps({
        "aggregated_score_path": str(aggregate_path),
        "total_score": aggregate["total_score"],
        "max_score": aggregate["max_score"],
        "percentage": aggregate["percentage"],
        "rubrics_scored": len(per_rubric),
    }))
    return 0
150
+
151
+
152
def _aggregate(per_rubric: list[dict], model: str) -> dict:
    """Roll per-rubric scores up into per-criterion and overall totals.

    Mirror of ``grade.py:_aggregate`` and kept in sync with it. Rubrics are
    grouped by the first character of their ``id``. Each rubric is worth at
    most 2 points; a score of 2, 1 or 0 is counted as substantive, partial
    or absent, and anything else is counted as an error. ``None`` scores add
    nothing to the totals.
    """
    by_criterion: dict[str, list[dict]] = defaultdict(list)
    for entry in per_rubric:
        by_criterion[entry["id"][0]].append(entry)

    # Checked in this order with ==, so 2.0 or True match like 2 or 1 would.
    outcome_labels = ((2, "substantive"), (1, "partial"), (0, "absent"))
    counts = {"substantive": 0, "partial": 0, "absent": 0, "error": 0}
    criteria = []
    total = 0
    max_total = 0

    for prefix in sorted(by_criterion):
        members = by_criterion[prefix]

        earned = 0
        for member in members:
            value = member["score"]
            if value is not None:
                earned += value or 0
        possible = 2 * len(members)

        for member in members:
            value = member["score"]
            label = next(
                (name for points, name in outcome_labels if value == points),
                "error",
            )
            counts[label] += 1

        criteria.append({
            "id": prefix,
            "name": CRITERION_NAMES.get(prefix, f"Unknown ({prefix})"),
            "score": earned,
            "max": possible,
            "rubrics": members,
        })
        total += earned
        max_total += possible

    if max_total:
        percentage = round(100 * total / max_total, 1)
    else:
        percentage = 0.0

    return {
        "model": model,
        "total_score": total,
        "max_score": max_total,
        "percentage": percentage,
        "counts": counts,
        "criteria": criteria,
    }
198
+
199
+
200
+ def main(argv: list[str] | None = None) -> int:
201
+ ap = argparse.ArgumentParser(
202
+ description="Evidence dump + score aggregation for agentic RO-Crate grading.",
203
+ )
204
+ sub = ap.add_subparsers(dest="cmd", required=True)
205
+
206
+ ee = sub.add_parser("extract-evidence", help="Run extractors and dump evidence.json + rubric.yaml per rubric.")
207
+ ee.add_argument("crate_path", type=Path)
208
+ ee.add_argument("out_dir", type=Path)
209
+
210
+ ag = sub.add_parser("aggregate", help="Aggregate per-rubric score.json files into aggregated_score.json.")
211
+ ag.add_argument("out_dir", type=Path)
212
+ ag.add_argument("--model", default="agentic:claude-code")
213
+
214
+ args = ap.parse_args(argv)
215
+
216
+ if args.cmd == "extract-evidence":
217
+ return cmd_extract_evidence(args.crate_path, args.out_dir)
218
+ if args.cmd == "aggregate":
219
+ return cmd_aggregate(args.out_dir, args.model)
220
+ return 2
221
+
222
+
223
+ if __name__ == "__main__":
224
+ sys.exit(main())
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.4
2
+ Name: fairscape-wizard
3
+ Version: 0.2.0
4
+ Summary: Helper module for the FAIRSCAPE RO-Crate wizard Claude Code skill bundle
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: fairscape-cli
7
+ Requires-Dist: fairscape-models
8
+ Requires-Dist: pydantic-ai>=0.0.14
9
+ Requires-Dist: pyyaml
@@ -0,0 +1,7 @@
1
+ fairscape_wizard/__init__.py,sha256=RVI6ONBh-JJEW8P63HGSbrv-uB_d5M9JdIdcrpF4DW8,173
2
+ fairscape_wizard/ids.py,sha256=8M1jeYme7cseFiDvVw72h83f1IoeYtNYKXk0Ywt9lgc,1001
3
+ fairscape_wizard/remote_fetch.py,sha256=QRlK1I87YvpCvkNbMaTTkETUwi-A3B3O_QPBYAaisO0,4186
4
+ fairscape_wizard/rubric_eval.py,sha256=9A0Uy9ebGQU-_Z7yMEbByUNMBqZC4G2H2tptk8GITVc,7824
5
+ fairscape_wizard-0.2.0.dist-info/METADATA,sha256=2aacnlWU67GZ2LXnjIo41vQ8ngJVBcZvrxouZAbEzWk,284
6
+ fairscape_wizard-0.2.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
7
+ fairscape_wizard-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any