conjure-eval 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ """Public-slice harness for the CONJURE transformative-creativity benchmark.
2
+
3
+ This package ships the 358-instance public split (70 percent of the
4
+ 510-instance Phase 4.6 frozen corpus across 17 Lakatos families).
5
+ See README.md for the full description and the hidden-split policy.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .corpus import (
11
+ PUBLIC_CORPUS_PATH,
12
+ Instance,
13
+ load_public_corpus,
14
+ public_instance_ids,
15
+ public_instance_by_id,
16
+ )
17
+
18
# Version string of this package release.
__version__ = "0.1.0"

# Names that make up the package's public API (what `import *` exposes).
__all__ = [
    "PUBLIC_CORPUS_PATH",
    "Instance",
    "load_public_corpus",
    "public_instance_ids",
    "public_instance_by_id",
    "__version__",
]
conjure_eval/cli.py ADDED
@@ -0,0 +1,233 @@
1
+ """`conjure-eval` command-line entry point.
2
+
3
+ Five subcommands ship in v0.1.0 (`provenance` and `run` are described in `--help`):
4
+
5
+ list-public print every public-slice instance ID
6
+ show <instance_id> print one instance prompt + axis
7
+ verify-submission <jsonl> well-formedness check on a JSONL of
8
+ submissions; reports per-record reasons.
9
+
10
+ These are *all* developer conveniences; the actual kernel-verified
11
+ adjudication runs against the hidden split inside the private repo.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import os
19
+ import sys
20
+ from pathlib import Path
21
+
22
+ from .corpus import (
23
+ load_public_corpus,
24
+ public_instance_by_id,
25
+ public_instance_ids,
26
+ )
27
+ from .runner import run_against_endpoint
28
+
29
+
30
def cmd_list_public(_args: argparse.Namespace) -> int:
    """Print every public-slice instance ID to stdout, one per line.

    The parsed arguments are accepted for handler-signature uniformity and
    are not read. Always returns 0.
    """
    instance_ids = public_instance_ids()
    for instance_id in instance_ids:
        print(instance_id)
    return 0
34
+
35
+
36
def cmd_show(args: argparse.Namespace) -> int:
    """Print one public-slice instance: its ID, its axis, and its prompt.

    Reads ``args.instance_id``.

    Returns
    -------
    int
        0 on success; 2 after writing the lookup error to stderr when the
        instance cannot be resolved.
    """
    try:
        inst = public_instance_by_id(args.instance_id)
    except KeyError as exc:
        # str(KeyError(msg)) is repr(msg), which would wrap the whole message
        # in an extra pair of quotes. Print the original message instead.
        message = exc.args[0] if exc.args else str(exc)
        print(message, file=sys.stderr)
        return 2
    print(f"instance_id: {inst.instance_id}")
    print(f"axis: {inst.axis}")
    print("-" * 60)
    # An empty prompt is replaced by a placeholder so the output is never blank.
    print(inst.prompt or "<no prompt body in public-slice JSON>")
    return 0
47
+
48
+
49
+ def _verify_record(rec: dict, public_ids: set[str]) -> list[str]:
50
+ """Return a list of human-readable errors for a single submission record.
51
+
52
+ Empty list means the record is well-formed.
53
+ """
54
+ errors: list[str] = []
55
+ iid = rec.get("instance_id")
56
+ if iid is None:
57
+ errors.append("missing field: instance_id")
58
+ elif iid not in public_ids:
59
+ errors.append(f"instance_id {iid!r} is not in the public slice")
60
+ if not isinstance(rec.get("submission"), str):
61
+ errors.append("missing or non-string field: submission")
62
+ return errors
63
+
64
+
65
def cmd_verify_submission(args: argparse.Namespace) -> int:
    """Check a JSONL submissions file for well-formedness and print a report.

    Reads ``args.path``. Each non-blank line must decode to a JSON object that
    passes `_verify_record`. Per-record problems are printed to stdout,
    followed by a summary of record counts and public-slice coverage.

    Returns
    -------
    int
        2 if ``args.path`` is not an existing regular file, 1 if any record
        failed, otherwise 0.
    """
    path = Path(args.path)
    # `is_file` rather than `exists`: a directory exists but cannot be read.
    if not path.is_file():
        print(f"no such file: {path}", file=sys.stderr)
        return 2

    public_ids = set(public_instance_ids())
    total = 0
    failed = 0
    seen_ids: set[str] = set()

    # Iterate the file object instead of `read_text().splitlines()`:
    # `str.splitlines` also breaks on U+2028, U+2029 and U+0085, which may
    # appear unescaped inside a JSON string and would cut a valid record in
    # two. Text-mode iteration only splits on \n, \r and \r\n.
    with path.open(encoding="utf-8") as handle:
        for lineno, line in enumerate(handle, 1):
            line = line.strip()
            if not line:
                continue
            total += 1
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as exc:
                failed += 1
                print(f"line {lineno}: invalid JSON: {exc}")
                continue
            # Valid JSON that is not an object has no fields to inspect.
            if not isinstance(rec, dict):
                failed += 1
                print(
                    f"line {lineno}: record is not a JSON object "
                    f"(got {type(rec).__name__})"
                )
                continue
            errs = _verify_record(rec, public_ids)
            iid = rec.get("instance_id", "<unknown>")
            if errs:
                failed += 1
                for e in errs:
                    print(f"line {lineno} ({iid}): {e}")
            else:
                seen_ids.add(iid)

    missing = sorted(public_ids - seen_ids)
    print()
    print(f"records: {total}")
    print(f"well-formed: {total - failed}")
    print(f"failed: {failed}")
    print(f"covered ids: {len(seen_ids)} / {len(public_ids)} public-slice instances")
    if missing:
        # Show at most five uncovered IDs; the ellipsis marks a truncated list.
        print("missing-id sample:", ", ".join(missing[:5]) + ("..." if len(missing) > 5 else ""))

    return 1 if failed else 0
106
+
107
+
108
def cmd_run(args: argparse.Namespace) -> int:
    """Run a model endpoint over the public slice via `run_against_endpoint`.

    The bearer token is read from the environment variable named by
    ``args.api_key_env``. Progress and status messages go to stderr.

    Returns
    -------
    int
        2 if that environment variable is unset or empty; otherwise 0 once
        `run_against_endpoint` has returned.
    """
    token = os.environ.get(args.api_key_env, "")
    if not token:
        complaint = (
            f"error: environment variable {args.api_key_env!r} is not set or empty"
        )
        print(complaint, file=sys.stderr)
        return 2

    instances = load_public_corpus()["instances"]
    available = len(instances)
    # A falsy limit (None or 0) means the announced count is the whole slice.
    if args.limit:
        total = min(args.limit, available)
    else:
        total = available

    banner = (
        f"conjure-eval run: {total} instances, model={args.model}, "
        f"out={args.out}"
    )
    print(banner, file=sys.stderr)

    def _report(i: int, n: int, rec) -> None:
        # One stderr line per record; error text is cut to 60 characters.
        if rec.error is None:
            status = "ok"
        else:
            status = f"ERR: {rec.error[:60]}"
        print(f" [{i}/{n}] {rec.instance_id} {status}", file=sys.stderr)

    run_against_endpoint(
        base_url=args.base_url,
        api_key=token,
        model=args.model,
        out_path=args.out,
        instances=instances,
        limit=args.limit,
        rate_limit_ms=args.rate_limit_ms,
        max_retries=args.max_retries,
        timeout_s=args.timeout_s,
        progress_fn=_report,
    )
    print(f"done: results written to {args.out}", file=sys.stderr)
    return 0
145
+
146
+
147
def build_parser() -> argparse.ArgumentParser:
    """Build the ``conjure-eval`` argument parser.

    Registers the subcommands ``list-public``, ``show``, ``verify-submission``,
    ``provenance`` and ``run``. Each subparser stores its handler under
    ``func`` so the caller can dispatch with ``args.func(args)``. A subcommand
    is required.
    """
    parser = argparse.ArgumentParser(
        prog="conjure-eval",
        description=(
            "Public-slice harness for the CONJURE benchmark "
            "(Phase 4.6 frozen corpus, 358 public instances across 17 Lakatos families)."
        ),
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    list_cmd = commands.add_parser(
        "list-public", help="print every public-slice instance ID"
    )
    list_cmd.set_defaults(func=cmd_list_public)

    show_cmd = commands.add_parser(
        "show", help="print one public-slice instance prompt + axis"
    )
    show_cmd.add_argument("instance_id", help="instance ID (see `list-public`)")
    show_cmd.set_defaults(func=cmd_show)

    verify_cmd = commands.add_parser(
        "verify-submission",
        help="well-formedness check on a JSONL of {instance_id, submission} records",
    )
    verify_cmd.add_argument("path", help="path to submissions.jsonl")
    verify_cmd.set_defaults(func=cmd_verify_submission)

    prov_cmd = commands.add_parser(
        "provenance", help="print public-slice provenance fields"
    )
    prov_cmd.set_defaults(func=cmd_provenance)

    run_cmd = commands.add_parser(
        "run",
        help="run a model against the public slice and write submissions.jsonl",
    )
    # Options for `run`, declared as data and registered in this exact order
    # so the generated help text lists them in the same sequence.
    run_options = (
        ("--base-url", {
            "required": True,
            "help": "OpenAI-compatible base URL, e.g. https://api.openai.com/v1",
        }),
        ("--api-key-env", {
            "required": True,
            "help": "name of the environment variable holding the bearer token",
        }),
        ("--model", {
            "required": True,
            "help": "model name string",
        }),
        ("--out", {
            "required": True,
            "help": "output JSONL path (appended, not overwritten)",
        }),
        ("--limit", {
            "type": int,
            "default": None,
            "help": "stop after N instances (for smoke tests)",
        }),
        ("--rate-limit-ms", {
            "type": int,
            "default": 0,
            "help": "milliseconds to sleep between requests (default 0)",
        }),
        ("--max-retries", {
            "type": int,
            "default": 3,
            "help": "retries on 429/5xx with exponential back-off (default 3)",
        }),
        ("--timeout-s", {
            "type": float,
            "default": 120.0,
            "help": "per-request HTTP timeout in seconds (default 120)",
        }),
    )
    for flag, spec in run_options:
        run_cmd.add_argument(flag, **spec)
    run_cmd.set_defaults(func=cmd_run)

    return parser
210
+
211
+
212
def cmd_provenance(_args: argparse.Namespace) -> int:
    """Print the public-slice provenance fields stored in the corpus JSON.

    Values come from the corpus' ``public_split_provenance`` mapping; absent
    keys are shown as a placeholder. The parsed arguments are not read.
    Always returns 0.
    """
    corpus = load_public_corpus()
    prov = corpus.get("public_split_provenance", {})

    print(f"public_corpus.name: {corpus.get('name', '<unnamed>')}")

    # (printed label, provenance key) pairs, in output order.
    scalar_fields = (
        ("split_schema_version", "split_schema_version"),
        ("split_seed", "split_seed"),
        ("split_public_ratio", "split_public_ratio"),
        ("source_corpus_sha256", "corpus_sha256"),
    )
    for label, key in scalar_fields:
        print(f"{label}: {prov.get(key, '<unknown>')}")

    print("public_axis_counts:")
    # A missing or null mapping is treated as empty.
    axis_counts = prov.get("public_axis_counts") or {}
    for axis, n in sorted(axis_counts.items()):
        print(f" {axis}: {n}")
    return 0
224
+
225
+
226
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and dispatch to the selected subcommand handler.

    Parameters
    ----------
    argv
        Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        The exit status returned by the subcommand handler.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)


if __name__ == "__main__":
    raise SystemExit(main())
conjure_eval/corpus.py ADDED
@@ -0,0 +1,79 @@
1
+ """Public-corpus loader for `conjure-eval`.
2
+
3
+ The frozen public-slice corpus JSON ships as package data under
4
+ `conjure_eval/data/public_corpus.json`. This module loads it lazily and
5
+ exposes a thin typed view over the instance records.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from dataclasses import dataclass
12
+ from importlib import resources
13
+ from typing import Any
14
+
15
+
16
# (package, resource name) pair that locates the bundled corpus JSON.
PUBLIC_CORPUS_RESOURCE = ("conjure_eval.data", "public_corpus.json")

# Filesystem path is exposed for tests and the CLI; package-data lookup is
# the source of truth at runtime.
PUBLIC_CORPUS_PATH = resources.files(PUBLIC_CORPUS_RESOURCE[0]).joinpath(
    PUBLIC_CORPUS_RESOURCE[1]
)
23
+
24
+
25
@dataclass(frozen=True)
class Instance:
    """Immutable, typed view of one public-slice CONJURE instance.

    `raw` keeps the complete source JSON record, so callers can still reach
    schema fields that this class does not model.
    """

    # Identifier of the instance.
    instance_id: str
    # Value of the record's "axis" field.
    axis: str
    # Prompt text; empty when the record carries no "prompt" field.
    prompt: str
    # The unmodified source record.
    raw: dict[str, Any]
37
+
38
+
39
def load_public_corpus() -> dict[str, Any]:
    """Read and parse the bundled public-corpus JSON.

    The file is located through package-data lookup and re-read on every
    call.

    Raises
    ------
    FileNotFoundError
        If the data file is absent from the installed package (i.e., the
        wheel was mis-built).
    """
    package, filename = PUBLIC_CORPUS_RESOURCE
    resource = resources.files(package).joinpath(filename)
    payload = resource.read_text(encoding="utf-8")
    return json.loads(payload)
51
+
52
+
53
def public_instance_ids() -> list[str]:
    """Return every public instance ID, sorted in ascending order."""
    records = load_public_corpus()["instances"]
    ids = [record["instance_id"] for record in records]
    ids.sort()
    return ids
57
+
58
+
59
def public_instance_by_id(instance_id: str) -> Instance:
    """Return the public-slice instance whose ID equals *instance_id*.

    The first matching record in corpus order is used. A record without a
    ``prompt`` field yields an empty prompt string.

    Raises
    ------
    KeyError
        If no record in the public slice carries that ID.
    """
    records = load_public_corpus()["instances"]
    match = next(
        (record for record in records if record["instance_id"] == instance_id),
        None,
    )
    if match is None:
        raise KeyError(
            f"{instance_id!r} is not in the public slice (or not present in "
            f"this conjure-eval release)"
        )
    return Instance(
        instance_id=match["instance_id"],
        axis=match["axis"],
        prompt=match.get("prompt", ""),
        raw=match,
    )
@@ -0,0 +1 @@
1
+ """Frozen public-corpus data for `conjure-eval`."""