conjure-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.4
2
+ Name: conjure-eval
3
+ Version: 0.1.0
4
+ Summary: Public-slice harness for the CONJURE transformative-creativity benchmark.
5
+ Author-email: Patrick Cooper <patrick.cooper@colorado.edu>
6
+ License: Apache-2.0
7
+ Keywords: benchmark,lean4,mathlib,llm,creativity
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ Provides-Extra: verify
20
+
21
+ # conjure-eval
22
+
23
+ Public-slice harness for the CONJURE transformative-creativity benchmark.
24
+ Ships the 358-instance public split (70 percent of the 510-instance Phase 4.6
25
+ frozen corpus across 17 Lakatos families, SHA-256
26
+ `33e9daebbfc1382b08c4b518f6bc9b30e62c13cc9d7e178327675929ebd74cc9`) so
27
+ frontier-model developers can self-evaluate locally before submitting to the
28
+ hidden split.
29
+
30
+ This package contains:
31
+
32
+ - The frozen public-slice corpus JSON (`conjure_eval.data.public_corpus`).
33
+ - A CLI for inspecting the corpus, driving a model pass, and checking
34
+ submission files before they are sent to the hidden-split adjudicator.
35
+ - The deterministic split provenance, so any third party can re-derive the
36
+ public/hidden split byte-for-byte from the source corpus.
37
+
38
+ ## What this package is and isn't
39
+
40
+ `conjure-eval` is a self-service developer convenience: it lets a model team
41
+ inspect the public contracts, run their model against the public slice, and
42
+ smoke-test their submission format before sending results to the benchmark
43
+ author. It does not ship the hidden split, and it does not run the
44
+ kernel-verified tight-mode adjudicator that produces the headline accept rate.
45
+ Those live in the private `blanc` repository and are operated by the benchmark
46
+ author against frozen model snapshots; the headline number reported in the
47
+ brief is the hidden-split rate.
48
+
49
+ ## Install
50
+
51
+ ```bash
52
+ pip install conjure-eval
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ ```bash
58
+ # List all 358 public-slice instance IDs
59
+ conjure-eval list-public
60
+
61
+ # Inspect a single instance
62
+ conjure-eval show C1-bv-001
63
+
64
+ # Drive a model pass (OpenAI-compatible endpoint)
65
+ conjure-eval run \
66
+ --base-url https://your-endpoint/v1 \
67
+ --api-key-env MY_API_KEY \
68
+ --model your-model-name \
69
+ --out submissions.jsonl
70
+
71
+ # Check submission file well-formedness before sending
72
+ conjure-eval verify-submission submissions.jsonl
73
+
74
+ # Print corpus provenance fields
75
+ conjure-eval provenance
76
+ ```
77
+
78
+ ## Provenance
79
+
80
+ The public corpus is a deterministic 70/30 axis-stratified slice of the
81
+ 510-instance Phase 4.6 frozen corpus maintained in the private `blanc`
82
+ repository. Seed: `4317`. Anyone with the source corpus can reproduce both
83
+ slices via `scripts/build_conjure_split.py`.
@@ -0,0 +1,63 @@
1
+ # conjure-eval
2
+
3
+ Public-slice harness for the CONJURE transformative-creativity benchmark.
4
+ Ships the 358-instance public split (70 percent of the 510-instance Phase 4.6
5
+ frozen corpus across 17 Lakatos families, SHA-256
6
+ `33e9daebbfc1382b08c4b518f6bc9b30e62c13cc9d7e178327675929ebd74cc9`) so
7
+ frontier-model developers can self-evaluate locally before submitting to the
8
+ hidden split.
9
+
10
+ This package contains:
11
+
12
+ - The frozen public-slice corpus JSON (`conjure_eval.data.public_corpus`).
13
+ - A CLI for inspecting the corpus, driving a model pass, and checking
14
+ submission files before they are sent to the hidden-split adjudicator.
15
+ - The deterministic split provenance, so any third party can re-derive the
16
+ public/hidden split byte-for-byte from the source corpus.
17
+
18
+ ## What this package is and isn't
19
+
20
+ `conjure-eval` is a self-service developer convenience: it lets a model team
21
+ inspect the public contracts, run their model against the public slice, and
22
+ smoke-test their submission format before sending results to the benchmark
23
+ author. It does not ship the hidden split, and it does not run the
24
+ kernel-verified tight-mode adjudicator that produces the headline accept rate.
25
+ Those live in the private `blanc` repository and are operated by the benchmark
26
+ author against frozen model snapshots; the headline number reported in the
27
+ brief is the hidden-split rate.
28
+
29
+ ## Install
30
+
31
+ ```bash
32
+ pip install conjure-eval
33
+ ```
34
+
35
+ ## Usage
36
+
37
+ ```bash
38
+ # List all 358 public-slice instance IDs
39
+ conjure-eval list-public
40
+
41
+ # Inspect a single instance
42
+ conjure-eval show C1-bv-001
43
+
44
+ # Drive a model pass (OpenAI-compatible endpoint)
45
+ conjure-eval run \
46
+ --base-url https://your-endpoint/v1 \
47
+ --api-key-env MY_API_KEY \
48
+ --model your-model-name \
49
+ --out submissions.jsonl
50
+
51
+ # Check submission file well-formedness before sending
52
+ conjure-eval verify-submission submissions.jsonl
53
+
54
+ # Print corpus provenance fields
55
+ conjure-eval provenance
56
+ ```
57
+
58
+ ## Provenance
59
+
60
+ The public corpus is a deterministic 70/30 axis-stratified slice of the
61
+ 510-instance Phase 4.6 frozen corpus maintained in the private `blanc`
62
+ repository. Seed: `4317`. Anyone with the source corpus can reproduce both
63
+ slices via `scripts/build_conjure_split.py`.
@@ -0,0 +1,43 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "conjure-eval"
7
+ version = "0.1.0"
8
+ description = "Public-slice harness for the CONJURE transformative-creativity benchmark."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "Apache-2.0" }
12
+ authors = [
13
+ { name = "Patrick Cooper", email = "patrick.cooper@colorado.edu" },
14
+ ]
15
+ keywords = ["benchmark", "lean4", "mathlib", "llm", "creativity"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: Apache Software License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Programming Language :: Python :: 3.13",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ ]
27
+ dependencies = []
28
+
29
+ [project.optional-dependencies]
30
+ verify = []
31
+
32
+ [project.scripts]
33
+ conjure-eval = "conjure_eval.cli:main"
34
+
35
+ [tool.setuptools]
36
+ package-dir = { "" = "src" }
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
40
+ include = ["conjure_eval*"]
41
+
42
+ [tool.setuptools.package-data]
43
+ conjure_eval = ["data/*.json"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,27 @@
1
+ """Public-slice harness for the CONJURE transformative-creativity benchmark.
2
+
3
+ This package ships the 358-instance public split (70 percent of the
4
+ 510-instance Phase 4.6 frozen corpus across 17 Lakatos families).
5
+ See README.md for the full description and the hidden-split policy.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .corpus import (
11
+ PUBLIC_CORPUS_PATH,
12
+ Instance,
13
+ load_public_corpus,
14
+ public_instance_ids,
15
+ public_instance_by_id,
16
+ )
17
+
18
+ __all__ = [
19
+ "PUBLIC_CORPUS_PATH",
20
+ "Instance",
21
+ "load_public_corpus",
22
+ "public_instance_ids",
23
+ "public_instance_by_id",
24
+ "__version__",
25
+ ]
26
+
27
+ __version__ = "0.1.0"
@@ -0,0 +1,233 @@
1
+ """`conjure-eval` command-line entry point.
2
+
3
+ Five subcommands ship in v0.1.0 (`provenance` and `run` are described under `--help`):
4
+
5
+ list-public print every public-slice instance ID
6
+ show <instance_id> print one instance prompt + axis
7
+ verify-submission <jsonl> well-formedness check on a JSONL of
8
+ submissions; reports per-record reasons.
9
+
10
+ These are *all* developer conveniences; the actual kernel-verified
11
+ adjudication runs against the hidden split inside the private repo.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import argparse
17
+ import json
18
+ import os
19
+ import sys
20
+ from pathlib import Path
21
+
22
+ from .corpus import (
23
+ load_public_corpus,
24
+ public_instance_by_id,
25
+ public_instance_ids,
26
+ )
27
+ from .runner import run_against_endpoint
28
+
29
+
30
def cmd_list_public(_args: argparse.Namespace) -> int:
    """Print every public-slice instance ID, one per line.

    The parsed arguments are unused; the parameter exists so the function
    fits the common `func(args)` subcommand signature. Always returns 0.
    """
    instance_ids = public_instance_ids()
    for instance_id in instance_ids:
        print(instance_id)
    return 0
34
+
35
+
36
def cmd_show(args: argparse.Namespace) -> int:
    """Print the ID, axis, and prompt of one public-slice instance.

    Returns
    -------
    int
        0 on success; 2 when the ID lookup raises `KeyError` (the error
        text is written to stderr).
    """
    try:
        instance = public_instance_by_id(args.instance_id)
    except KeyError as exc:
        print(str(exc), file=sys.stderr)
        return 2

    separator = "-" * 60
    # Fall back to a placeholder when the record carries no prompt text.
    body = instance.prompt or "<no prompt body in public-slice JSON>"

    print(f"instance_id: {instance.instance_id}")
    print(f"axis: {instance.axis}")
    print(separator)
    print(body)
    return 0
47
+
48
+
49
+ def _verify_record(rec: dict, public_ids: set[str]) -> list[str]:
50
+ """Return a list of human-readable errors for a single submission record.
51
+
52
+ Empty list means the record is well-formed.
53
+ """
54
+ errors: list[str] = []
55
+ iid = rec.get("instance_id")
56
+ if iid is None:
57
+ errors.append("missing field: instance_id")
58
+ elif iid not in public_ids:
59
+ errors.append(f"instance_id {iid!r} is not in the public slice")
60
+ if not isinstance(rec.get("submission"), str):
61
+ errors.append("missing or non-string field: submission")
62
+ return errors
63
+
64
+
65
def cmd_verify_submission(args: argparse.Namespace) -> int:
    """Check a JSONL submission file for well-formedness.

    Every non-blank line must decode to a JSON object that passes
    `_verify_record`. Per-line problems and a summary (record counts,
    coverage of the public slice, a sample of missing IDs) are printed to
    stdout.

    Returns
    -------
    int
        0 if every record is well-formed, 1 if at least one record failed,
        2 if the file does not exist or cannot be read as UTF-8 text.
    """
    path = Path(args.path)
    if not path.exists():
        print(f"no such file: {path}", file=sys.stderr)
        return 2

    # Directories, permission problems, and non-UTF-8 bytes should produce a
    # clean CLI error rather than a traceback.
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print(f"cannot read {path}: {exc}", file=sys.stderr)
        return 2

    public_ids = set(public_instance_ids())
    total = 0
    failed = 0
    seen_ids: set[str] = set()

    for lineno, line in enumerate(text.splitlines(), 1):
        line = line.strip()
        if not line:
            continue
        total += 1
        try:
            rec = json.loads(line)
        except json.JSONDecodeError as exc:
            failed += 1
            print(f"line {lineno}: invalid JSON: {exc}")
            continue
        # Valid JSON that is not an object (e.g. `[]`, `"x"`, `3`) has no
        # `.get`; count it as a failed record instead of crashing.
        if not isinstance(rec, dict):
            failed += 1
            print(
                f"line {lineno}: record must be a JSON object, "
                f"got {type(rec).__name__}"
            )
            continue
        errs = _verify_record(rec, public_ids)
        iid = rec.get("instance_id", "<unknown>")
        if errs:
            failed += 1
            for e in errs:
                print(f"line {lineno} ({iid}): {e}")
        else:
            # Only error-free records count towards public-slice coverage.
            seen_ids.add(iid)

    missing = sorted(public_ids - seen_ids)
    print()
    print(f"records: {total}")
    print(f"well-formed: {total - failed}")
    print(f"failed: {failed}")
    print(f"covered ids: {len(seen_ids)} / {len(public_ids)} public-slice instances")
    if missing:
        print("missing-id sample:", ", ".join(missing[:5]) + ("..." if len(missing) > 5 else ""))

    return 1 if failed else 0
106
+
107
+
108
def cmd_run(args: argparse.Namespace) -> int:
    """Drive a model pass over the public slice via `run_against_endpoint`.

    The bearer token is read from the environment variable named by
    `args.api_key_env`. Status and per-instance progress lines go to stderr.

    Returns
    -------
    int
        0 once `run_against_endpoint` returns; 2 if the environment
        variable is unset or empty.
    """
    token = os.getenv(args.api_key_env, "")
    if not token:
        print(
            f"error: environment variable {args.api_key_env!r} is not set or empty",
            file=sys.stderr,
        )
        return 2

    instances = load_public_corpus()["instances"]
    # Count shown in the banner only; the limit itself is passed through to
    # the runner unchanged.
    if args.limit:
        total = min(args.limit, len(instances))
    else:
        total = len(instances)

    banner = f"conjure-eval run: {total} instances, model={args.model}, out={args.out}"
    print(banner, file=sys.stderr)

    def _report(i: int, n: int, rec) -> None:
        # Error text is truncated to 60 characters to keep one line per instance.
        if rec.error is None:
            status = "ok"
        else:
            status = f"ERR: {rec.error[:60]}"
        print(f" [{i}/{n}] {rec.instance_id} {status}", file=sys.stderr)

    run_against_endpoint(
        base_url=args.base_url,
        api_key=token,
        model=args.model,
        out_path=args.out,
        instances=instances,
        limit=args.limit,
        rate_limit_ms=args.rate_limit_ms,
        max_retries=args.max_retries,
        timeout_s=args.timeout_s,
        progress_fn=_report,
    )
    print(f"done: results written to {args.out}", file=sys.stderr)
    return 0
145
+
146
+
147
def build_parser() -> argparse.ArgumentParser:
    """Build the `conjure-eval` argument parser.

    Registers the `list-public`, `show`, `verify-submission`, `provenance`,
    and `run` subcommands. Each subparser stores its handler under `func`,
    so callers dispatch with `args.func(args)`.
    """
    root = argparse.ArgumentParser(
        prog="conjure-eval",
        description=(
            "Public-slice harness for the CONJURE benchmark "
            "(Phase 4.6 frozen corpus, 358 public instances across 17 Lakatos families)."
        ),
    )
    commands = root.add_subparsers(dest="cmd", required=True)

    list_cmd = commands.add_parser(
        "list-public", help="print every public-slice instance ID"
    )
    list_cmd.set_defaults(func=cmd_list_public)

    show_cmd = commands.add_parser(
        "show", help="print one public-slice instance prompt + axis"
    )
    show_cmd.add_argument("instance_id", help="instance ID (see `list-public`)")
    show_cmd.set_defaults(func=cmd_show)

    verify_cmd = commands.add_parser(
        "verify-submission",
        help="well-formedness check on a JSONL of {instance_id, submission} records",
    )
    verify_cmd.add_argument("path", help="path to submissions.jsonl")
    verify_cmd.set_defaults(func=cmd_verify_submission)

    provenance_cmd = commands.add_parser(
        "provenance", help="print public-slice provenance fields"
    )
    provenance_cmd.set_defaults(func=cmd_provenance)

    run_cmd = commands.add_parser(
        "run",
        help="run a model against the public slice and write submissions.jsonl",
    )
    # Flag name -> add_argument keyword arguments, in the order they are
    # registered (which is also the order shown in `run --help`).
    run_options = (
        ("--base-url", {
            "required": True,
            "help": "OpenAI-compatible base URL, e.g. https://api.openai.com/v1",
        }),
        ("--api-key-env", {
            "required": True,
            "help": "name of the environment variable holding the bearer token",
        }),
        ("--model", {"required": True, "help": "model name string"}),
        ("--out", {
            "required": True,
            "help": "output JSONL path (appended, not overwritten)",
        }),
        ("--limit", {
            "type": int,
            "default": None,
            "help": "stop after N instances (for smoke tests)",
        }),
        ("--rate-limit-ms", {
            "type": int,
            "default": 0,
            "help": "milliseconds to sleep between requests (default 0)",
        }),
        ("--max-retries", {
            "type": int,
            "default": 3,
            "help": "retries on 429/5xx with exponential back-off (default 3)",
        }),
        ("--timeout-s", {
            "type": float,
            "default": 120.0,
            "help": "per-request HTTP timeout in seconds (default 120)",
        }),
    )
    for flag, options in run_options:
        run_cmd.add_argument(flag, **options)
    run_cmd.set_defaults(func=cmd_run)

    return root
210
+
211
+
212
def cmd_provenance(_args: argparse.Namespace) -> int:
    """Print the provenance fields stored in the public corpus JSON.

    Reads the corpus `name` and the `public_split_provenance` mapping,
    printing a placeholder for any field that is absent. The parsed
    arguments are unused. Always returns 0.
    """
    corpus = load_public_corpus()
    provenance = corpus.get("public_split_provenance", {})

    print(f"public_corpus.name: {corpus.get('name', '<unnamed>')}")

    # (printed label, key inside the provenance mapping)
    labelled_keys = (
        ("split_schema_version", "split_schema_version"),
        ("split_seed", "split_seed"),
        ("split_public_ratio", "split_public_ratio"),
        ("source_corpus_sha256", "corpus_sha256"),
    )
    for label, key in labelled_keys:
        print(f"{label}: {provenance.get(key, '<unknown>')}")

    print("public_axis_counts:")
    axis_counts = provenance.get("public_axis_counts") or {}
    for axis, count in sorted(axis_counts.items()):
        print(f" {axis}: {count}")
    return 0
224
+
225
+
226
def main(argv: list[str] | None = None) -> int:
    """Parse `argv` and dispatch to the selected subcommand handler.

    Parameters
    ----------
    argv
        Argument list passed to `parse_args`; `None` makes argparse fall
        back to `sys.argv[1:]`.

    Returns
    -------
    int
        The exit code returned by the subcommand handler.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
230
+
231
+
232
+ if __name__ == "__main__":
233
+ raise SystemExit(main())
@@ -0,0 +1,79 @@
1
+ """Public-corpus loader for `conjure-eval`.
2
+
3
+ The frozen public-slice corpus JSON ships as package data under
4
+ `conjure_eval/data/public_corpus.json`. This module loads it lazily and
5
+ exposes a thin typed view over the instance records.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from dataclasses import dataclass
12
+ from importlib import resources
13
+ from typing import Any
14
+
15
+
16
+ PUBLIC_CORPUS_RESOURCE = ("conjure_eval.data", "public_corpus.json")
17
+
18
+ # Filesystem path is exposed for tests and the CLI; package-data lookup is
19
+ # the source of truth at runtime.
20
+ PUBLIC_CORPUS_PATH = resources.files("conjure_eval.data").joinpath(
21
+ "public_corpus.json"
22
+ )
23
+
24
+
25
@dataclass(frozen=True)
class Instance:
    """A single public-slice CONJURE instance.

    Declared as a frozen dataclass, so fields cannot be reassigned after
    construction.

    The `raw` field preserves the full source JSON record so downstream
    tooling can read schema fields this package doesn't model yet.
    """

    # Identifier of the instance (the record's "instance_id" value).
    instance_id: str
    # Axis label (the record's "axis" value).
    axis: str
    # Prompt text; `public_instance_by_id` supplies "" when the record has
    # no "prompt" key.
    prompt: str
    # The source JSON record this view was built from.
    raw: dict[str, Any]
37
+
38
+
39
def load_public_corpus() -> dict[str, Any]:
    """Read and parse the packaged public-corpus JSON.

    The file is located through `importlib.resources` using the
    package/file pair in `PUBLIC_CORPUS_RESOURCE` and decoded as UTF-8.
    The JSON is re-read on every call.

    Raises
    ------
    FileNotFoundError
        If the package was installed without the data file present (i.e.,
        somebody mis-built the wheel).
    """
    package, filename = PUBLIC_CORPUS_RESOURCE
    resource = resources.files(package).joinpath(filename)
    payload = resource.read_text(encoding="utf-8")
    return json.loads(payload)
51
+
52
+
53
def public_instance_ids() -> list[str]:
    """Return every public instance ID in sorted order."""
    ids = [record["instance_id"] for record in load_public_corpus()["instances"]]
    ids.sort()
    return ids
57
+
58
+
59
def public_instance_by_id(instance_id: str) -> Instance:
    """Return the public-slice instance whose ID equals `instance_id`.

    The corpus is scanned in file order and the first matching record is
    wrapped in an `Instance`; a missing "prompt" key becomes "".

    Raises
    ------
    KeyError
        If the ID is not part of the public slice.
    """
    records = load_public_corpus()["instances"]
    match = next(
        (record for record in records if record["instance_id"] == instance_id),
        None,
    )
    if match is None:
        raise KeyError(
            f"{instance_id!r} is not in the public slice (or not present in "
            f"this conjure-eval release)"
        )
    return Instance(
        instance_id=match["instance_id"],
        axis=match["axis"],
        prompt=match.get("prompt", ""),
        raw=match,
    )
@@ -0,0 +1 @@
1
+ """Frozen public-corpus data for `conjure-eval`."""