PyPI - conjure-eval - Versions diffs - 0.1.0__tar.gz - Mend

conjure-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

conjure_eval-0.1.0/PKG-INFO +83 -0
conjure_eval-0.1.0/README.md +63 -0
conjure_eval-0.1.0/pyproject.toml +43 -0
conjure_eval-0.1.0/setup.cfg +4 -0
conjure_eval-0.1.0/src/conjure_eval/__init__.py +27 -0
conjure_eval-0.1.0/src/conjure_eval/cli.py +233 -0
conjure_eval-0.1.0/src/conjure_eval/corpus.py +79 -0
conjure_eval-0.1.0/src/conjure_eval/data/__init__.py +1 -0
conjure_eval-0.1.0/src/conjure_eval/data/public_corpus.json +5516 -0
conjure_eval-0.1.0/src/conjure_eval/runner.py +200 -0
conjure_eval-0.1.0/src/conjure_eval.egg-info/PKG-INFO +83 -0
conjure_eval-0.1.0/src/conjure_eval.egg-info/SOURCES.txt +17 -0
conjure_eval-0.1.0/src/conjure_eval.egg-info/dependency_links.txt +1 -0
conjure_eval-0.1.0/src/conjure_eval.egg-info/entry_points.txt +2 -0
conjure_eval-0.1.0/src/conjure_eval.egg-info/requires.txt +2 -0
conjure_eval-0.1.0/src/conjure_eval.egg-info/top_level.txt +1 -0
conjure_eval-0.1.0/tests/test_cli.py +86 -0
conjure_eval-0.1.0/tests/test_corpus.py +93 -0
conjure_eval-0.1.0/tests/test_runner.py +187 -0

conjure_eval-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,83 @@
+Metadata-Version: 2.4
+Name: conjure-eval
+Version: 0.1.0
+Summary: Public-slice harness for the CONJURE transformative-creativity benchmark.
+Author-email: Patrick Cooper <patrick.cooper@colorado.edu>
+License: Apache-2.0
+Keywords: benchmark,lean4,mathlib,llm,creativity
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Provides-Extra: verify
+# conjure-eval
+Public-slice harness for the CONJURE transformative-creativity benchmark.
+Ships the 358-instance public split (70 percent of the 510-instance Phase 4.6
+frozen corpus across 17 Lakatos families, SHA-256
+`33e9daebbfc1382b08c4b518f6bc9b30e62c13cc9d7e178327675929ebd74cc9`) so
+frontier-model developers can self-evaluate locally before submitting to the
+hidden split.
+This package contains:
+- The frozen public-slice corpus JSON (`conjure_eval.data.public_corpus`).
+- A CLI for inspecting the corpus, driving a model pass, and checking
+  submission files before they are sent to the hidden-split adjudicator.
+- The deterministic split provenance, so any third party can re-derive the
+  public/hidden split byte-for-byte from the source corpus.
+## What this package is and isn't
+`conjure-eval` is a self-service developer convenience: it lets a model team
+inspect the public contracts, run their model against the public slice, and
+smoke-test their submission format before sending results to the benchmark
+author. It does not ship the hidden split, and it does not run the
+kernel-verified tight-mode adjudicator that produces the headline accept rate.
+Those live in the private `blanc` repository and are operated by the benchmark
+author against frozen model snapshots; the headline number reported in the
+brief is the hidden-split rate.
+## Install
+```bash
+pip install conjure-eval
+```
+## Usage
+```bash
+# List all 358 public-slice instance IDs
+conjure-eval list-public
+# Inspect a single instance
+conjure-eval show C1-bv-001
+# Drive a model pass (OpenAI-compatible endpoint)
+conjure-eval run \
+    --base-url https://your-endpoint/v1 \
+    --api-key-env MY_API_KEY \
+    --model your-model-name \
+    --out submissions.jsonl
+# Check submission file well-formedness before sending
+conjure-eval verify-submission submissions.jsonl
+# Print corpus provenance fields
+conjure-eval provenance
+```
+## Provenance
+The public corpus is a deterministic 70/30 axis-stratified slice of the
+510-instance Phase 4.6 frozen corpus maintained in the private `blanc`
+repository. Seed: `4317`. Anyone with the source corpus can reproduce both
+slices via `scripts/build_conjure_split.py`.

conjure_eval-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,63 @@
+# conjure-eval
+Public-slice harness for the CONJURE transformative-creativity benchmark.
+Ships the 358-instance public split (70 percent of the 510-instance Phase 4.6
+frozen corpus across 17 Lakatos families, SHA-256
+`33e9daebbfc1382b08c4b518f6bc9b30e62c13cc9d7e178327675929ebd74cc9`) so
+frontier-model developers can self-evaluate locally before submitting to the
+hidden split.
+This package contains:
+- The frozen public-slice corpus JSON (`conjure_eval.data.public_corpus`).
+- A CLI for inspecting the corpus, driving a model pass, and checking
+  submission files before they are sent to the hidden-split adjudicator.
+- The deterministic split provenance, so any third party can re-derive the
+  public/hidden split byte-for-byte from the source corpus.
+## What this package is and isn't
+`conjure-eval` is a self-service developer convenience: it lets a model team
+inspect the public contracts, run their model against the public slice, and
+smoke-test their submission format before sending results to the benchmark
+author. It does not ship the hidden split, and it does not run the
+kernel-verified tight-mode adjudicator that produces the headline accept rate.
+Those live in the private `blanc` repository and are operated by the benchmark
+author against frozen model snapshots; the headline number reported in the
+brief is the hidden-split rate.
+## Install
+```bash
+pip install conjure-eval
+```
+## Usage
+```bash
+# List all 358 public-slice instance IDs
+conjure-eval list-public
+# Inspect a single instance
+conjure-eval show C1-bv-001
+# Drive a model pass (OpenAI-compatible endpoint)
+conjure-eval run \
+    --base-url https://your-endpoint/v1 \
+    --api-key-env MY_API_KEY \
+    --model your-model-name \
+    --out submissions.jsonl
+# Check submission file well-formedness before sending
+conjure-eval verify-submission submissions.jsonl
+# Print corpus provenance fields
+conjure-eval provenance
+```
+## Provenance
+The public corpus is a deterministic 70/30 axis-stratified slice of the
+510-instance Phase 4.6 frozen corpus maintained in the private `blanc`
+repository. Seed: `4317`. Anyone with the source corpus can reproduce both
+slices via `scripts/build_conjure_split.py`.

conjure_eval-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,43 @@
+[build-system]
+requires = ["setuptools>=68.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "conjure-eval"
+version = "0.1.0"
+description = "Public-slice harness for the CONJURE transformative-creativity benchmark."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { text = "Apache-2.0" }
+authors = [
+    { name = "Patrick Cooper", email = "patrick.cooper@colorado.edu" },
+]
+keywords = ["benchmark", "lean4", "mathlib", "llm", "creativity"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = []
+[project.optional-dependencies]
+verify = []
+[project.scripts]
+conjure-eval = "conjure_eval.cli:main"
+[tool.setuptools]
+package-dir = { "" = "src" }
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["conjure_eval*"]
+[tool.setuptools.package-data]
+conjure_eval = ["data/*.json"]

conjure_eval-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

conjure_eval-0.1.0/src/conjure_eval/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+"""Public-slice harness for the CONJURE transformative-creativity benchmark.
+This package ships the 358-instance public split (70 percent of the
+510-instance Phase 4.6 frozen corpus across 17 Lakatos families).
+See README.md for the full description and the hidden-split policy.
+"""
+from __future__ import annotations
+from .corpus import (
+    PUBLIC_CORPUS_PATH,
+    Instance,
+    load_public_corpus,
+    public_instance_ids,
+    public_instance_by_id,
+)
+# Public API of the package: the corpus helpers imported from `.corpus`
+# plus the package version string.
+__all__ = [
+    "PUBLIC_CORPUS_PATH",
+    "Instance",
+    "load_public_corpus",
+    "public_instance_ids",
+    "public_instance_by_id",
+    "__version__",
+]
+# Package version; matches the `version` field in pyproject.toml.
+__version__ = "0.1.0"

conjure_eval-0.1.0/src/conjure_eval/cli.py ADDED Viewed

@@ -0,0 +1,233 @@
+"""`conjure-eval` command-line entry point.
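+Five subcommands ship in v0.1.0:
+  list-public                        print every public-slice instance ID
+  show <instance_id>                 print one instance prompt + axis
+  verify-submission <jsonl>          well-formedness check on a JSONL of
+                                     submissions; reports per-record reasons.
+  provenance                         print public-slice provenance fields
+  run                                run a model against the public slice and
+                                     append the results to a JSONL file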
+These are *all* developer conveniences; the actual kernel-verified
+adjudication runs against the hidden split inside the private repo.
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+from .corpus import (
+    load_public_corpus,
+    public_instance_by_id,
+    public_instance_ids,
+)
+from .runner import run_against_endpoint
+def cmd_list_public(_args: argparse.Namespace) -> int:
+    """Print every public-slice instance ID, one per line.
+    The parsed arguments are not used; the IDs come from
+    `public_instance_ids()`. Always returns exit status 0.
+    """
+    instance_ids = public_instance_ids()
+    for instance_id in instance_ids:
+        print(instance_id)
+    return 0
+def cmd_show(args: argparse.Namespace) -> int:
+    """Print the ID, axis, and prompt of one public-slice instance.
+    Returns 0 on success. When `args.instance_id` cannot be looked up, the
+    reason is written to stderr and 2 is returned.
+    """
+    try:
+        inst = public_instance_by_id(args.instance_id)
+    except KeyError as exc:
+        # str(KeyError(msg)) is repr(msg), which would wrap the whole message
+        # in an extra pair of quotes; print the raw message instead.
+        print(exc.args[0] if exc.args else exc, file=sys.stderr)
+        return 2
+    print(f"instance_id: {inst.instance_id}")
+    print(f"axis:        {inst.axis}")
+    print("-" * 60)
+    # An empty prompt is replaced by a placeholder so the output is never blank.
+    print(inst.prompt or "<no prompt body in public-slice JSON>")
+    return 0
+def _verify_record(rec: dict, public_ids: set[str]) -> list[str]:
+    """Return a list of human-readable errors for a single submission record.
+    Empty list means the record is well-formed: a JSON object whose
+    `instance_id` is a member of `public_ids` and whose `submission` is a
+    string.
+    Parameters
+    ----------
+    rec
+        The decoded JSON value of one submission line. Anything that is not
+        a dict is reported as an error instead of raising.
+    public_ids
+        The set of instance IDs that belong to the public slice.
+    """
+    # A valid JSON line can still decode to a list, string, number, ...;
+    # those have no `.get`, so report them rather than crash.
+    if not isinstance(rec, dict):
+        return [f"record is not a JSON object (got {type(rec).__name__})"]
+    errors: list[str] = []
+    iid = rec.get("instance_id")
+    if iid is None:
+        errors.append("missing field: instance_id")
+    # The isinstance check short-circuits before the set lookup, which would
+    # raise TypeError for unhashable values such as lists or dicts.
+    elif not isinstance(iid, str) or iid not in public_ids:
+        errors.append(f"instance_id {iid!r} is not in the public slice")
+    if not isinstance(rec.get("submission"), str):
+        errors.append("missing or non-string field: submission")
+    return errors
+def cmd_verify_submission(args: argparse.Namespace) -> int:
+    """Check a JSONL submission file for well-formedness and print a report.
+    Every non-blank line of `args.path` must be a JSON object that passes
+    `_verify_record`. Per-line problems and a summary (record counts and
+    public-slice coverage) are printed to stdout.
+    Returns 0 when every record is well-formed, 1 when at least one record
+    failed, and 2 when the file is missing or cannot be read (reason on
+    stderr).
+    """
+    path = Path(args.path)
+    # is_file() also rejects directories, which exists() would let through
+    # only for read_text() to raise.
+    if not path.is_file():
+        print(f"no such file: {path}", file=sys.stderr)
+        return 2
+    try:
+        text = path.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as exc:
+        print(f"cannot read {path}: {exc}", file=sys.stderr)
+        return 2
+    public_ids = set(public_instance_ids())
+    total = 0
+    failed = 0
+    seen_ids: set[str] = set()
+    for lineno, line in enumerate(text.splitlines(), 1):
+        line = line.strip()
+        if not line:
+            continue
+        total += 1
+        try:
+            rec = json.loads(line)
+        except json.JSONDecodeError as exc:
+            failed += 1
+            print(f"line {lineno}: invalid JSON: {exc}")
+            continue
+        # Valid JSON that is not an object (list, string, number, ...) has no
+        # `.get`; count it as a failed record instead of crashing.
+        if not isinstance(rec, dict):
+            failed += 1
+            print(
+                f"line {lineno}: record is not a JSON object "
+                f"(got {type(rec).__name__})"
+            )
+            continue
+        errs = _verify_record(rec, public_ids)
+        iid = rec.get("instance_id", "<unknown>")
+        if errs:
+            failed += 1
+            for e in errs:
+                print(f"line {lineno} ({iid}): {e}")
+        else:
+            seen_ids.add(iid)
+    missing = sorted(public_ids - seen_ids)
+    print()
+    print(f"records:    {total}")
+    print(f"well-formed: {total - failed}")
+    print(f"failed:     {failed}")
+    print(f"covered ids: {len(seen_ids)} / {len(public_ids)} public-slice instances")
+    if missing:
+        # Only the first five missing IDs are shown to keep the report short.
+        print("missing-id sample:", ", ".join(missing[:5]) + ("..." if len(missing) > 5 else ""))
+    return 1 if failed else 0
+def cmd_run(args: argparse.Namespace) -> int:
+    """Run a model against the public slice via `run_against_endpoint`.
+    The bearer token is read from the environment variable named by
+    `args.api_key_env`; if it is unset or empty an error is printed to
+    stderr and 2 is returned. Progress lines and the final summary go to
+    stderr. Returns 0 once the run has finished.
+    """
+    key_var = args.api_key_env
+    token = os.environ.get(key_var, "")
+    if not token:
+        print(
+            f"error: environment variable {key_var!r} is not set or empty",
+            file=sys.stderr,
+        )
+        return 2
+    instances = load_public_corpus()["instances"]
+    # A falsy limit (None or 0) means "all instances" for the banner count.
+    if args.limit:
+        planned = min(args.limit, len(instances))
+    else:
+        planned = len(instances)
+    print(
+        f"conjure-eval run: {planned} instances, model={args.model}, "
+        f"out={args.out}",
+        file=sys.stderr,
+    )
+    def _report(i: int, n: int, rec) -> None:
+        # Error text is cut to 60 characters to keep each progress line short.
+        if rec.error is None:
+            outcome = "ok"
+        else:
+            outcome = f"ERR: {rec.error[:60]}"
+        print(f"  [{i}/{n}] {rec.instance_id} {outcome}", file=sys.stderr)
+    run_against_endpoint(
+        base_url=args.base_url,
+        api_key=token,
+        model=args.model,
+        out_path=args.out,
+        instances=instances,
+        limit=args.limit,
+        rate_limit_ms=args.rate_limit_ms,
+        max_retries=args.max_retries,
+        timeout_s=args.timeout_s,
+        progress_fn=_report,
+    )
+    print(f"done: results written to {args.out}", file=sys.stderr)
+    return 0
+def build_parser() -> argparse.ArgumentParser:
+    """Construct the `conjure-eval` argument parser.
+    Registers the `list-public`, `show`, `verify-submission`, `provenance`
+    and `run` subcommands, in that order. Each subcommand stores its handler
+    function under the `func` attribute of the parsed namespace, and a
+    subcommand is required.
+    """
+    root = argparse.ArgumentParser(
+        prog="conjure-eval",
+        description=(
+            "Public-slice harness for the CONJURE benchmark "
+            "(Phase 4.6 frozen corpus, 358 public instances across 17 Lakatos families)."
+        ),
+    )
+    commands = root.add_subparsers(dest="cmd", required=True)
+    commands.add_parser(
+        "list-public", help="print every public-slice instance ID"
+    ).set_defaults(func=cmd_list_public)
+    show_cmd = commands.add_parser(
+        "show", help="print one public-slice instance prompt + axis"
+    )
+    show_cmd.add_argument("instance_id", help="instance ID (see `list-public`)")
+    show_cmd.set_defaults(func=cmd_show)
+    verify_cmd = commands.add_parser(
+        "verify-submission",
+        help="well-formedness check on a JSONL of {instance_id, submission} records",
+    )
+    verify_cmd.add_argument("path", help="path to submissions.jsonl")
+    verify_cmd.set_defaults(func=cmd_verify_submission)
+    commands.add_parser(
+        "provenance", help="print public-slice provenance fields"
+    ).set_defaults(func=cmd_provenance)
+    run_cmd = commands.add_parser(
+        "run",
+        help="run a model against the public slice and write submissions.jsonl",
+    )
+    # The flags are declared as a table and added in this exact order, which
+    # is also the order they appear in `run --help`.
+    run_flags = (
+        ("--base-url", {
+            "required": True,
+            "help": "OpenAI-compatible base URL, e.g. https://api.openai.com/v1",
+        }),
+        ("--api-key-env", {
+            "required": True,
+            "help": "name of the environment variable holding the bearer token",
+        }),
+        ("--model", {"required": True, "help": "model name string"}),
+        ("--out", {
+            "required": True,
+            "help": "output JSONL path (appended, not overwritten)",
+        }),
+        ("--limit", {
+            "type": int,
+            "default": None,
+            "help": "stop after N instances (for smoke tests)",
+        }),
+        ("--rate-limit-ms", {
+            "type": int,
+            "default": 0,
+            "help": "milliseconds to sleep between requests (default 0)",
+        }),
+        ("--max-retries", {
+            "type": int,
+            "default": 3,
+            "help": "retries on 429/5xx with exponential back-off (default 3)",
+        }),
+        ("--timeout-s", {
+            "type": float,
+            "default": 120.0,
+            "help": "per-request HTTP timeout in seconds (default 120)",
+        }),
+    )
+    for flag, options in run_flags:
+        run_cmd.add_argument(flag, **options)
+    run_cmd.set_defaults(func=cmd_run)
+    return root
+def cmd_provenance(_args: argparse.Namespace) -> int:
+    """Print the provenance fields stored in the public corpus JSON.
+    Reads the corpus `name` and the `public_split_provenance` mapping, prints
+    one labelled line per field (with a placeholder for absent fields), then
+    the per-axis public counts sorted by axis. The parsed arguments are not
+    used. Always returns exit status 0.
+    """
+    corpus = load_public_corpus()
+    prov = corpus.get("public_split_provenance", {})
+    # (padded label, mapping to read, key, placeholder when the key is absent)
+    rows = (
+        ("public_corpus.name:    ", corpus, "name", "<unnamed>"),
+        ("split_schema_version:  ", prov, "split_schema_version", "<unknown>"),
+        ("split_seed:            ", prov, "split_seed", "<unknown>"),
+        ("split_public_ratio:    ", prov, "split_public_ratio", "<unknown>"),
+        ("source_corpus_sha256:  ", prov, "corpus_sha256", "<unknown>"),
+    )
+    for label, mapping, key, placeholder in rows:
+        print(f"{label}{mapping.get(key, placeholder)}")
+    print("public_axis_counts:")
+    # A missing or null counts entry is treated as an empty mapping.
+    axis_counts = prov.get("public_axis_counts") or {}
+    for axis, count in sorted(axis_counts.items()):
+        print(f"  {axis}: {count}")
+    return 0
+def main(argv: list[str] | None = None) -> int:
+    """Parse `argv` and dispatch to the selected subcommand handler.
+    `argv` defaults to None, in which case argparse reads the process
+    arguments. Returns the handler's exit status.
+    """
+    namespace = build_parser().parse_args(argv)
+    handler = namespace.func
+    return handler(namespace)
+if __name__ == "__main__":
+    raise SystemExit(main())

conjure_eval-0.1.0/src/conjure_eval/corpus.py ADDED Viewed

@@ -0,0 +1,79 @@
+"""Public-corpus loader for `conjure-eval`.
+The frozen public-slice corpus JSON ships as package data under
+`conjure_eval/data/public_corpus.json`. This module loads it lazily and
+exposes a thin typed view over the instance records.
+"""
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from importlib import resources
+from typing import Any
+# (package, resource name) pair identifying the bundled corpus JSON.
+PUBLIC_CORPUS_RESOURCE = ("conjure_eval.data", "public_corpus.json")
+# Filesystem path is exposed for tests and the CLI; package-data lookup is
+# the source of truth at runtime.
+# NOTE(review): `resources.files()` returns a Traversable, which is presumably
+# only a real filesystem path when the package is installed unpacked; confirm
+# before handing this to APIs that require an `os.PathLike`.
+PUBLIC_CORPUS_PATH = resources.files("conjure_eval.data").joinpath(
+    "public_corpus.json"
+)
+@dataclass(frozen=True)
+class Instance:
+    """A single public-slice CONJURE instance.
+    The `raw` field preserves the full source JSON record so downstream
+    tooling can read schema fields this package doesn't model yet.
+    The dataclass is frozen, so fields cannot be reassigned after creation.
+    """
+    # The record's "instance_id" value.
+    instance_id: str
+    # The record's "axis" value.
+    axis: str
+    # The record's "prompt" value; `public_instance_by_id` stores an empty
+    # string when the record has no "prompt" key.
+    prompt: str
+    # The complete JSON record this instance was built from.
+    raw: dict[str, Any]
+def load_public_corpus() -> dict[str, Any]:
+    """Read and parse the bundled public-corpus JSON.
+    The file is located through `importlib.resources` using the
+    (package, name) pair in `PUBLIC_CORPUS_RESOURCE` and decoded as UTF-8.
+    It is read and parsed afresh on every call.
+    Raises
+    ------
+    FileNotFoundError
+        If the package was installed without the data file present (i.e.,
+        somebody mis-built the wheel).
+    """
+    package, filename = PUBLIC_CORPUS_RESOURCE
+    resource = resources.files(package).joinpath(filename)
+    with resource.open("r", encoding="utf-8") as handle:
+        return json.load(handle)
+def public_instance_ids() -> list[str]:
+    """Return every public instance ID in ascending sorted order."""
+    records = load_public_corpus()["instances"]
+    ids = [record["instance_id"] for record in records]
+    ids.sort()
+    return ids
+def public_instance_by_id(instance_id: str) -> Instance:
+    """Return the public-slice instance whose ID equals `instance_id`.
+    The corpus records are scanned in file order and the first match is
+    wrapped in an `Instance`; a record without a "prompt" key gets an empty
+    prompt.
+    Raises
+    ------
+    KeyError
+        If the ID is not part of the public slice.
+    """
+    records = load_public_corpus()["instances"]
+    match = next(
+        (record for record in records if record["instance_id"] == instance_id),
+        None,
+    )
+    if match is None:
+        raise KeyError(
+            f"{instance_id!r} is not in the public slice (or not present in "
+            f"this conjure-eval release)"
+        )
+    return Instance(
+        instance_id=match["instance_id"],
+        axis=match["axis"],
+        prompt=match.get("prompt", ""),
+        raw=match,
+    )

conjure_eval-0.1.0/src/conjure_eval/data/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Frozen public-corpus data for `conjure-eval`."""