june-bench 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. june_bench-0.0.1/PKG-INFO +58 -0
  2. june_bench-0.0.1/README.md +28 -0
  3. june_bench-0.0.1/june_bench/__init__.py +19 -0
  4. june_bench-0.0.1/june_bench/cli.py +104 -0
  5. june_bench-0.0.1/june_bench/datasets/__init__.py +7 -0
  6. june_bench-0.0.1/june_bench/datasets/fixtures/hotpot.smoke.json +20 -0
  7. june_bench-0.0.1/june_bench/datasets/fixtures/june.smoke.json +13 -0
  8. june_bench-0.0.1/june_bench/datasets/fixtures/smoke.jsonl +4 -0
  9. june_bench-0.0.1/june_bench/datasets/loaders.py +138 -0
  10. june_bench-0.0.1/june_bench/datasets/registry.py +40 -0
  11. june_bench-0.0.1/june_bench/ports.py +73 -0
  12. june_bench-0.0.1/june_bench/profiles.py +37 -0
  13. june_bench-0.0.1/june_bench/report.py +60 -0
  14. june_bench-0.0.1/june_bench/runner.py +46 -0
  15. june_bench-0.0.1/june_bench/score.py +91 -0
  16. june_bench-0.0.1/june_bench/systems/__init__.py +49 -0
  17. june_bench-0.0.1/june_bench/systems/base.py +35 -0
  18. june_bench-0.0.1/june_bench/systems/cognee.py +100 -0
  19. june_bench-0.0.1/june_bench/systems/june_api.py +76 -0
  20. june_bench-0.0.1/june_bench/systems/june_local.py +30 -0
  21. june_bench-0.0.1/june_bench.egg-info/PKG-INFO +58 -0
  22. june_bench-0.0.1/june_bench.egg-info/SOURCES.txt +34 -0
  23. june_bench-0.0.1/june_bench.egg-info/dependency_links.txt +1 -0
  24. june_bench-0.0.1/june_bench.egg-info/entry_points.txt +2 -0
  25. june_bench-0.0.1/june_bench.egg-info/requires.txt +15 -0
  26. june_bench-0.0.1/june_bench.egg-info/top_level.txt +1 -0
  27. june_bench-0.0.1/pyproject.toml +51 -0
  28. june_bench-0.0.1/setup.cfg +4 -0
  29. june_bench-0.0.1/tests/test_sb0_smoke.py +63 -0
  30. june_bench-0.0.1/tests/test_sb1_parity.py +105 -0
  31. june_bench-0.0.1/tests/test_sb1_profiles.py +39 -0
  32. june_bench-0.0.1/tests/test_sb2_loaders.py +88 -0
  33. june_bench-0.0.1/tests/test_sb3_june_api.py +119 -0
  34. june_bench-0.0.1/tests/test_sb4_cognee.py +93 -0
  35. june_bench-0.0.1/tests/test_sb5_suite.py +57 -0
  36. june_bench-0.0.1/tests/test_sb6_wiring.py +58 -0
@@ -0,0 +1,58 @@
1
+ Metadata-Version: 2.4
2
+ Name: june-bench
3
+ Version: 0.0.1
4
+ Summary: Reproducible benchmark suite for memory/QA systems — June + pluggable competitors.
5
+ Author: Junemind
6
+ License: Proprietary
7
+ Project-URL: Homepage, https://github.com/Junemind
8
+ Keywords: benchmark,rag,memory,qa,retrieval,evaluation,llm,june
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: License :: Other/Proprietary License
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Provides-Extra: june-api
21
+ Requires-Dist: httpx>=0.24; extra == "june-api"
22
+ Provides-Extra: june-local
23
+ Requires-Dist: june>=0.1.0; extra == "june-local"
24
+ Provides-Extra: cognee
25
+ Requires-Dist: cognee[evals]; extra == "cognee"
26
+ Provides-Extra: all
27
+ Requires-Dist: httpx>=0.24; extra == "all"
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest>=8; extra == "dev"
30
+
31
+ # june-bench
32
+
33
+ A pip-installable, **reproducible** benchmark suite for memory / QA systems — **June + pluggable
34
+ competitors** — over LoCoMo, LongMemEval, HotpotQA/2Wiki/MuSiQue, and FinanceBench, with the same
35
+ data and the same scorer.
36
+
37
+ ```bash
38
+ pip install june-bench
39
+ june-bench list
40
+ june-bench run --system echo --dataset smoke --split smoke # offline, no key, no download
41
+ ```
42
+
43
+ A benchmark is `run(system, dataset) → records → score`. Two typed ports are the only extension
44
+ points:
45
+
46
+ * **`System`** — the thing benchmarked. `JuneApiSystem` (default; a thin HTTP client to June's
47
+ `/v1/answer`, so **no June source is shipped**), `JuneLocalSystem` (`[june-local]` extra; a
48
+ source-protected compiled wheel), `CogneeSystem` (`[cognee]` extra), or any future system as one
49
+ adapter.
50
+ * **`Dataset`** — what it runs on. The four benchmarks behind a registry.
51
+
52
+ The scorer is the canonical SQuAD/HotpotQA EM/F1 + selective-accuracy/coverage/cost — Cognee-comparable.
53
+ Tiny **smoke fixtures ship in the wheel** (offline wiring proof); full splits are **fetched, sha-verified,
54
+ from a pinned release**. No score is ever baked into the package — every result row records
55
+ dataset + scorer + system + model + cost, so a published number is reproducible by a stranger.
56
+
57
+ Status: **SB0** (contracts + no-deps smoke + skeleton). Datasets, June/Cognee systems, and the full CLI
58
+ land in SB1–SB6.
@@ -0,0 +1,28 @@
1
+ # june-bench
2
+
3
+ A pip-installable, **reproducible** benchmark suite for memory / QA systems — **June + pluggable
4
+ competitors** — over LoCoMo, LongMemEval, HotpotQA/2Wiki/MuSiQue, and FinanceBench, with the same
5
+ data and the same scorer.
6
+
7
+ ```bash
8
+ pip install june-bench
9
+ june-bench list
10
+ june-bench run --system echo --dataset smoke --split smoke # offline, no key, no download
11
+ ```
12
+
13
+ A benchmark is `run(system, dataset) → records → score`. Two typed ports are the only extension
14
+ points:
15
+
16
+ * **`System`** — the thing benchmarked. `JuneApiSystem` (default; a thin HTTP client to June's
17
+ `/v1/answer`, so **no June source is shipped**), `JuneLocalSystem` (`[june-local]` extra; a
18
+ source-protected compiled wheel), `CogneeSystem` (`[cognee]` extra), or any future system as one
19
+ adapter.
20
+ * **`Dataset`** — what it runs on. The four benchmarks behind a registry.
21
+
22
+ The scorer is the canonical SQuAD/HotpotQA EM/F1 + selective-accuracy/coverage/cost — Cognee-comparable.
23
+ Tiny **smoke fixtures ship in the wheel** (offline wiring proof); full splits are **fetched, sha-verified,
24
+ from a pinned release**. No score is ever baked into the package — every result row records
25
+ dataset + scorer + system + model + cost, so a published number is reproducible by a stranger.
26
+
27
+ Status: **SB0** (contracts + no-deps smoke + skeleton). Datasets, June/Cognee systems, and the full CLI
28
+ land in SB1–SB6.
@@ -0,0 +1,19 @@
1
+ """june_bench — a pip-installable, reproducible benchmark suite for memory/QA systems.
2
+
3
+ `run(system, dataset) → records → score`. Two typed ports (`System`, `Dataset`) are the only
4
+ extension points; June and pluggable competitors plug in behind `System`, the four benchmarks behind
5
+ `Dataset`. The harness core is pure (no model, no network) and unit-tests with the bundled `smoke`
6
+ fixtures; June is reached over its REST API by default (no June source shipped).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from june_bench.ports import Dataset, Example, Prediction, Record, System
11
+ from june_bench.runner import run, run_async
12
+ from june_bench.score import score
13
+
14
+ __version__ = "0.0.1"
15
+
16
+ __all__ = [
17
+ "Example", "Prediction", "Record", "System", "Dataset",
18
+ "run", "run_async", "score", "__version__",
19
+ ]
@@ -0,0 +1,104 @@
1
+ """june-bench CLI — `list`, `run`, and `suite` (a systems × datasets matrix). Standalone `score`/`report` subcommands are planned for SB5.
2
+
3
+ june-bench list
4
+ june-bench run --system echo --dataset smoke --split smoke
5
+ june-bench run --system null --dataset smoke --json
6
+
7
+ The `echo`/`null` systems + the bundled `smoke` dataset need no deps, key, or network, so this runs
8
+ offline immediately after `pip install june-bench`.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import sys
14
+
15
+ import pathlib
16
+
17
+ from june_bench import datasets, systems
18
+ from june_bench.report import suite_json, suite_markdown, to_json, to_markdown
19
+ from june_bench.runner import run
20
+ from june_bench.score import score
21
+
22
+
23
def _cmd_list(_args) -> int:  # noqa: ANN001
    """Print the registered system and dataset names, one comma-separated line each.

    The parsed arguments are unused. Returns the process exit code (always 0).
    """
    sources = (("systems: ", systems.names), ("datasets:", datasets.registry.names))
    for label, list_names in sources:
        print(label, ", ".join(list_names()))
    return 0
27
+
28
+
29
def _cmd_run(args) -> int:  # noqa: ANN001
    """Run one system on one dataset split and print the scored summary.

    The summary is printed as JSON when ``args.json`` is set, otherwise as a markdown row.

    Returns the process exit code: 0 on success, 2 when the system or dataset name is not
    registered or when the requested split's data file cannot be found.
    """
    try:
        system = systems.get(args.system)
        dataset = datasets.registry.get(args.dataset)
    except KeyError as exc:
        # str(KeyError) is the repr of its argument (extra quotes); print the message itself.
        print(f"error: {exc.args[0] if exc.args else exc}", file=sys.stderr)
        return 2
    try:
        records = run(system, dataset, split=args.split, limit=args.limit)
    except FileNotFoundError as exc:
        # A missing data file is a usage problem: report it cleanly instead of a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 2
    summary = score(records)
    render = to_json if args.json else to_markdown
    print(render(summary, system=args.system, dataset=args.dataset,
                 split=args.split, model=args.model))
    return 0
45
+
46
+
47
def _cmd_suite(args) -> int:  # noqa: ANN001
    """Run every requested system on every requested dataset and emit one results table.

    ``args.systems`` is a comma-separated list of system names; ``args.datasets`` is a
    comma-separated list of dataset names, or ``"all"`` for every registered dataset. A cell
    that fails is recorded as ``{system, dataset, error}`` and the matrix continues. The table
    is rendered as JSON or markdown and written to ``args.out`` when given, else printed.

    Returns the process exit code (always 0, because cells are fail-soft).
    """
    system_names = [part.strip() for part in args.systems.split(",") if part.strip()]
    if args.datasets == "all":
        dataset_names = datasets.registry.names()
    else:
        dataset_names = [part.strip() for part in args.datasets.split(",") if part.strip()]

    rows: list[dict] = []
    for system_name in system_names:
        for dataset_name in dataset_names:
            cell: dict = {"system": system_name, "dataset": dataset_name}
            try:  # fail-soft per cell — one bad cell never kills the matrix
                records = run(systems.get(system_name), datasets.registry.get(dataset_name),
                              split=args.split, limit=args.limit)
                cell["summary"] = score(records)
            except Exception as exc:  # noqa: BLE001
                # Some exceptions stringify to ""; fall back to the type name so the cell
                # always says what went wrong.
                cell["error"] = str(exc) or type(exc).__name__
            rows.append(cell)

    render = suite_json if args.json else suite_markdown
    out = render(rows, split=args.split, model=args.model)
    if args.out:
        pathlib.Path(args.out).write_text(out, encoding="utf-8")
        print(f"wrote {args.out} ({len(rows)} cells)")
    else:
        print(out)
    return 0
70
+
71
+
72
def build_parser() -> argparse.ArgumentParser:
    """Assemble the `june-bench` parser with its `list`, `run` and `suite` subcommands.

    Each subcommand stores its handler under ``func`` so the caller can dispatch with
    ``args.func(args)``. A subcommand is required.
    """
    parser = argparse.ArgumentParser(
        prog="june-bench",
        description="Reproducible benchmark suite (June + pluggable competitors)",
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    list_cmd = commands.add_parser("list", help="list registered systems + datasets")
    list_cmd.set_defaults(func=_cmd_list)

    run_cmd = commands.add_parser("run", help="run one system on one dataset split")
    run_cmd.add_argument("--system", default="echo")
    run_cmd.add_argument("--dataset", default="smoke")
    run_cmd.add_argument("--split", default="smoke")
    run_cmd.add_argument("--limit", type=int, default=None)
    run_cmd.add_argument("--model", default="", help="model id, recorded in the result header")
    run_cmd.add_argument("--json", action="store_true")
    run_cmd.set_defaults(func=_cmd_run)

    suite_cmd = commands.add_parser(
        "suite", help="run a systems × datasets matrix → a RESULTS.md table")
    suite_cmd.add_argument("--systems", default="echo", help="comma-separated system names")
    suite_cmd.add_argument("--datasets", default="smoke",
                           help="comma-separated names, or 'all'")
    suite_cmd.add_argument("--split", default="smoke")
    suite_cmd.add_argument("--limit", type=int, default=None)
    suite_cmd.add_argument("--model", default="")
    suite_cmd.add_argument("--out", default="",
                           help="write the table to this path (else stdout)")
    suite_cmd.add_argument("--json", action="store_true")
    suite_cmd.set_defaults(func=_cmd_suite)

    return parser
96
+
97
+
98
def main(argv=None) -> int:
    """Parse ``argv`` and run the selected subcommand, returning its exit code."""
    parsed = build_parser().parse_args(argv)
    handler = parsed.func
    return handler(parsed)
101
+
102
+
103
+ if __name__ == "__main__":
104
+ raise SystemExit(main())
@@ -0,0 +1,7 @@
1
+ """Datasets. SB0 ships the bundled `smoke` set; SB2 registers the four real benchmarks."""
2
+ from __future__ import annotations
3
+
4
+ from june_bench.datasets import registry
5
+ from june_bench.datasets.loaders import SmokeDataset
6
+
7
+ __all__ = ["registry", "SmokeDataset"]
@@ -0,0 +1,20 @@
1
+ [
2
+ {
3
+ "_id": "hp-smoke-1",
4
+ "question": "What nationality was the director of the 1994 film Ed Wood?",
5
+ "answer": "American",
6
+ "context": [
7
+ ["Ed Wood (film)", ["Ed Wood is a 1994 American biographical period comedy-drama film directed by Tim Burton.", " It stars Johnny Depp as the cult filmmaker Ed Wood."]],
8
+ ["Tim Burton", ["Timothy Walter Burton is an American filmmaker, born in Burbank, California."]]
9
+ ]
10
+ },
11
+ {
12
+ "_id": "hp-smoke-2",
13
+ "question": "In which city is the Eiffel Tower located?",
14
+ "answer": "Paris",
15
+ "context": [
16
+ ["Eiffel Tower", ["The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France."]],
17
+ ["Paris", ["Paris is the capital and most populous city of France."]]
18
+ ]
19
+ }
20
+ ]
@@ -0,0 +1,13 @@
1
+ {
2
+ "meta": {"source": "june_bench smoke fixture", "format": "june"},
3
+ "documents": [
4
+ {"id": "conv-smoke#0::session_1", "text": "Caroline mentioned she joined an LGBTQ support group on 7 May 2023."},
5
+ {"id": "conv-smoke#0::session_2", "text": "They talked about the weather and weekend plans."},
6
+ {"id": "conv-smoke#1::session_1", "text": "I graduated with a degree in Business Administration in 2018."},
7
+ {"id": "conv-smoke#1::session_2", "text": "Later we discussed my first job after college."}
8
+ ],
9
+ "queries": [
10
+ {"id": "conv-smoke#0", "query": "When did Caroline go to the LGBTQ support group?", "gold": ["conv-smoke#0::session_1"], "haystack": ["conv-smoke#0::session_1", "conv-smoke#0::session_2"], "question_type": "cat2", "answer": "7 May 2023"},
11
+ {"id": "conv-smoke#1", "query": "What degree did I graduate with?", "gold": ["conv-smoke#1::session_1"], "haystack": ["conv-smoke#1::session_1", "conv-smoke#1::session_2"], "question_type": "single-session-user", "answer": "Business Administration"}
12
+ ]
13
+ }
@@ -0,0 +1,4 @@
1
+ {"qid": "s1", "question": "Who wrote the first computer algorithm?", "golds": ["Ada Lovelace"], "context": ["Ada Lovelace is regarded as the first computer programmer; she wrote the first algorithm intended for a machine."]}
2
+ {"qid": "s2", "question": "What city is the Eiffel Tower in?", "golds": ["Paris"], "context": ["The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France."]}
3
+ {"qid": "s3", "question": "Which planet is known as the Red Planet?", "golds": ["Mars"], "context": ["Mars is often called the Red Planet because of its reddish appearance."]}
4
+ {"qid": "s4", "question": "Who painted the Mona Lisa?", "golds": ["Leonardo da Vinci", "da Vinci"], "context": ["The Mona Lisa is a portrait painted by the Italian artist Leonardo da Vinci."]}
@@ -0,0 +1,138 @@
1
+ """Dataset loaders (SB0 smoke + SB2 the four real benchmarks).
2
+
3
+ Two formats cover all six datasets, so each is one small adapter behind the `Dataset` port:
4
+
5
+ * **HotpotQA format** (`hotpot` / `2wiki` / `musique`) — `[{_id, question, answer, context:[[title,
6
+ [sents]]]}]`. Context-QA: the distractor passages ride in `Example.context`.
7
+ * **June format** (`locomo` / `longmemeval` / `financebench`) — `{documents:[{id,text}], queries:[{query,
8
+ answer, gold, haystack, question_type}]}` with **conversation-scoped** doc ids (`<conv>::<chunk>`).
9
+ Memory-QA: the conversation's documents ride in `Example.corpus` (a memory system ingests them, then
10
+ answers); `gold`/`question_type` ride in `meta`.
11
+
12
+ Reproducibility: the bundled few-KB fixtures (`fixtures/*.smoke.json`) make `--split smoke` run offline.
13
+ The full splits are read from the data dir (`JUNE_BENCH_DATA`, else the repo `data/` / `benchmarks/apex_qa/`
14
+ for in-repo dev); if absent, the loader raises with a `june-bench fetch` hint (the `fetch` command itself lands in a later milestone).
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import os
20
+ import pathlib
21
+ from collections.abc import Sequence
22
+
23
+ from june_bench.ports import Example
24
+
25
+ _HERE = pathlib.Path(__file__).resolve().parent
26
+ _FIXTURES = _HERE / "fixtures"
27
+ # In-repo dev locations (a standalone install relies on JUNE_BENCH_DATA / the fetch cache instead).
28
+ # loaders.py lives at <repo>/june_bench/june_bench/datasets/loaders.py → repo root is parents[2].
29
+ _REPO = _HERE.parents[2] if len(_HERE.parents) >= 3 else _HERE
30
+ _REPO_DATA = _REPO / "data"
31
+ _REPO_APEX = _REPO / "benchmarks" / "apex_qa"
32
+
33
+
34
def _read_json(path: pathlib.Path):
    """Parse the UTF-8 JSON file at ``path`` and return the decoded value."""
    with path.open(encoding="utf-8") as handle:
        return json.loads(handle.read())
36
+
37
+
38
def _resolve(filename: str) -> pathlib.Path | None:
    """Return the first existing ``<base>/<filename>``, or None when no base has it.

    Bases are tried in order: the ``JUNE_BENCH_DATA`` environment variable (skipped when
    unset or empty), then ``_REPO_DATA``, then ``_REPO_APEX``.
    """
    bases = (os.environ.get("JUNE_BENCH_DATA"), _REPO_DATA, _REPO_APEX)
    candidates = (pathlib.Path(base) / filename for base in bases if base)
    return next((candidate for candidate in candidates if candidate.exists()), None)
46
+
47
+
48
def _need(filename: str) -> pathlib.Path:
    """Resolve ``filename`` via `_resolve`, raising FileNotFoundError when it is not found."""
    found = _resolve(filename)
    if found is not None:
        return found
    raise FileNotFoundError(
        f"dataset file {filename!r} not found. Set JUNE_BENCH_DATA to the data dir, "
        f"or run `june-bench fetch` to download the full splits (the bundled `smoke` split "
        f"needs no data).")
56
+
57
+
58
+ # ── HotpotQA format (hotpot / 2wiki / musique) ──────────────────────────────────────────────
59
def _hotpot_context(ctx) -> tuple[str, ...]:
    """Flatten a HotpotQA ``context`` value into a tuple of non-empty passage strings.

    A ``[title, [sentences]]`` entry becomes its sentences joined with single spaces and
    stripped (the title is dropped); a plain string entry is kept as-is; any other entry is
    skipped. ``None`` or an empty context gives ``()``.
    """
    passages: list[str] = []
    for item in ctx or []:
        if isinstance(item, str):
            text = item
        elif (isinstance(item, (list, tuple)) and len(item) == 2
              and isinstance(item[1], (list, tuple))):
            text = " ".join(map(str, item[1])).strip()
        else:
            continue
        if text:
            passages.append(text)
    return tuple(passages)
68
+
69
+
70
def _hotpot_examples(raw: Sequence[dict]) -> list[Example]:
    """Convert HotpotQA-format rows into `Example`s.

    ``qid`` falls back to the row index when ``_id`` is missing; an empty or missing answer
    gives empty ``golds``; ``type`` and ``level`` are carried in ``meta``.
    """
    def _to_example(index: int, row: dict) -> Example:
        answer = row.get("answer", "")
        golds = (answer,) if answer else ()
        return Example(
            qid=str(row.get("_id", index)),
            question=str(row.get("question", "")),
            golds=golds,
            context=_hotpot_context(row.get("context")),
            meta={"question_type": row.get("type", ""), "level": row.get("level", "")},
        )

    return [_to_example(index, row) for index, row in enumerate(raw)]
79
+
80
+
81
class HotpotFormatDataset:
    """A HotpotQA-format dataset: a bundled fixture for ``smoke``, a full file otherwise."""

    def __init__(self, name: str, full_file: str, *, fixture: str = "hotpot.smoke.json") -> None:
        # name: registry key; full_file: file name looked up in the data dirs;
        # fixture: file name under the bundled fixtures directory.
        self.name, self._full, self._fixture = name, full_file, fixture

    def load(self, split: str = "smoke") -> Sequence[Example]:
        """Return the examples for ``split``; any split other than ``smoke`` reads the full file."""
        if split == "smoke":
            source = _FIXTURES / self._fixture
        else:
            source = _need(self._full)
        return _hotpot_examples(_read_json(source))
90
+
91
+
92
+ # ── June format (locomo / longmemeval / financebench) ───────────────────────────────────────
93
def _conv_key(doc_or_query_id: str) -> str:
    """Return the part of an id before its first ``::`` (the whole id when there is none)."""
    prefix, _sep, _rest = str(doc_or_query_id).partition("::")
    return prefix
95
+
96
+
97
def _june_examples(blob: dict, *, max_corpus: int = 200) -> list[Example]:
    """Convert a June-format blob (``documents`` + ``queries``) into `Example`s.

    Documents are grouped by the conversation prefix of their id. Each query receives the
    documents of its own conversation as ``corpus``, capped at the first ``max_corpus``. A
    missing or empty answer gives empty ``golds``; ``question_type`` and the gold document ids
    are carried in ``meta``.
    """
    by_conversation: dict[str, list[str]] = {}
    for doc in blob.get("documents", []):
        bucket = by_conversation.setdefault(_conv_key(doc.get("id", "")), [])
        bucket.append(str(doc.get("text", "")))

    examples: list[Example] = []
    for query in blob.get("queries", []):
        conversation = _conv_key(query.get("id", ""))
        corpus = tuple(by_conversation.get(conversation, [])[:max_corpus])
        answer = query.get("answer")
        golds = () if answer in (None, "") else (str(answer),)
        examples.append(Example(
            qid=str(query.get("id", "")),
            question=str(query.get("query", "")),
            golds=golds,
            corpus=corpus,
            meta={"question_type": query.get("question_type", ""),
                  "gold_ids": list(query.get("gold", []) or [])},
        ))
    return examples
113
+
114
+
115
class JuneFormatDataset:
    """A June-format dataset: a bundled fixture for ``smoke``, a full file otherwise."""

    def __init__(self, name: str, full_file: str, *, fixture: str = "june.smoke.json") -> None:
        # name: registry key; full_file: file name looked up in the data dirs;
        # fixture: file name under the bundled fixtures directory.
        self.name, self._full, self._fixture = name, full_file, fixture

    def load(self, split: str = "smoke") -> Sequence[Example]:
        """Return the examples for ``split``; any split other than ``smoke`` reads the full file."""
        if split == "smoke":
            source = _FIXTURES / self._fixture
        else:
            source = _need(self._full)
        return _june_examples(_read_json(source))
124
+
125
+
126
+ # ── the bundled smoke trivia set (SB0) ──────────────────────────────────────────────────────
127
class SmokeDataset:
    """The bundled trivia set, read from ``fixtures/smoke.jsonl`` (one JSON object per line)."""

    name = "smoke"

    def load(self, split: str = "smoke") -> Sequence[Example]:
        """Return every bundled example; ``split`` is accepted but not consulted."""
        text = (_FIXTURES / "smoke.jsonl").read_text(encoding="utf-8")
        rows = [json.loads(line) for line in text.splitlines() if line.strip()]
        examples = []
        for index, row in enumerate(rows):
            examples.append(Example(
                qid=str(row.get("qid", index)),
                question=str(row.get("question", "")),
                golds=tuple(row.get("golds", [])),
                context=tuple(row.get("context", [])),
            ))
        return examples
136
+
137
+
138
+ __all__ = ["SmokeDataset", "HotpotFormatDataset", "JuneFormatDataset"]
@@ -0,0 +1,40 @@
1
+ """Dataset registry (SB0) — name → Dataset. Adding a benchmark is one entry, never a runner edit."""
2
+ from __future__ import annotations
3
+
4
+ from june_bench.ports import Dataset
5
+ from june_bench.datasets.loaders import (
6
+ HotpotFormatDataset,
7
+ JuneFormatDataset,
8
+ SmokeDataset,
9
+ )
10
+
11
+ _DATASETS: dict[str, Dataset] = {}
12
+
13
+
14
def register(dataset: Dataset) -> None:
    """Store ``dataset`` in the registry under its ``name``, replacing any same-named entry.

    Raises ValueError when the dataset has no ``name`` attribute or the name is empty.
    """
    has_name = bool(getattr(dataset, "name", ""))
    if has_name:
        _DATASETS[dataset.name] = dataset
        return
    raise ValueError("dataset must have a non-empty name")
18
+
19
+
20
def get(name: str) -> Dataset:
    """Return the dataset registered as ``name``.

    Raises KeyError, listing the registered names, when ``name`` is unknown.
    """
    try:
        return _DATASETS[name]
    except KeyError:
        raise KeyError(f"unknown dataset {name!r}; registered: {names()}") from None
24
+
25
+
26
def names() -> list[str]:
    """Return the registered dataset names in sorted order."""
    registered = list(_DATASETS.keys())
    registered.sort()
    return registered
28
+
29
+
30
+ register(SmokeDataset())
31
+ # SB2 — the four benchmarks (six datasets, two formats). Full files resolve from the data dir
32
+ # (JUNE_BENCH_DATA / repo); the bundled `smoke` split needs no data.
33
+ register(HotpotFormatDataset("hotpot", "hotpot_dev.json"))
34
+ register(HotpotFormatDataset("2wiki", "2wiki_dev.json"))
35
+ register(HotpotFormatDataset("musique", "musique_dev.json"))
36
+ register(JuneFormatDataset("locomo", "locomo.june.json"))
37
+ register(JuneFormatDataset("longmemeval", "lme.june.json"))
38
+ register(JuneFormatDataset("financebench", "financebench.june.json"))
39
+
40
+ __all__ = ["register", "get", "names"]
@@ -0,0 +1,73 @@
1
+ """june_bench ports — the two typed contracts the whole suite extends through (SB0).
2
+
3
+ A benchmark is ``run(system, dataset) → records → score``. Only two things vary: the **System**
4
+ being benchmarked and the **Dataset** it runs on. Everything else (runner, scorer, reporter) is
5
+ fixed. So a new competitor or a new dataset is exactly one adapter behind one of these Protocols —
6
+ never an edit to the harness. Pure: stdlib only, so the core installs and unit-tests with no model,
7
+ no datasets, no network (the ``NullSystem``/``EchoSystem`` smoke).
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Sequence
12
+ from dataclasses import dataclass, field
13
+ from typing import Protocol, runtime_checkable
14
+
15
+
16
+ @dataclass(frozen=True, slots=True)
17
+ class Example:
18
+ """One benchmark item. ``context`` is in-prompt passages (context-QA like HotpotQA);
19
+ ``corpus`` is documents a *memory* system ingests before answering (LoCoMo/LongMemEval).
20
+ A system uses whichever its modality needs. ``meta`` carries dataset-specific fields
21
+ (category, session id, …) so per-category scoring needs no schema change."""
22
+ qid: str
23
+ question: str
24
+ golds: tuple[str, ...]
25
+ context: tuple[str, ...] = ()
26
+ corpus: tuple[str, ...] = ()
27
+ meta: dict = field(default_factory=dict)
28
+
29
+
30
+ @dataclass(frozen=True, slots=True)
31
+ class Prediction:
32
+ """A system's answer + optional provenance (``calls``/``cost``/``abstained`` in ``meta``)
33
+ so the cost axis is measured, not guessed."""
34
+ text: str
35
+ meta: dict = field(default_factory=dict)
36
+
37
+
38
+ @dataclass(frozen=True, slots=True)
39
+ class Record:
40
+ """One scored row: the prediction joined to its gold + provenance. The scorer reads only
41
+ these, so scoring is decoupled from how the answer was produced (any System, any Dataset)."""
42
+ qid: str
43
+ question: str
44
+ golds: tuple[str, ...]
45
+ prediction: str
46
+ context: tuple[str, ...] = ()
47
+ calls: int = 0
48
+ cost: float = 0.0
49
+ abstained: bool = False
50
+ meta: dict = field(default_factory=dict)
51
+
52
+
53
@runtime_checkable
class System(Protocol):
    """The contract for anything that can be benchmarked.

    Only ``answer`` is required. A memory system may additionally expose ``ingest(corpus)``,
    which the runner calls when it is present. ``answer`` is async so adapters that wait on
    HTTP or an LLM fit without blocking; purely synchronous logic simply returns. Model,
    HTTP and other heavy dependencies belong in the adapter, never in this contract.
    """
    # Name the system is known by.
    name: str

    async def answer(self, example: Example) -> Prediction: ...
62
+
63
+
64
@runtime_checkable
class Dataset(Protocol):
    """The contract for a benchmark's data.

    ``load(split)`` returns the examples of a named split (e.g. ``smoke``, ``dev100``,
    ``full``). Loaders are pure readers over the bundled fixtures or the fetched dumps.
    """
    # Name the dataset is registered under.
    name: str

    def load(self, split: str) -> Sequence[Example]: ...
71
+
72
+
73
+ __all__ = ["Example", "Prediction", "Record", "System", "Dataset"]
@@ -0,0 +1,37 @@
1
+ """Per-dataset scoring profiles (SB1) — which axes are the *headline* for each benchmark.
2
+
3
+ All benchmarks are scored on the same three axes (accuracy/coverage/cost, `score.py`), but they don't
4
+ all *report* the same headline: HotpotQA/2Wiki/MuSiQue/FinanceBench are EM/F1; the memory benchmarks
5
+ (LoCoMo, LongMemEval) pair accuracy with coverage / retrieval-recall. A profile is just the ordered
6
+ metric keys a reporter surfaces — adding a dataset's profile is one entry, never a scorer change."""
7
+ from __future__ import annotations
8
+
9
+ _PROFILES: dict[str, tuple[str, ...]] = {
10
+ "default": ("em", "f1", "coverage"),
11
+ "hotpot": ("em", "f1"),
12
+ "2wiki": ("em", "f1"),
13
+ "musique": ("em", "f1"),
14
+ "locomo": ("em", "f1", "coverage"),
15
+ "longmemeval": ("em", "f1", "context_recall"),
16
+ "financebench": ("em", "f1"),
17
+ "smoke": ("em", "f1"),
18
+ }
19
+
20
+
21
def headline_metrics(dataset: str) -> tuple[str, ...]:
    """Return the ordered headline metric keys for ``dataset``.

    A dataset without its own profile gets the ``default`` profile.
    """
    fallback = _PROFILES["default"]
    if dataset in _PROFILES:
        return _PROFILES[dataset]
    return fallback
24
+
25
+
26
def headline(summary: dict, dataset: str) -> dict:
    """Reduce a full summary to the dataset's headline metrics, keeping profile order.

    Headline keys that are absent from ``summary`` are left out.
    """
    projected: dict = {}
    for key in headline_metrics(dataset):
        if key in summary:
            projected[key] = summary[key]
    return projected
29
+
30
+
31
def register_profile(dataset: str, metrics: tuple[str, ...]) -> None:
    """Set (or replace) the headline profile for ``dataset``.

    Raises ValueError when ``metrics`` is empty.
    """
    if metrics:
        _PROFILES[dataset] = tuple(metrics)
        return
    raise ValueError("a profile needs at least one metric")
35
+
36
+
37
+ __all__ = ["headline_metrics", "headline", "register_profile"]
@@ -0,0 +1,60 @@
1
+ """Reporter (SB0) — a scored summary → a self-describing markdown row + JSON.
2
+
3
+ Every row records *system · dataset · split · model + the three axes*, so a published number is
4
+ reproducible and self-describing (the honesty discipline). `to_markdown`/`to_json` emit the single-run
5
+ row; `suite_markdown`/`suite_json` emit the full multi-arm `RESULTS.md` table.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+
11
+
12
def to_json(summary: dict, *, system: str, dataset: str, split: str, model: str = "") -> str:
    """Serialize one run as indented JSON: the identifying header followed by the summary.

    Header keys come first (``system``, ``dataset``, ``split``, ``model``); a summary key
    with the same name overrides the header value.
    """
    payload = {"system": system, "dataset": dataset, "split": split, "model": model}
    payload.update(summary)
    return json.dumps(payload, indent=2)
15
+
16
+
17
def to_markdown(summary: dict, *, system: str, dataset: str, split: str, model: str = "") -> str:
    """Render one run as a markdown heading plus a one-row metrics table.

    The heading names the system, dataset/split and, when given, the model. ``summary`` must
    contain every key listed in ``columns`` below; a missing key raises KeyError.
    """
    title = f"### {system} · {dataset}/{split}"
    if model:
        title += f" · {model}"
    columns = ("n", "answered", "em", "f1", "coverage", "context_recall",
               "calls_per_answer", "cost_per_answer")
    cells = " | ".join(f"{summary[key]}" for key in columns)
    return "".join([
        title, "\n\n",
        "| n | answered | EM | F1 | coverage | ctx-recall | calls/ans | cost/ans |\n",
        "|---|---|---|---|---|---|---|---|\n",
        "| ", cells, " |\n",
    ])
26
+
27
+
28
def suite_markdown(rows: list[dict], *, split: str = "", model: str = "") -> str:
    """Render a systems × datasets matrix as one `RESULTS.md`-ready markdown table.

    Each row is ``{system, dataset, summary}`` or ``{system, dataset, error}`` (fail-soft per
    cell). A summary row shows the dataset's headline metric keys plus every axis. An error
    row shows the first 60 characters of the error message in place of the metrics.

    Returns the heading, the table and a footnote as one newline-terminated string. Raises
    KeyError when a summary lacks one of the displayed metric keys.
    """
    from june_bench.profiles import headline_metrics

    def _error_cell(message) -> str:
        # Collapse newlines and whitespace runs, then escape pipes, so the message cannot
        # break out of its cell. Truncating first means an escape is never cut in half.
        flat = " ".join(str(message).split())
        return flat[:60].replace("|", "\\|")

    head = "# june-bench results"
    if split:
        head += f" · split `{split}`"
    if model:
        head += f" · model `{model}`"
    lines = [head, "",
             "| system | dataset | headline | n | EM | F1 | coverage | ctx-recall | calls/ans | cost/ans |",
             "|---|---|---|---|---|---|---|---|---|---|"]
    for r in rows:
        sysn, dsn = r.get("system", "?"), r.get("dataset", "?")
        if "error" in r:
            lines.append(f"| {sysn} | {dsn} | — | — | _error_ | {_error_cell(r['error'])} | | | | |")
            continue
        s = r["summary"]
        hl = "/".join(headline_metrics(dsn))
        lines.append(f"| {sysn} | {dsn} | {hl} | {s['n']} | {s['em']} | {s['f1']} | "
                     f"{s['coverage']} | {s['context_recall']} | {s['calls_per_answer']} | "
                     f"{s['cost_per_answer']} |")
    lines += ["", "_Same data, same scorer; every cell records the system, dataset, split and model. "
                  "Headline = the dataset's reported metric(s); all axes shown for honesty._"]
    return "\n".join(lines) + "\n"
54
+
55
+
56
def suite_json(rows: list[dict], *, split: str = "", model: str = "") -> str:
    """Serialize the suite matrix as indented JSON with ``split``, ``model`` and ``rows`` keys."""
    document = {"split": split, "model": model, "rows": rows}
    return json.dumps(document, indent=2)
58
+
59
+
60
+ __all__ = ["to_json", "to_markdown", "suite_markdown", "suite_json"]
@@ -0,0 +1,46 @@
1
+ """june_bench runner — the fixed orchestration: ask the System each Example → Records (SB0).
2
+
3
+ Pure: depends only on the two ports, so it's unit-tested with a fake System + fixture Dataset
4
+ (no model, no network). If a System exposes ``ingest`` (a memory system), the runner calls it with
5
+ each example's ``corpus`` before asking — so context-QA and memory systems share one path.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import inspect
11
+ from collections.abc import Sequence
12
+
13
+ from june_bench.ports import Dataset, Example, Record, System
14
+
15
+
16
async def _maybe_await(value):
    """Await ``value`` when it is awaitable; otherwise hand it back unchanged."""
    if inspect.isawaitable(value):
        return await value
    return value
18
+
19
+
20
async def _answer_one(system: System, ex: Example) -> Record:
    """Ask ``system`` one example and join its prediction to the example as a `Record`.

    When the system exposes ``ingest`` and the example has a non-empty ``corpus``, the corpus
    is ingested first (sync or async). ``calls``, ``cost`` and ``abstained`` are read from
    the prediction's ``meta``, defaulting to 0 / 0.0 / False.
    """
    ingest = getattr(system, "ingest", None)
    should_ingest = ingest is not None and bool(ex.corpus)
    if should_ingest:
        await _maybe_await(ingest(ex.corpus))

    prediction = await system.answer(ex)
    provenance = prediction.meta or {}
    return Record(
        qid=ex.qid,
        question=ex.question,
        golds=ex.golds,
        prediction=prediction.text,
        context=ex.context,
        calls=int(provenance.get("calls", 0)),
        cost=float(provenance.get("cost", 0.0)),
        abstained=bool(provenance.get("abstained", False)),
        meta=dict(provenance),
    )
30
+
31
+
32
async def run_async(system: System, examples: Sequence[Example]) -> list[Record]:
    """Answer the examples one at a time, in order, and return their records."""
    records: list[Record] = []
    for example in examples:
        records.append(await _answer_one(system, example))
    return records
34
+
35
+
36
def run(system: System, dataset: Dataset, *, split: str = "smoke",
        limit: int | None = None) -> list[Record]:
    """Load ``split`` from ``dataset``, ask ``system`` each example, and return the records.

    ``limit`` keeps only the leading examples (``None`` keeps all). This is the synchronous
    entry point: it drives the async path with ``asyncio.run`` so the CLI and tests call one
    function.
    """
    loaded = list(dataset.load(split))
    selected = loaded if limit is None else loaded[:limit]
    return asyncio.run(run_async(system, selected))
44
+
45
+
46
+ __all__ = ["run", "run_async"]