june-bench 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- june_bench-0.0.1/PKG-INFO +58 -0
- june_bench-0.0.1/README.md +28 -0
- june_bench-0.0.1/june_bench/__init__.py +19 -0
- june_bench-0.0.1/june_bench/cli.py +104 -0
- june_bench-0.0.1/june_bench/datasets/__init__.py +7 -0
- june_bench-0.0.1/june_bench/datasets/fixtures/hotpot.smoke.json +20 -0
- june_bench-0.0.1/june_bench/datasets/fixtures/june.smoke.json +13 -0
- june_bench-0.0.1/june_bench/datasets/fixtures/smoke.jsonl +4 -0
- june_bench-0.0.1/june_bench/datasets/loaders.py +138 -0
- june_bench-0.0.1/june_bench/datasets/registry.py +40 -0
- june_bench-0.0.1/june_bench/ports.py +73 -0
- june_bench-0.0.1/june_bench/profiles.py +37 -0
- june_bench-0.0.1/june_bench/report.py +60 -0
- june_bench-0.0.1/june_bench/runner.py +46 -0
- june_bench-0.0.1/june_bench/score.py +91 -0
- june_bench-0.0.1/june_bench/systems/__init__.py +49 -0
- june_bench-0.0.1/june_bench/systems/base.py +35 -0
- june_bench-0.0.1/june_bench/systems/cognee.py +100 -0
- june_bench-0.0.1/june_bench/systems/june_api.py +76 -0
- june_bench-0.0.1/june_bench/systems/june_local.py +30 -0
- june_bench-0.0.1/june_bench.egg-info/PKG-INFO +58 -0
- june_bench-0.0.1/june_bench.egg-info/SOURCES.txt +34 -0
- june_bench-0.0.1/june_bench.egg-info/dependency_links.txt +1 -0
- june_bench-0.0.1/june_bench.egg-info/entry_points.txt +2 -0
- june_bench-0.0.1/june_bench.egg-info/requires.txt +15 -0
- june_bench-0.0.1/june_bench.egg-info/top_level.txt +1 -0
- june_bench-0.0.1/pyproject.toml +51 -0
- june_bench-0.0.1/setup.cfg +4 -0
- june_bench-0.0.1/tests/test_sb0_smoke.py +63 -0
- june_bench-0.0.1/tests/test_sb1_parity.py +105 -0
- june_bench-0.0.1/tests/test_sb1_profiles.py +39 -0
- june_bench-0.0.1/tests/test_sb2_loaders.py +88 -0
- june_bench-0.0.1/tests/test_sb3_june_api.py +119 -0
- june_bench-0.0.1/tests/test_sb4_cognee.py +93 -0
- june_bench-0.0.1/tests/test_sb5_suite.py +57 -0
- june_bench-0.0.1/tests/test_sb6_wiring.py +58 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: june-bench
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Reproducible benchmark suite for memory/QA systems — June + pluggable competitors.
|
|
5
|
+
Author: Junemind
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Project-URL: Homepage, https://github.com/Junemind
|
|
8
|
+
Keywords: benchmark,rag,memory,qa,retrieval,evaluation,llm,june
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: License :: Other/Proprietary License
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Provides-Extra: june-api
|
|
21
|
+
Requires-Dist: httpx>=0.24; extra == "june-api"
|
|
22
|
+
Provides-Extra: june-local
|
|
23
|
+
Requires-Dist: june>=0.1.0; extra == "june-local"
|
|
24
|
+
Provides-Extra: cognee
|
|
25
|
+
Requires-Dist: cognee[evals]; extra == "cognee"
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Requires-Dist: httpx>=0.24; extra == "all"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# june-bench
|
|
32
|
+
|
|
33
|
+
A pip-installable, **reproducible** benchmark suite for memory / QA systems — **June + pluggable
|
|
34
|
+
competitors** — over LoCoMo, LongMemEval, HotpotQA/2Wiki/MuSiQue, and FinanceBench, with the same
|
|
35
|
+
data and the same scorer.
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install june-bench
|
|
39
|
+
june-bench list
|
|
40
|
+
june-bench run --system echo --dataset smoke --split smoke # offline, no key, no download
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
A benchmark is `run(system, dataset) → records → score`. Two typed ports are the only extension
|
|
44
|
+
points:
|
|
45
|
+
|
|
46
|
+
* **`System`** — the thing benchmarked. `JuneApiSystem` (default; a thin HTTP client to June's
|
|
47
|
+
`/v1/answer`, so **no June source is shipped**), `JuneLocalSystem` (`[june-local]` extra; a
|
|
48
|
+
source-protected compiled wheel), `CogneeSystem` (`[cognee]` extra), or any future system as one
|
|
49
|
+
adapter.
|
|
50
|
+
* **`Dataset`** — what it runs on. The four benchmarks behind a registry.
|
|
51
|
+
|
|
52
|
+
The scorer is the canonical SQuAD/HotpotQA EM/F1 + selective-accuracy/coverage/cost — Cognee-comparable.
|
|
53
|
+
Tiny **smoke fixtures ship in the wheel** (offline wiring proof); full splits are **fetched, sha-verified,
|
|
54
|
+
from a pinned release**. No score is ever baked into the package — every result row records
|
|
55
|
+
dataset + scorer + system + model + cost, so a published number is reproducible by a stranger.
|
|
56
|
+
|
|
57
|
+
Status: **SB0** (contracts + no-deps smoke + skeleton). Datasets, June/Cognee systems, and the full CLI
|
|
58
|
+
land in SB1–SB6.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# june-bench
|
|
2
|
+
|
|
3
|
+
A pip-installable, **reproducible** benchmark suite for memory / QA systems — **June + pluggable
|
|
4
|
+
competitors** — over LoCoMo, LongMemEval, HotpotQA/2Wiki/MuSiQue, and FinanceBench, with the same
|
|
5
|
+
data and the same scorer.
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install june-bench
|
|
9
|
+
june-bench list
|
|
10
|
+
june-bench run --system echo --dataset smoke --split smoke # offline, no key, no download
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
A benchmark is `run(system, dataset) → records → score`. Two typed ports are the only extension
|
|
14
|
+
points:
|
|
15
|
+
|
|
16
|
+
* **`System`** — the thing benchmarked. `JuneApiSystem` (default; a thin HTTP client to June's
|
|
17
|
+
`/v1/answer`, so **no June source is shipped**), `JuneLocalSystem` (`[june-local]` extra; a
|
|
18
|
+
source-protected compiled wheel), `CogneeSystem` (`[cognee]` extra), or any future system as one
|
|
19
|
+
adapter.
|
|
20
|
+
* **`Dataset`** — what it runs on. The four benchmarks behind a registry.
|
|
21
|
+
|
|
22
|
+
The scorer is the canonical SQuAD/HotpotQA EM/F1 + selective-accuracy/coverage/cost — Cognee-comparable.
|
|
23
|
+
Tiny **smoke fixtures ship in the wheel** (offline wiring proof); full splits are **fetched, sha-verified,
|
|
24
|
+
from a pinned release**. No score is ever baked into the package — every result row records
|
|
25
|
+
dataset + scorer + system + model + cost, so a published number is reproducible by a stranger.
|
|
26
|
+
|
|
27
|
+
Status: **SB0** (contracts + no-deps smoke + skeleton). Datasets, June/Cognee systems, and the full CLI
|
|
28
|
+
land in SB1–SB6.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""june_bench — a pip-installable, reproducible benchmark suite for memory/QA systems.
|
|
2
|
+
|
|
3
|
+
`run(system, dataset) → records → score`. Two typed ports (`System`, `Dataset`) are the only
|
|
4
|
+
extension points; June and pluggable competitors plug in behind `System`, the four benchmarks behind
|
|
5
|
+
`Dataset`. The harness core is pure (no model, no network) and unit-tests with the bundled `smoke`
|
|
6
|
+
fixtures; June is reached over its REST API by default (no June source shipped).
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from june_bench.ports import Dataset, Example, Prediction, Record, System
|
|
11
|
+
from june_bench.runner import run, run_async
|
|
12
|
+
from june_bench.score import score
|
|
13
|
+
|
|
14
|
+
__version__ = "0.0.1"
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"Example", "Prediction", "Record", "System", "Dataset",
|
|
18
|
+
"run", "run_async", "score", "__version__",
|
|
19
|
+
]
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""june-bench CLI — `list`, `run`, `suite`; the planned `score`/`report` sub-commands are not defined here.
|
|
2
|
+
|
|
3
|
+
june-bench list
|
|
4
|
+
june-bench run --system echo --dataset smoke --split smoke
|
|
5
|
+
june-bench run --system null --dataset smoke --json
|
|
6
|
+
|
|
7
|
+
The `echo`/`null` systems + the bundled `smoke` dataset need no deps, key, or network, so this runs
|
|
8
|
+
offline immediately after `pip install june-bench`.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import sys
|
|
14
|
+
|
|
15
|
+
import pathlib
|
|
16
|
+
|
|
17
|
+
from june_bench import datasets, systems
|
|
18
|
+
from june_bench.report import suite_json, suite_markdown, to_json, to_markdown
|
|
19
|
+
from june_bench.runner import run
|
|
20
|
+
from june_bench.score import score
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _cmd_list(_args) -> int:  # noqa: ANN001
    """Print the registered system and dataset names, one labelled line each.

    The parsed-arguments namespace is accepted only to match the sub-command
    handler signature; it is never read. Always returns exit status 0.
    """
    listings = (
        ("systems: ", systems.names),
        ("datasets:", datasets.registry.names),
    )
    for label, list_names in listings:
        print(label, ", ".join(list_names()))
    return 0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _cmd_run(args) -> int:  # noqa: ANN001
    """Run one system on one dataset split and print the scored summary.

    Reads ``system``, ``dataset``, ``split``, ``limit``, ``model`` and ``json``
    from the parsed arguments. The summary is printed as JSON when ``--json``
    is set, otherwise as a markdown row.

    Returns:
        0 on success; 2 (after a one-line ``error: ...`` message on stderr) when
        the system/dataset name is not registered or a data file is missing.
    """
    try:
        system = systems.get(args.system)
        dataset = datasets.registry.get(args.dataset)
    except KeyError as exc:
        # str(KeyError) is the repr of its argument (extra quotes); show the raw message.
        message = exc.args[0] if exc.args else exc
        print(f"error: {message}", file=sys.stderr)
        return 2

    try:
        records = run(system, dataset, split=args.split, limit=args.limit)
    except FileNotFoundError as exc:
        # A split whose data file is absent: surface the hint, not a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 2

    summary = score(records)
    render = to_json if args.json else to_markdown
    print(render(summary, system=args.system, dataset=args.dataset,
                 split=args.split, model=args.model))
    return 0
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _cmd_suite(args) -> int:  # noqa: ANN001
    """Run every requested system on every requested dataset and emit one table.

    ``--systems`` and ``--datasets`` are comma-separated lists (blank entries are
    ignored); ``--datasets all`` expands to every registered dataset. Each
    system/dataset cell becomes a row holding either a ``summary`` or an
    ``error``. The rendered table goes to ``--out`` when given, else to stdout.
    Always returns exit status 0.
    """
    system_names = [part.strip() for part in args.systems.split(",") if part.strip()]
    if args.datasets == "all":
        dataset_names = datasets.registry.names()
    else:
        dataset_names = [part.strip() for part in args.datasets.split(",") if part.strip()]

    cells: list[dict] = []
    for system_name in system_names:
        for dataset_name in dataset_names:
            cell = {"system": system_name, "dataset": dataset_name}
            try:  # fail-soft per cell — one bad cell never kills the matrix
                records = run(systems.get(system_name), datasets.registry.get(dataset_name),
                              split=args.split, limit=args.limit)
                cell["summary"] = score(records)
            except Exception as exc:  # noqa: BLE001
                cell["error"] = str(exc)
            cells.append(cell)

    render = suite_json if args.json else suite_markdown
    text = render(cells, split=args.split, model=args.model)

    if not args.out:
        print(text)
        return 0
    pathlib.Path(args.out).write_text(text, encoding="utf-8")
    print(f"wrote {args.out} ({len(cells)} cells)")
    return 0
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``june-bench`` parser with its ``list``, ``run`` and ``suite`` sub-commands.

    A sub-command is required. Each sub-parser stores its handler function under
    ``func`` so the caller can dispatch with ``args.func(args)``.
    """
    parser = argparse.ArgumentParser(
        prog="june-bench",
        description="Reproducible benchmark suite (June + pluggable competitors)",
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    list_cmd = commands.add_parser("list", help="list registered systems + datasets")
    list_cmd.set_defaults(func=_cmd_list)

    run_cmd = commands.add_parser("run", help="run one system on one dataset split")
    run_cmd.add_argument("--system", default="echo")
    run_cmd.add_argument("--dataset", default="smoke")
    run_cmd.add_argument("--split", default="smoke")
    run_cmd.add_argument("--limit", type=int, default=None)
    run_cmd.add_argument("--model", default="", help="model id, recorded in the result header")
    run_cmd.add_argument("--json", action="store_true")
    run_cmd.set_defaults(func=_cmd_run)

    suite_cmd = commands.add_parser(
        "suite", help="run a systems × datasets matrix → a RESULTS.md table")
    suite_cmd.add_argument("--systems", default="echo", help="comma-separated system names")
    suite_cmd.add_argument("--datasets", default="smoke", help="comma-separated names, or 'all'")
    suite_cmd.add_argument("--split", default="smoke")
    suite_cmd.add_argument("--limit", type=int, default=None)
    suite_cmd.add_argument("--model", default="")
    suite_cmd.add_argument("--out", default="", help="write the table to this path (else stdout)")
    suite_cmd.add_argument("--json", action="store_true")
    suite_cmd.set_defaults(func=_cmd_suite)

    return parser
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def main(argv=None) -> int:
    """Parse ``argv`` and run the selected sub-command, returning its exit status.

    ``argv`` is handed to ``parse_args`` unchanged (``None`` included).
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
if __name__ == "__main__":
|
|
104
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Datasets. SB0 ships the bundled `smoke` set; SB2 registers the four real benchmarks."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from june_bench.datasets import registry
|
|
5
|
+
from june_bench.datasets.loaders import SmokeDataset
|
|
6
|
+
|
|
7
|
+
__all__ = ["registry", "SmokeDataset"]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"_id": "hp-smoke-1",
|
|
4
|
+
"question": "What nationality was the director of the 1994 film Ed Wood?",
|
|
5
|
+
"answer": "American",
|
|
6
|
+
"context": [
|
|
7
|
+
["Ed Wood (film)", ["Ed Wood is a 1994 American biographical period comedy-drama film directed by Tim Burton.", " It stars Johnny Depp as the cult filmmaker Ed Wood."]],
|
|
8
|
+
["Tim Burton", ["Timothy Walter Burton is an American filmmaker, born in Burbank, California."]]
|
|
9
|
+
]
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"_id": "hp-smoke-2",
|
|
13
|
+
"question": "In which city is the Eiffel Tower located?",
|
|
14
|
+
"answer": "Paris",
|
|
15
|
+
"context": [
|
|
16
|
+
["Eiffel Tower", ["The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France."]],
|
|
17
|
+
["Paris", ["Paris is the capital and most populous city of France."]]
|
|
18
|
+
]
|
|
19
|
+
}
|
|
20
|
+
]
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
{
|
|
2
|
+
"meta": {"source": "june_bench smoke fixture", "format": "june"},
|
|
3
|
+
"documents": [
|
|
4
|
+
{"id": "conv-smoke#0::session_1", "text": "Caroline mentioned she joined an LGBTQ support group on 7 May 2023."},
|
|
5
|
+
{"id": "conv-smoke#0::session_2", "text": "They talked about the weather and weekend plans."},
|
|
6
|
+
{"id": "conv-smoke#1::session_1", "text": "I graduated with a degree in Business Administration in 2018."},
|
|
7
|
+
{"id": "conv-smoke#1::session_2", "text": "Later we discussed my first job after college."}
|
|
8
|
+
],
|
|
9
|
+
"queries": [
|
|
10
|
+
{"id": "conv-smoke#0", "query": "When did Caroline go to the LGBTQ support group?", "gold": ["conv-smoke#0::session_1"], "haystack": ["conv-smoke#0::session_1", "conv-smoke#0::session_2"], "question_type": "cat2", "answer": "7 May 2023"},
|
|
11
|
+
{"id": "conv-smoke#1", "query": "What degree did I graduate with?", "gold": ["conv-smoke#1::session_1"], "haystack": ["conv-smoke#1::session_1", "conv-smoke#1::session_2"], "question_type": "single-session-user", "answer": "Business Administration"}
|
|
12
|
+
]
|
|
13
|
+
}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
{"qid": "s1", "question": "Who wrote the first computer algorithm?", "golds": ["Ada Lovelace"], "context": ["Ada Lovelace is regarded as the first computer programmer; she wrote the first algorithm intended for a machine."]}
|
|
2
|
+
{"qid": "s2", "question": "What city is the Eiffel Tower in?", "golds": ["Paris"], "context": ["The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France."]}
|
|
3
|
+
{"qid": "s3", "question": "Which planet is known as the Red Planet?", "golds": ["Mars"], "context": ["Mars is often called the Red Planet because of its reddish appearance."]}
|
|
4
|
+
{"qid": "s4", "question": "Who painted the Mona Lisa?", "golds": ["Leonardo da Vinci", "da Vinci"], "context": ["The Mona Lisa is a portrait painted by the Italian artist Leonardo da Vinci."]}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""Dataset loaders (SB0 smoke + SB2 the four real benchmarks).
|
|
2
|
+
|
|
3
|
+
Two formats cover all six datasets, so each is one small adapter behind the `Dataset` port:
|
|
4
|
+
|
|
5
|
+
* **HotpotQA format** (`hotpot` / `2wiki` / `musique`) — `[{_id, question, answer, context:[[title,
|
|
6
|
+
[sents]]]}]`. Context-QA: the distractor passages ride in `Example.context`.
|
|
7
|
+
* **June format** (`locomo` / `longmemeval` / `financebench`) — `{documents:[{id,text}], queries:[{query,
|
|
8
|
+
answer, gold, haystack, question_type}]}` with **conversation-scoped** doc ids (`<conv>::<chunk>`).
|
|
9
|
+
Memory-QA: the conversation's documents ride in `Example.corpus` (a memory system ingests them, then
|
|
10
|
+
answers); `gold`/`question_type` ride in `meta`.
|
|
11
|
+
|
|
12
|
+
Reproducibility: the bundled few-KB fixtures (`fixtures/*.smoke.json`) make `--split smoke` run offline.
|
|
13
|
+
The full splits are read from the data dir (`JUNE_BENCH_DATA`, else the repo `data/` / `benchmarks/apex_qa/`
|
|
14
|
+
for in-repo dev); if absent, the loader raises with a `june-bench fetch` hint (the fetch lands SB-later).
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import pathlib
|
|
21
|
+
from collections.abc import Sequence
|
|
22
|
+
|
|
23
|
+
from june_bench.ports import Example
|
|
24
|
+
|
|
25
|
+
_HERE = pathlib.Path(__file__).resolve().parent
|
|
26
|
+
_FIXTURES = _HERE / "fixtures"
|
|
27
|
+
# In-repo dev locations (a standalone install relies on JUNE_BENCH_DATA / the fetch cache instead).
|
|
28
|
+
# loaders.py lives at <repo>/june_bench/june_bench/datasets/loaders.py → repo root is parents[2].
|
|
29
|
+
_REPO = _HERE.parents[2] if len(_HERE.parents) >= 3 else _HERE
|
|
30
|
+
_REPO_DATA = _REPO / "data"
|
|
31
|
+
_REPO_APEX = _REPO / "benchmarks" / "apex_qa"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _read_json(path: pathlib.Path):
|
|
35
|
+
return json.loads(path.read_text(encoding="utf-8"))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _resolve(filename: str) -> pathlib.Path | None:
    """Find ``filename`` in the known data directories; ``None`` when it is in none.

    Search order: the ``JUNE_BENCH_DATA`` environment variable (skipped when unset
    or empty), then ``_REPO_DATA``, then ``_REPO_APEX``. The first existing path wins.
    """
    search_roots = (os.environ.get("JUNE_BENCH_DATA"), _REPO_DATA, _REPO_APEX)
    candidates = (pathlib.Path(root) / filename for root in search_roots if root)
    return next((candidate for candidate in candidates if candidate.exists()), None)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _need(filename: str) -> pathlib.Path:
    """Return the resolved path of a required dataset file.

    Raises:
        FileNotFoundError: when ``_resolve`` cannot find the file; the message
            tells the user how to supply the data.
    """
    located = _resolve(filename)
    if located is not None:
        return located
    raise FileNotFoundError(
        f"dataset file {filename!r} not found. Set JUNE_BENCH_DATA to the data dir, "
        f"or run `june-bench fetch` to download the full splits (the bundled `smoke` split "
        f"needs no data).")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ── HotpotQA format (hotpot / 2wiki / musique) ──────────────────────────────────────────────
|
|
59
|
+
def _hotpot_context(ctx) -> tuple[str, ...]:
|
|
60
|
+
out: list[str] = []
|
|
61
|
+
for entry in ctx or []:
|
|
62
|
+
# entry is [title, [sentences]]
|
|
63
|
+
if isinstance(entry, (list, tuple)) and len(entry) == 2 and isinstance(entry[1], (list, tuple)):
|
|
64
|
+
out.append(" ".join(str(s) for s in entry[1]).strip())
|
|
65
|
+
elif isinstance(entry, str):
|
|
66
|
+
out.append(entry)
|
|
67
|
+
return tuple(p for p in out if p)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _hotpot_examples(raw: Sequence[dict]) -> list[Example]:
    """Convert HotpotQA-format rows into ``Example`` objects, preserving order.

    ``qid`` falls back to the row's position when ``_id`` is missing. A falsy
    ``answer`` gives an empty ``golds`` tuple. The row's ``type`` and ``level``
    are carried in ``meta`` as ``question_type`` and ``level``.
    """
    def _convert(index, row):
        answer = row.get("answer", "")
        return Example(
            qid=str(row.get("_id", index)),
            question=str(row.get("question", "")),
            golds=(answer,) if answer else (),
            context=_hotpot_context(row.get("context")),
            meta={"question_type": row.get("type", ""), "level": row.get("level", "")},
        )

    return [_convert(index, row) for index, row in enumerate(raw)]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class HotpotFormatDataset:
    """A named dataset stored as HotpotQA-format JSON.

    ``full_file`` is the file name resolved through ``_need`` for every split
    other than ``smoke``; ``fixture`` is the bundled file under ``_FIXTURES``
    that serves the ``smoke`` split.
    """

    def __init__(self, name: str, full_file: str, *, fixture: str = "hotpot.smoke.json") -> None:
        self.name = name
        self._fixture = fixture
        self._full = full_file

    def load(self, split: str = "smoke") -> Sequence[Example]:
        """Read the split's JSON file and convert its rows into ``Example`` objects."""
        if split == "smoke":
            source = _FIXTURES / self._fixture
        else:
            source = _need(self._full)
        return _hotpot_examples(_read_json(source))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ── June format (locomo / longmemeval / financebench) ───────────────────────────────────────
|
|
93
|
+
def _conv_key(doc_or_query_id: str) -> str:
|
|
94
|
+
return str(doc_or_query_id).split("::", 1)[0]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _june_examples(blob: dict, *, max_corpus: int = 200) -> list[Example]:
    """Convert a June-format blob (``documents`` + ``queries``) into ``Example`` objects.

    Document texts are grouped by the conversation key of their id (the text
    before ``::``). Each query receives, as ``corpus``, the first ``max_corpus``
    texts of the group matching its own id. ``golds`` holds the stringified
    ``answer`` unless it is ``None`` or empty. ``meta`` carries
    ``question_type`` and the ``gold`` ids (as ``gold_ids``).
    """
    texts_by_conversation: dict[str, list[str]] = {}
    for document in blob.get("documents", []):
        key = _conv_key(document.get("id", ""))
        if key not in texts_by_conversation:
            texts_by_conversation[key] = []
        texts_by_conversation[key].append(str(document.get("text", "")))

    def _convert(query):
        conversation = _conv_key(query.get("id", ""))
        texts = texts_by_conversation.get(conversation, [])
        answer = query.get("answer")
        return Example(
            qid=str(query.get("id", "")),
            question=str(query.get("query", "")),
            golds=() if answer in (None, "") else (str(answer),),
            corpus=tuple(texts[:max_corpus]),
            meta={"question_type": query.get("question_type", ""),
                  "gold_ids": list(query.get("gold", []) or [])},
        )

    return [_convert(query) for query in blob.get("queries", [])]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class JuneFormatDataset:
    """A named dataset stored as June-format JSON (``documents`` + ``queries``).

    ``full_file`` is the file name resolved through ``_need`` for every split
    other than ``smoke``; ``fixture`` is the bundled file under ``_FIXTURES``
    that serves the ``smoke`` split.
    """

    def __init__(self, name: str, full_file: str, *, fixture: str = "june.smoke.json") -> None:
        self.name = name
        self._fixture = fixture
        self._full = full_file

    def load(self, split: str = "smoke") -> Sequence[Example]:
        """Read the split's JSON file and convert its queries into ``Example`` objects."""
        if split == "smoke":
            source = _FIXTURES / self._fixture
        else:
            source = _need(self._full)
        return _june_examples(_read_json(source))
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# ── the bundled smoke trivia set (SB0) ──────────────────────────────────────────────────────
|
|
127
|
+
class SmokeDataset:
    """The bundled ``smoke.jsonl`` trivia set, exposed under the name ``smoke``."""

    name = "smoke"

    def load(self, split: str = "smoke") -> Sequence[Example]:
        """Parse every non-blank line of the fixture into an ``Example``.

        ``split`` is part of the signature but is not consulted: the same rows
        are returned for any value. ``qid`` falls back to the row's position
        among the non-blank lines when the row has no ``qid``.
        """
        fixture_text = (_FIXTURES / "smoke.jsonl").read_text(encoding="utf-8")
        rows = []
        for line in fixture_text.splitlines():
            if line.strip():
                rows.append(json.loads(line))

        examples = []
        for position, row in enumerate(rows):
            examples.append(Example(
                qid=str(row.get("qid", position)),
                question=str(row.get("question", "")),
                golds=tuple(row.get("golds", [])),
                context=tuple(row.get("context", [])),
            ))
        return examples
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
__all__ = ["SmokeDataset", "HotpotFormatDataset", "JuneFormatDataset"]
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Dataset registry (SB0) — name → Dataset. Adding a benchmark is one entry, never a runner edit."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from june_bench.ports import Dataset
|
|
5
|
+
from june_bench.datasets.loaders import (
|
|
6
|
+
HotpotFormatDataset,
|
|
7
|
+
JuneFormatDataset,
|
|
8
|
+
SmokeDataset,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
_DATASETS: dict[str, Dataset] = {}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def register(dataset: Dataset) -> None:
    """Add ``dataset`` to the registry under its ``name``, replacing any earlier entry.

    Raises:
        ValueError: when the dataset has no ``name`` attribute or the name is empty.
    """
    has_name = bool(getattr(dataset, "name", ""))
    if has_name:
        _DATASETS[dataset.name] = dataset
        return
    raise ValueError("dataset must have a non-empty name")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get(name: str) -> Dataset:
    """Return the dataset registered as ``name``.

    Raises:
        KeyError: when no dataset has that name; the message lists the registered names.
    """
    if name in _DATASETS:
        return _DATASETS[name]
    raise KeyError(f"unknown dataset {name!r}; registered: {names()}")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def names() -> list[str]:
    """Return the registered dataset names in sorted order."""
    ordered = list(_DATASETS)
    ordered.sort()
    return ordered
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
register(SmokeDataset())
|
|
31
|
+
# SB2 — the four benchmarks (six datasets, two formats). Full files resolve from the data dir
|
|
32
|
+
# (JUNE_BENCH_DATA / repo); the bundled `smoke` split needs no data.
|
|
33
|
+
register(HotpotFormatDataset("hotpot", "hotpot_dev.json"))
|
|
34
|
+
register(HotpotFormatDataset("2wiki", "2wiki_dev.json"))
|
|
35
|
+
register(HotpotFormatDataset("musique", "musique_dev.json"))
|
|
36
|
+
register(JuneFormatDataset("locomo", "locomo.june.json"))
|
|
37
|
+
register(JuneFormatDataset("longmemeval", "lme.june.json"))
|
|
38
|
+
register(JuneFormatDataset("financebench", "financebench.june.json"))
|
|
39
|
+
|
|
40
|
+
__all__ = ["register", "get", "names"]
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""june_bench ports — the two typed contracts the whole suite extends through (SB0).
|
|
2
|
+
|
|
3
|
+
A benchmark is ``run(system, dataset) → records → score``. Only two things vary: the **System**
|
|
4
|
+
being benchmarked and the **Dataset** it runs on. Everything else (runner, scorer, reporter) is
|
|
5
|
+
fixed. So a new competitor or a new dataset is exactly one adapter behind one of these Protocols —
|
|
6
|
+
never an edit to the harness. Pure: stdlib only, so the core installs and unit-tests with no model,
|
|
7
|
+
no datasets, no network (the ``NullSystem``/``EchoSystem`` smoke).
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Sequence
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Protocol, runtime_checkable
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True, slots=True)
|
|
17
|
+
class Example:
|
|
18
|
+
"""One benchmark item. ``context`` is in-prompt passages (context-QA like HotpotQA);
|
|
19
|
+
``corpus`` is documents a *memory* system ingests before answering (LoCoMo/LongMemEval).
|
|
20
|
+
A system uses whichever its modality needs. ``meta`` carries dataset-specific fields
|
|
21
|
+
(category, session id, …) so per-category scoring needs no schema change."""
|
|
22
|
+
qid: str
|
|
23
|
+
question: str
|
|
24
|
+
golds: tuple[str, ...]
|
|
25
|
+
context: tuple[str, ...] = ()
|
|
26
|
+
corpus: tuple[str, ...] = ()
|
|
27
|
+
meta: dict = field(default_factory=dict)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True, slots=True)
|
|
31
|
+
class Prediction:
|
|
32
|
+
"""A system's answer + optional provenance (``calls``/``cost``/``abstained`` in ``meta``)
|
|
33
|
+
so the cost axis is measured, not guessed."""
|
|
34
|
+
text: str
|
|
35
|
+
meta: dict = field(default_factory=dict)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True, slots=True)
|
|
39
|
+
class Record:
|
|
40
|
+
"""One scored row: the prediction joined to its gold + provenance. The scorer reads only
|
|
41
|
+
these, so scoring is decoupled from how the answer was produced (any System, any Dataset)."""
|
|
42
|
+
qid: str
|
|
43
|
+
question: str
|
|
44
|
+
golds: tuple[str, ...]
|
|
45
|
+
prediction: str
|
|
46
|
+
context: tuple[str, ...] = ()
|
|
47
|
+
calls: int = 0
|
|
48
|
+
cost: float = 0.0
|
|
49
|
+
abstained: bool = False
|
|
50
|
+
meta: dict = field(default_factory=dict)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@runtime_checkable
class System(Protocol):
    """The contract for anything being benchmarked.

    Only ``answer`` is required. A memory system may additionally expose
    ``ingest(corpus)``, which the runner calls when present. ``answer`` is
    async so real adapters (June over HTTP, an LLM) fit without blocking;
    purely synchronous logic simply returns. Model, HTTP and other heavy
    dependencies belong in the adapter, never in this contract.
    """

    name: str  # identifier of the system

    async def answer(self, example: Example) -> Prediction:
        ...
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@runtime_checkable
class Dataset(Protocol):
    """The contract for a benchmark's data.

    ``load(split)`` returns the examples of a named split (e.g. ``smoke``,
    ``dev100``, ``full``). Loaders are pure readers over the bundled fixtures
    or the fetched dumps.
    """

    name: str  # identifier of the dataset

    def load(self, split: str) -> Sequence[Example]:
        ...
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
__all__ = ["Example", "Prediction", "Record", "System", "Dataset"]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Per-dataset scoring profiles (SB1) — which axes are the *headline* for each benchmark.
|
|
2
|
+
|
|
3
|
+
All benchmarks are scored on the same three axes (accuracy/coverage/cost, `score.py`), but they don't
|
|
4
|
+
all *report* the same headline: HotpotQA/2Wiki/MuSiQue/FinanceBench are EM/F1; the memory benchmarks
|
|
5
|
+
(LoCoMo, LongMemEval) pair accuracy with coverage / retrieval-recall. A profile is just the ordered
|
|
6
|
+
metric keys a reporter surfaces — adding a dataset's profile is one entry, never a scorer change."""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
_PROFILES: dict[str, tuple[str, ...]] = {
|
|
10
|
+
"default": ("em", "f1", "coverage"),
|
|
11
|
+
"hotpot": ("em", "f1"),
|
|
12
|
+
"2wiki": ("em", "f1"),
|
|
13
|
+
"musique": ("em", "f1"),
|
|
14
|
+
"locomo": ("em", "f1", "coverage"),
|
|
15
|
+
"longmemeval": ("em", "f1", "context_recall"),
|
|
16
|
+
"financebench": ("em", "f1"),
|
|
17
|
+
"smoke": ("em", "f1"),
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def headline_metrics(dataset: str) -> tuple[str, ...]:
    """Return the ordered headline metric keys for ``dataset``.

    A dataset without a profile of its own gets the ``default`` profile.
    """
    fallback = _PROFILES["default"]
    return _PROFILES.get(dataset, fallback)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def headline(summary: dict, dataset: str) -> dict:
    """Reduce a full ``score()`` summary to the dataset's headline metrics.

    Keys follow the profile's order; a headline key missing from ``summary``
    is skipped.
    """
    projected = {}
    for key in headline_metrics(dataset):
        if key in summary:
            projected[key] = summary[key]
    return projected
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def register_profile(dataset: str, metrics: tuple[str, ...]) -> None:
    """Set (or replace) the headline profile for ``dataset``.

    Raises:
        ValueError: when ``metrics`` is empty.
    """
    if metrics:
        _PROFILES[dataset] = tuple(metrics)
        return
    raise ValueError("a profile needs at least one metric")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
__all__ = ["headline_metrics", "headline", "register_profile"]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Reporter (SB0) — a scored summary → a self-describing markdown row + JSON.
|
|
2
|
+
|
|
3
|
+
Every row records *system · dataset · split · model + the three axes*, so a published number is
|
|
4
|
+
reproducible and self-describing (the honesty discipline). SB5 extends this to a full multi-arm
|
|
5
|
+
`RESULTS.md` table; SB0 ships the single-run row.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def to_json(summary: dict, *, system: str, dataset: str, split: str, model: str = "") -> str:
    """Serialise one run as indented JSON: the identifying header, then the summary.

    Header keys (``system``, ``dataset``, ``split``, ``model``) come first; a
    summary key with the same name overrides the header value.
    """
    payload = {"system": system, "dataset": dataset, "split": split, "model": model}
    payload.update(summary)
    return json.dumps(payload, indent=2)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def to_markdown(summary: dict, *, system: str, dataset: str, split: str, model: str = "") -> str:
    """Render one run as a markdown heading followed by a one-row table.

    The heading names system, dataset/split and, when non-empty, the model.
    The row shows ``n``, ``answered``, ``em``, ``f1``, ``coverage``,
    ``context_recall``, ``calls_per_answer`` and ``cost_per_answer`` from
    ``summary``.

    Raises:
        KeyError: when ``summary`` lacks one of those keys.
    """
    title = f"### {system} · {dataset}/{split}"
    if model:
        title += f" · {model}"
    columns = ("n", "answered", "em", "f1", "coverage",
               "context_recall", "calls_per_answer", "cost_per_answer")
    values = " | ".join(f"{summary[key]}" for key in columns)
    table = (
        "| n | answered | EM | F1 | coverage | ctx-recall | calls/ans | cost/ans |",
        "|---|---|---|---|---|---|---|---|",
        "| " + values + " |",
    )
    return "\n".join((title, "") + table) + "\n"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def suite_markdown(rows: list[dict], *, split: str = "", model: str = "") -> str:
    """Render a systems × datasets matrix as one `RESULTS.md`-ready markdown table.

    Each row is ``{system, dataset, summary}`` or ``{system, dataset, error}``
    (fail-soft per cell). A summary row lists the dataset's *headline* metric
    names (from its profile) and then every axis, for honesty. An error row
    shows the first 60 characters of the error, flattened to a single line
    with ``|`` escaped so the message cannot break the table.

    Args:
        rows: the matrix cells, in output order.
        split: split name for the title; omitted when empty.
        model: model id for the title; omitted when empty.

    Returns:
        The markdown document, ending with a newline.

    Raises:
        KeyError: when a summary lacks one of the displayed metrics.
    """
    from june_bench.profiles import headline_metrics

    def _cell(text: object, limit: int = 60) -> str:
        # Collapse newlines/whitespace, truncate, then escape pipes so the
        # text stays inside one table cell.
        flat = " ".join(str(text).split())[:limit]
        return flat.replace("|", "\\|")

    head = "# june-bench results"
    if split:
        head += f" · split `{split}`"
    if model:
        head += f" · model `{model}`"
    lines = [head, "",
             "| system | dataset | headline | n | EM | F1 | coverage | ctx-recall | calls/ans | cost/ans |",
             "|---|---|---|---|---|---|---|---|---|---|"]
    for r in rows:
        sysn, dsn = r.get("system", "?"), r.get("dataset", "?")
        if "error" in r:
            lines.append(f"| {sysn} | {dsn} | — | — | _error_ | {_cell(r['error'])} | | | | |")
            continue
        s = r["summary"]
        hl = "/".join(headline_metrics(dsn))
        lines.append(f"| {sysn} | {dsn} | {hl} | {s['n']} | {s['em']} | {s['f1']} | "
                     f"{s['coverage']} | {s['context_recall']} | {s['calls_per_answer']} | "
                     f"{s['cost_per_answer']} |")
    lines += ["", "_Same data, same scorer; every cell records the system, dataset, split and model. "
                  "Headline = the dataset's reported metric(s); all axes shown for honesty._"]
    return "\n".join(lines) + "\n"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def suite_json(rows: list[dict], *, split: str = "", model: str = "") -> str:
    """Serialise the suite matrix as indented JSON with ``split``, ``model`` and ``rows`` keys."""
    payload = dict(split=split, model=model, rows=rows)
    return json.dumps(payload, indent=2)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
__all__ = ["to_json", "to_markdown", "suite_markdown", "suite_json"]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""june_bench runner — the fixed orchestration: ask the System each Example → Records (SB0).
|
|
2
|
+
|
|
3
|
+
Pure: depends only on the two ports, so it's unit-tested with a fake System + fixture Dataset
|
|
4
|
+
(no model, no network). If a System exposes ``ingest`` (a memory system), the runner calls it with
|
|
5
|
+
each example's ``corpus`` before asking — so context-QA and memory systems share one path.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import inspect
|
|
11
|
+
from collections.abc import Sequence
|
|
12
|
+
|
|
13
|
+
from june_bench.ports import Dataset, Example, Record, System
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
async def _maybe_await(value):
|
|
17
|
+
return await value if inspect.isawaitable(value) else value
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
async def _answer_one(system: System, ex: Example) -> Record:
    """Ask ``system`` one example and join its prediction to the gold answers.

    When the system exposes ``ingest`` and the example has a non-empty
    ``corpus``, the corpus is ingested first (sync or async ``ingest`` both
    work). ``calls``, ``cost`` and ``abstained`` are read from the
    prediction's ``meta``; a missing or ``None`` value counts as 0 / 0.0 /
    False.

    ``Record.meta`` carries the example's dataset fields (e.g.
    ``question_type``) merged with the prediction's ``meta``, so a scorer that
    reads only Records can still group by category. On a key collision the
    system-reported value wins.
    """
    ingest = getattr(system, "ingest", None)
    if ingest is not None and ex.corpus:
        await _maybe_await(ingest(ex.corpus))

    pred = await system.answer(ex)
    provenance = pred.meta or {}
    return Record(
        qid=ex.qid,
        question=ex.question,
        golds=ex.golds,
        prediction=pred.text,
        context=ex.context,
        # `or` guards against an adapter reporting None for a numeric field.
        calls=int(provenance.get("calls") or 0),
        cost=float(provenance.get("cost") or 0.0),
        abstained=bool(provenance.get("abstained", False)),
        # Dataset fields first so a system-reported key wins on collision.
        meta={**(ex.meta or {}), **provenance},
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
async def run_async(system: System, examples: Sequence[Example]) -> list[Record]:
    """Answer the examples one at a time, in order, and return their Records."""
    records = []
    for example in examples:
        records.append(await _answer_one(system, example))
    return records
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def run(system: System, dataset: Dataset, *, split: str = "smoke",
        limit: int | None = None) -> list[Record]:
    """Load ``split`` from ``dataset``, ask ``system`` each example, return the Records.

    This is the synchronous entry point: it drives the async path with
    ``asyncio.run`` so the CLI and tests call a single function. When
    ``limit`` is not ``None`` the loaded examples are sliced with
    ``[:limit]`` before being run.
    """
    loaded = list(dataset.load(split))
    selected = loaded if limit is None else loaded[:limit]
    return asyncio.run(run_async(system, selected))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
__all__ = ["run", "run_async"]
|