fieldkit 0.4.0__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fieldkit-0.4.0 → fieldkit-0.4.2}/CHANGELOG.md +43 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/PKG-INFO +1 -1
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/cli.md +2 -2
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/eval.md +7 -2
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/publish.md +6 -1
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/_version.py +1 -1
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/eval/vertical.py +73 -3
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/publish/__init__.py +27 -1
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_publish.py +49 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_vertical_bench.py +108 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/.gitignore +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/LICENSE +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/README.md +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/capabilities.md +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/lineage.md +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/nim.md +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/quant.md +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/rag.md +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/docs/api/training.md +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/pyproject.toml +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/samples/bench-rag.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/samples/feasibility-math.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/samples/hello-lineage.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/samples/hello-nim.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/samples/naive-rag.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/capabilities/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/capabilities/data/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/capabilities/data/spark-capabilities.json +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/cli/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/eval/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/lineage/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/nim/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/quant/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/rag/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/src/fieldkit/training/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/__init__.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/conftest.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_capabilities.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_cli.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_eval.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_lineage.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_nim.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_nim_spark.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_quant.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_rag.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_rag_spark.py +0 -0
- {fieldkit-0.4.0 → fieldkit-0.4.2}/tests/test_training.py +0 -0
|
@@ -6,6 +6,49 @@ The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.4.2] — 2026-05-15
|
|
10
|
+
|
|
11
|
+
Patch release. Two card-rendering polish lifts on `fieldkit.publish` driven by the 2026-05-15 cyber-vertical cycle (`Orionfold/SecurityLLM-GGUF`, the third vertical card on this surface — zero fieldkit source changes between Saul / cyber, the v0.4.1 publishing surface generalized exactly as designed). Both lifts are additive (one new `ModelCard` field already shipped on `main` in `ff1b92f`; one new `ArtifactManifest` field added here). No new modules, no new public classes, no breaking changes — purely a tightening pass.
|
|
12
|
+
|
|
13
|
+
### Added — `fieldkit.publish` card-rendering polish
|
|
14
|
+
|
|
15
|
+
- **`ModelCard.llama_cpp_example_prompt: Optional[str]`** — new field. Threads through `publish_quant(..., llama_cpp_example_prompt=...)` and from a duck-typed report's `.llama_cpp_example_prompt` attribute. The default `## How to run` body's `llama-cpp-python` snippet now uses this string for the user-message; when omitted it falls back to a neutral `"Summarize the key idea in one paragraph."` placeholder instead of the previously-hardcoded `"Explain working capital."` (which leaked into the legal + cyber vertical cards on first push). Multi-line MCQ-shaped prompts are JSON-escaped (`\n`) so the snippet stays single-line + valid Python — caller passes the raw prompt, the renderer handles escaping.
|
|
16
|
+
- **Side fix:** the previous renderer rendered the hardcoded finance prompt on every vertical card; the cyber + legal cards on HF were patched out-of-band on 2026-05-15 (commits `365dfe2`, `0824439`). Going forward, every `publish_quant` call should pass `llama_cpp_example_prompt=...` matching the article's "Using this release" section, per `[[feedback_customer_link_audit]]`.
|
|
17
|
+
- **`ArtifactManifest.recommended_variant: Optional[str]`** — new field. Was already on `ModelCard` (so the README's How-to-run snippets template against the article's pick) but did NOT flow into the `<slug>.yaml` manifest, so the destination catalog couldn't see the article's narrative choice and ran its own rank-avg picker instead. `publish_quant` now threads `recommended_variant` into both surfaces — the HF README badge and the destination "Sweet spot" badge stay in sync from one kwarg. Mac added the matching `recommended_variant: z.string().optional()` to its artifacts schema in PR #6 (`mac-sweep/2026-05-15-cyber-vertical`) and pinned cyber's catalog `Q4_K_M` manually; source `src/content.config.ts` now mirrors that field for forward-compat. Motivated by cyber-vertical (2026-05-15): `Q4_K_M` topped CyberMetric at 40% but its worst-in-class perplexity dragged its rank-avg down, so without the override the picker selected `Q5_K_M`.
|
|
18
|
+
|
|
19
|
+
### Test suite
|
|
20
|
+
|
|
21
|
+
**+3 new tests:** `test_artifact_manifest_carries_recommended_variant_when_set` + `test_artifact_manifest_omits_recommended_variant_when_unset` (round-trip + elision on the new manifest field) and `test_publish_quant_threads_recommended_variant_into_card_and_manifest` (kwarg threads to both surfaces via `publish_quant`). Total: **378 passed, 3 skipped** offline (`pytest -q`). The 3 skips are the two `--spark`-gated live-integration tests + the `torch`-import skip in `test_training.py` (CPU-only venv).
|
|
22
|
+
|
|
23
|
+
### Articles in this release
|
|
24
|
+
|
|
25
|
+
- [`becoming-a-cyber-curator-on-spark`](https://ainative.business/field-notes/becoming-a-cyber-curator-on-spark/) — third Orionfold quant card. Drives both lifts: surfaces the `llama_cpp_example_prompt` leak (cyber's MCQ prompt would have shipped as "Explain working capital." otherwise) and motivates `ArtifactManifest.recommended_variant` (the destination's rank-avg picker would have surfaced `Q5_K_M` instead of `Q4_K_M`).
|
|
26
|
+
|
|
27
|
+
### Verified on Spark
|
|
28
|
+
|
|
29
|
+
- **Live HF push:** `Orionfold/SecurityLLM-GGUF` (5 GGUF variants + README, ~26 GB) shipped 2026-05-15 via the same `publish_quant(dry_run=False)` path as Saul and finance-chat. Zero source changes in `fieldkit.publish` between Saul (v0.4.1) and cyber (the cycle that drove this v0.4.2 patch) — the surface generalized as designed across three verticals.
|
|
30
|
+
|
|
31
|
+
## [0.4.1] — 2026-05-14
|
|
32
|
+
|
|
33
|
+
Patch release. The `fieldkit.eval.VerticalBench` overlay introduced in v0.4.0 needed two kwargs to score FinanceBench correctly (open-book context-prepend) and to bound a JSONL slice (subset filter on `question_type`). Both lifts came out of the 2026-05-13 V1 attempt on `AdaptLLM/finance-chat` (0/50 closed-book vs. 14–18%/50 open-book on the same JSONL) and the 2026-05-14 legal-curator scoring run on `Equall/Saul-7B-Instruct-v1`. The two scripts under `scripts/g3_*` that carried duplicated loaders now call into the package surface. No new modules, no new public classes — additive kwargs only.
|
|
34
|
+
|
|
35
|
+
### Added — `fieldkit.eval.VerticalBench` open-book mode
|
|
36
|
+
|
|
37
|
+
- **`VerticalBench.from_jsonl(..., open_book=...)`** — new kwarg. When `True`, FinanceBench rows have their `evidence[*].evidence_text` prepended to the question (templated as "Context from <doc>: …\n\nQuestion: …\n\nAnswer with just the numeric value.") so the model sees the 10-K excerpt the gold answer was derived from. Default `None` auto-resolves to `True` for `financebench` and `False` for `legalbench` / `generic` — the right defaults per benchmark convention. Lifts inline `_load_finbench_open_book` helpers from `scripts/g3_preflight_bench.py` and `scripts/g3_measure_variants.py` into the package surface; both scripts now call `VerticalBench.from_jsonl(open_book=True, subset=…)` instead of carrying duplicated loaders. The 2026-05-13 V1 attempt on AdaptLLM/finance-chat scored 0/50 closed-book and 14–18%/50 open-book on the same JSONL — open-book is the load-bearing flag for FinanceBench scoring.
|
|
38
|
+
- **`VerticalBench.from_jsonl(..., subset=...)`** — new kwarg. FinanceBench-only convenience filter on the `question_type` column. Drops non-matching rows before the loader hits the `limit` cap, so callers can score the `metrics-generated` subset with `limit=50` and get 50 metrics-generated questions (not 50 mixed rows of which N are metrics-generated).
|
|
39
|
+
|
|
40
|
+
### Test suite
|
|
41
|
+
|
|
42
|
+
**+8 new tests** on `TestOpenBook` in `tests/test_vertical_bench.py` covering: auto-default for financebench, explicit `False` keeps closed-book, missing-evidence falls back to closed-book, legalbench / generic are no-ops, list-of-strings evidence shape, subset filter, subset × limit composition. Total: **375 passed, 3 skipped** offline (`pytest -q`). The 3 skips are the two `--spark`-gated live-integration tests + the `torch`-import skip in `test_training.py` (CPU-only venv).
|
|
43
|
+
|
|
44
|
+
### Articles in this release
|
|
45
|
+
|
|
46
|
+
- [`becoming-a-legal-curator-on-spark`](https://ainative.business/field-notes/becoming-a-legal-curator-on-spark/) — second Orionfold quant card, swaps FinanceBench for a curated 5-task LegalBench subset. Drives the `subset` kwarg's first non-finance use (LegalBench tasks via `legalbench` format) and validates that the `open_book` default-off branch is correct for LegalBench JSONLs.
|
|
47
|
+
|
|
48
|
+
### Verified on Spark
|
|
49
|
+
|
|
50
|
+
- **Live HF push:** `Orionfold/Saul-7B-Instruct-v1-GGUF` (5 GGUF variants + README, ~37 GB) shipped 2026-05-14 via the same `publish_quant(dry_run=False)` path the finance-chat card used a week earlier. Zero source changes in `fieldkit.publish` between the two pushes — the v0.4.0 surface generalized as designed.
|
|
51
|
+
|
|
9
52
|
## [0.4.0] — 2026-05-14
|
|
10
53
|
|
|
11
54
|
Fourth public release. Two new top-level modules (`fieldkit.publish` + `fieldkit.quant`) for the G3 GGUF / Quantization Publisher pick (MTBM Pick #1 per `ideas/mtbm-use-cases.md` §6), the v0.4.x **vertical-curator overlay** on `fieldkit.eval` (`VerticalBench`), and post-dry-run card-rendering fixes that landed the first live HF push (`Orionfold/finance-chat-GGUF`). The two new modules together unlock most of Cluster G; this cut implements the GGUF critical path and stubs the other quant formats with named entry points pointing at the v0.5+ roadmap.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fieldkit
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Verified-on-Spark patterns lifted from the ai-field-notes blog into one importable Python package.
|
|
5
5
|
Project-URL: Homepage, https://ainative.business/fieldkit/
|
|
6
6
|
Project-URL: Source, https://github.com/manavsehgal/ai-field-notes/tree/main/fieldkit
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
module: cli
|
|
3
3
|
title: fieldkit (CLI)
|
|
4
4
|
summary: A thin Typer wrapper over the modules. Quick checks and smoke benchmarks without writing Python.
|
|
5
|
-
order:
|
|
5
|
+
order: 9
|
|
6
6
|
---
|
|
7
7
|
|
|
8
8
|
## What it is
|
|
@@ -17,7 +17,7 @@ Print the installed package version.
|
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
19
|
$ fieldkit version
|
|
20
|
-
0.
|
|
20
|
+
0.4.0
|
|
21
21
|
```
|
|
22
22
|
|
|
23
23
|
### `fieldkit envelope <size>`
|
|
@@ -11,7 +11,7 @@ The eval harnesses the project keeps reinventing: a per-call latency benchmarker
|
|
|
11
11
|
|
|
12
12
|
**v0.4.x additions** (vertical-curator surface for the G3 GGUF publisher pipeline):
|
|
13
13
|
|
|
14
|
-
- `VerticalBench` — Spark-overlay scorer for FinanceBench / LegalBench / SemEval-style JSONL test sets. Wraps `Bench`, so latency aggregates alongside accuracy and refusal. Network access lives in the caller (`llama-cli`, NIM, vLLM) — the bench itself is offline-only and unit-testable.
|
|
14
|
+
- `VerticalBench` — Spark-overlay scorer for FinanceBench / LegalBench / SemEval-style JSONL test sets. Wraps `Bench`, so latency aggregates alongside accuracy and refusal. Network access lives in the caller (`llama-cli`, NIM, vLLM) — the bench itself is offline-only and unit-testable. **v0.4.1 lift:** `from_jsonl(..., open_book=…, subset=…)` — open-book mode prepends FinanceBench evidence text to the question (default-on for `financebench`, default-off elsewhere); `subset` filters FinanceBench rows by `question_type` before the `limit` cap.
|
|
15
15
|
- `VerticalQA` — one test case (qid + question + expected + tags) lifted from a vertical-eval JSONL.
|
|
16
16
|
- `exact_match` / `contains` / `numeric_match` — the three built-in scorers. `numeric_match` is the FinanceBench default (first-number ±1% rel-tol); `exact_match` is the LegalBench default; `contains` is the right pick when the model answers in prose around a key fact.
|
|
17
17
|
|
|
@@ -241,6 +241,8 @@ vb = VerticalBench.from_jsonl(
|
|
|
241
241
|
"financebench.jsonl",
|
|
242
242
|
scorer=numeric_match, # FinanceBench → first-number ±1%
|
|
243
243
|
limit=50,
|
|
244
|
+
subset="metrics-generated", # v0.4.1 — filter question_type before limit
|
|
245
|
+
open_book=True, # v0.4.1 — prepend evidence_text to the question
|
|
244
246
|
)
|
|
245
247
|
|
|
246
248
|
def model_fn(prompt: str) -> str:
|
|
@@ -250,7 +252,10 @@ bench = vb.run(model_fn, extra_tags={"variant": "Q4_K_M"})
|
|
|
250
252
|
print(bench.report()) # accuracy + refusal_rate + latency
|
|
251
253
|
```
|
|
252
254
|
|
|
253
|
-
`VerticalBench.from_jsonl(path, *, format="auto", limit=None, scorer=None, scorer_kwargs=None)` auto-sniffs FinanceBench / LegalBench / generic schemas from the first JSON row. Rows missing the question or expected field are silently dropped (the row-count delta vs the JSONL is the diagnostic). The default scorer is `numeric_match` for FinanceBench and `exact_match` everywhere else; pass `scorer=` to override.
|
|
255
|
+
`VerticalBench.from_jsonl(path, *, format="auto", limit=None, scorer=None, scorer_kwargs=None, open_book=None, subset=None)` auto-sniffs FinanceBench / LegalBench / generic schemas from the first JSON row. Rows missing the question or expected field are silently dropped (the row-count delta vs the JSONL is the diagnostic). The default scorer is `numeric_match` for FinanceBench and `exact_match` everywhere else; pass `scorer=` to override.
|
|
256
|
+
|
|
257
|
+
- **`open_book=` *(v0.4.1)*** — when `True`, FinanceBench rows have their `evidence[*].evidence_text` prepended to the question (templated as `Context from <doc>: …\n\nQuestion: …\n\nAnswer with just the numeric value.`) so the model sees the 10-K excerpt the gold answer was derived from. Default `None` auto-resolves to `True` for `financebench` and `False` for `legalbench` / `generic` — the right defaults per benchmark convention. The 2026-05-13 V1 attempt on `AdaptLLM/finance-chat` scored 0/50 closed-book and 14–18%/50 open-book on the same JSONL; open-book is the load-bearing flag for FinanceBench scoring. Lifted from inline helpers in `scripts/g3_preflight_bench.py` and `scripts/g3_measure_variants.py` into the package surface.
|
|
258
|
+
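- **`subset=` *(v0.4.1)*** — FinanceBench-only convenience filter on the `question_type` column. Drops non-matching rows *before* the loader hits the `limit` cap, so callers can score the `metrics-generated` subset with `limit=50` and get 50 metrics-generated questions (not 50 mixed rows of which N happen to be metrics-generated). Note that the loader compares every row's `question_type` against `subset` regardless of the detected format, so on `legalbench` / `generic` JSONLs any row without a matching `question_type` field is dropped as well — leave `subset=None` for those formats.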
|
|
254
259
|
|
|
255
260
|
`VerticalBench.run(model_fn, *, limit=None, on_error="record", extra_tags=None)` returns the underlying `Bench` so callers route through the existing `.summary()` / `.report()` / `.dump()` pipeline. Each `BenchCall` carries `accuracy` (0.0/1.0 from the scorer) and `refusal` (0.0/1.0 from `is_refusal`) metrics; per-row metadata (company, doc_period, question_type) flows through to `BenchCall.tags` for downstream slice-by aggregation.
|
|
256
261
|
|
|
@@ -70,6 +70,7 @@ ModelCard(
|
|
|
70
70
|
hf_repo="Orionfold/finance-chat-GGUF", # drives default `## How to run` body
|
|
71
71
|
chat_format="llama-2", # → llama_cpp.Llama(chat_format=...)
|
|
72
72
|
recommended_variant="Q5_K_M", # featured in default snippets
|
|
73
|
+
llama_cpp_example_prompt="Explain working capital.", # user-message in the default `llama-cpp-python` snippet; falls back to a neutral placeholder when omitted
|
|
73
74
|
ollama_pull_handle=None, # opt-in override; default body wins otherwise
|
|
74
75
|
transformers_snippet=None,
|
|
75
76
|
lineage_prompt=None, # injected by publish_quant if a LineageStore is supplied
|
|
@@ -98,6 +99,7 @@ m = ArtifactManifest(
|
|
|
98
99
|
sustained_load_minutes=2.18,
|
|
99
100
|
vertical_eval={"Q4_K_M": 0.14, ...},
|
|
100
101
|
vertical_eval_name="FinanceBench (n=50, numeric_match)",
|
|
102
|
+
recommended_variant="Q5_K_M", # article-narrative pick; destination pins the "Sweet spot" badge to this variant
|
|
101
103
|
lineage_run_id=None,
|
|
102
104
|
license_tier="free", # Orionfold commercial tier (free / pro)
|
|
103
105
|
license_commercial_tier=None,
|
|
@@ -112,6 +114,8 @@ print(m.to_yaml())
|
|
|
112
114
|
|
|
113
115
|
The `license_tier` / `license_commercial_tier` fields live alongside `model_license` under a nested `license:` block in YAML output. Mac destination's Zod schema mirrors this shape.
|
|
114
116
|
|
|
117
|
+
`recommended_variant` (v0.4.2+) lets the article's narrative pick — the variant the writeup recommends — override the destination's rank-avg picker. Cyber's `Q4_K_M` topped CyberMetric but its worst-in-class perplexity dragged its rank-avg down, so without this field the catalog page would pin `Q5_K_M` as the "Sweet spot" instead. Same value flows into both `ModelCard` (HF README's How-to-run snippets) and `ArtifactManifest` (destination catalog) so the badge and the snippet stay in sync.
|
|
118
|
+
|
|
115
119
|
### `write_artifact_manifest(manifest, *, artifacts_dir)`
|
|
116
120
|
|
|
117
121
|
Writes the manifest to `<artifacts_dir>/<slug>.yaml`. Creates the directory if missing. Returns the absolute path of the written file — callers can stage it alongside the article for the next git commit.
|
|
@@ -134,7 +138,7 @@ Token resolution order: explicit `token=` arg → `HF_TOKEN` env → `HUGGING_FA
|
|
|
134
138
|
|
|
135
139
|
### `publish_quant(*, quant_report, base_model, repo_name, staging_dir, ...) → PublishResult`
|
|
136
140
|
|
|
137
|
-
The one-line orchestrator. Reads the duck-typed `quant_report` fields (`.format`, `.variants`, `.perplexity`, `.tokens_per_sec`, `.sustained_load_minutes`, `.variant_files`, `.vertical_eval`, `.vertical_eval_name`, `.model_license`, `.chat_format`, `.recommended_variant`), builds a `ModelCard`, stages the README + variant files, writes the `ArtifactManifest` (if `artifacts_dir` supplied), and invokes `HFHubAdapter.push_folder()`. Explicit kwargs override duck-typed report attrs.
|
|
141
|
+
The one-line orchestrator. Reads the duck-typed `quant_report` fields (`.format`, `.variants`, `.perplexity`, `.tokens_per_sec`, `.sustained_load_minutes`, `.variant_files`, `.vertical_eval`, `.vertical_eval_name`, `.model_license`, `.chat_format`, `.recommended_variant`, `.llama_cpp_example_prompt`), builds a `ModelCard`, stages the README + variant files, writes the `ArtifactManifest` (if `artifacts_dir` supplied), and invokes `HFHubAdapter.push_folder()`. Explicit kwargs override duck-typed report attrs.
|
|
138
142
|
|
|
139
143
|
```python
|
|
140
144
|
result = publish_quant(
|
|
@@ -150,6 +154,7 @@ result = publish_quant(
|
|
|
150
154
|
model_license="llama2", # critical — never default silently to apache-2.0
|
|
151
155
|
chat_format="llama-2",
|
|
152
156
|
recommended_variant="Q5_K_M",
|
|
157
|
+
llama_cpp_example_prompt="Explain working capital.", # mirror the article's example user-message
|
|
153
158
|
lineage_store=store, # optional; injects ## Lineage block
|
|
154
159
|
dry_run=True, # flip to False for the actual push
|
|
155
160
|
)
|
|
@@ -29,6 +29,15 @@ for custom scoring. ``exact_match`` and ``contains`` are deterministic;
|
|
|
29
29
|
``numeric_match`` extracts the first number from the prediction and compares
|
|
30
30
|
to the reference under a relative tolerance — the right default for
|
|
31
31
|
FinanceBench's quantitative questions.
|
|
32
|
+
|
|
33
|
+
**Open-book mode (v0.4.1+).** FinanceBench is an *open-book* benchmark — the
|
|
34
|
+
right answer is in the 10-K excerpt cited under ``evidence[*].evidence_text``.
|
|
35
|
+
``VerticalBench.from_jsonl(..., open_book=True)`` rewrites the
|
|
36
|
+
``VerticalQA.question`` to include the evidence text + a numeric-answer prompt
|
|
37
|
+
before the model sees it. Default is auto: ``True`` for `financebench`,
|
|
38
|
+
``False`` for everything else (LegalBench/generic). The 2026-05-13 V1 attempt
|
|
39
|
+
on AdaptLLM/finance-chat scored 0/50 closed-book and 14–18%/50 open-book on
|
|
40
|
+
the same JSONL — open-book is the load-bearing flag for FinanceBench scoring.
|
|
32
41
|
"""
|
|
33
42
|
|
|
34
43
|
from __future__ import annotations
|
|
@@ -143,8 +152,20 @@ def _detect_format(row: dict[str, Any]) -> str:
|
|
|
143
152
|
return "generic"
|
|
144
153
|
|
|
145
154
|
|
|
146
|
-
def _row_to_qa(row: dict[str, Any], fmt: str, fallback_idx: int) -> VerticalQA | None:
|
|
147
|
-
|
|
155
|
+
def _row_to_qa(
|
|
156
|
+
row: dict[str, Any],
|
|
157
|
+
fmt: str,
|
|
158
|
+
fallback_idx: int,
|
|
159
|
+
*,
|
|
160
|
+
open_book: bool = False,
|
|
161
|
+
) -> VerticalQA | None:
|
|
162
|
+
"""Map a JSONL row to `VerticalQA`. Returns None if required fields missing.
|
|
163
|
+
|
|
164
|
+
When `open_book=True` and `fmt == "financebench"`, prepends the row's
|
|
165
|
+
``evidence[*].evidence_text`` to the question so the model sees the
|
|
166
|
+
10-K excerpt the gold answer was derived from. No-op for other formats —
|
|
167
|
+
LegalBench / generic JSONLs don't have a standard evidence field.
|
|
168
|
+
"""
|
|
148
169
|
if fmt == "financebench":
|
|
149
170
|
qid = str(row.get("financebench_id") or f"fb-{fallback_idx}")
|
|
150
171
|
question = row.get("question") or ""
|
|
@@ -159,6 +180,16 @@ def _row_to_qa(row: dict[str, Any], fmt: str, fallback_idx: int) -> VerticalQA |
|
|
|
159
180
|
for k in ("company", "doc_period", "doc_type", "question_type")
|
|
160
181
|
if k in row
|
|
161
182
|
}
|
|
183
|
+
if open_book and question:
|
|
184
|
+
evidence_text = _extract_evidence_text(row)
|
|
185
|
+
if evidence_text:
|
|
186
|
+
doc_name = row.get("doc_name") or "the filing"
|
|
187
|
+
question = (
|
|
188
|
+
f"Context from {doc_name}:\n\n"
|
|
189
|
+
f"{evidence_text}\n\n"
|
|
190
|
+
f"Question: {question}\n\n"
|
|
191
|
+
f"Answer with just the numeric value."
|
|
192
|
+
)
|
|
162
193
|
elif fmt == "legalbench":
|
|
163
194
|
qid = str(row.get("id") or row.get("index") or f"lb-{fallback_idx}")
|
|
164
195
|
question = row.get("text") or row.get("input") or row.get("question") or ""
|
|
@@ -180,6 +211,23 @@ def _row_to_qa(row: dict[str, Any], fmt: str, fallback_idx: int) -> VerticalQA |
|
|
|
180
211
|
return VerticalQA(qid=qid, question=str(question), expected=str(expected), tags=tags)
|
|
181
212
|
|
|
182
213
|
|
|
214
|
+
def _extract_evidence_text(row: dict[str, Any]) -> str:
|
|
215
|
+
"""Flatten FinanceBench's `evidence: [{evidence_text: ...}, ...]` into a
|
|
216
|
+
blank-line-joined string. Accepts either list-of-dicts (canonical shape)
|
|
217
|
+
or list-of-strings (some pre-flattened dumps). Returns ``""`` when no
|
|
218
|
+
evidence field is present — caller falls back to closed-book.
|
|
219
|
+
"""
|
|
220
|
+
chunks: list[str] = []
|
|
221
|
+
for e in row.get("evidence") or []:
|
|
222
|
+
if isinstance(e, dict):
|
|
223
|
+
txt = e.get("evidence_text") or ""
|
|
224
|
+
if txt:
|
|
225
|
+
chunks.append(str(txt))
|
|
226
|
+
elif isinstance(e, str):
|
|
227
|
+
chunks.append(e)
|
|
228
|
+
return "\n\n".join(chunks)
|
|
229
|
+
|
|
230
|
+
|
|
183
231
|
# --- VerticalBench -------------------------------------------------------
|
|
184
232
|
|
|
185
233
|
|
|
@@ -224,6 +272,8 @@ class VerticalBench:
|
|
|
224
272
|
limit: int | None = None,
|
|
225
273
|
scorer: Callable[..., float] | None = None,
|
|
226
274
|
scorer_kwargs: dict[str, Any] | None = None,
|
|
275
|
+
open_book: bool | None = None,
|
|
276
|
+
subset: str | None = None,
|
|
227
277
|
) -> VerticalBench:
|
|
228
278
|
"""Load a JSONL test set from disk and return a configured bench.
|
|
229
279
|
|
|
@@ -232,6 +282,18 @@ class VerticalBench:
|
|
|
232
282
|
first row, so a partially-corrupt JSONL still triggers
|
|
233
283
|
format-specific behavior. Rows missing question or expected are
|
|
234
284
|
silently dropped (they show up as a row-count delta vs the JSONL).
|
|
285
|
+
|
|
286
|
+
`open_book` controls whether per-row evidence text is prepended to the
|
|
287
|
+
question. Default ``None`` resolves to ``True`` for `financebench`
|
|
288
|
+
(where the gold answer lives in the cited 10-K excerpt) and ``False``
|
|
289
|
+
for everything else. Pass ``True`` / ``False`` to override. Currently
|
|
290
|
+
only `financebench` rows have a standard evidence field; the flag is
|
|
291
|
+
a no-op for the other formats.
|
|
292
|
+
|
|
293
|
+
`subset` is a FinanceBench-only convenience filter that drops rows
|
|
294
|
+
whose ``question_type`` doesn't match. Useful for scoring only the
|
|
295
|
+
``metrics-generated`` subset (quantitative questions) without
|
|
296
|
+
pre-filtering the JSONL.
|
|
235
297
|
"""
|
|
236
298
|
p = Path(path)
|
|
237
299
|
questions: list[VerticalQA] = []
|
|
@@ -247,11 +309,19 @@ class VerticalBench:
|
|
|
247
309
|
continue
|
|
248
310
|
if fmt is None:
|
|
249
311
|
fmt = _detect_format(row)
|
|
250
|
-
qa = _row_to_qa(row, fmt, fallback_idx=i)
|
312
|
+
# Resolve open_book on the first row once fmt is known.
|
|
313
|
+
if open_book is None:
|
|
314
|
+
open_book = fmt == "financebench"
|
|
315
|
+
if subset is not None and row.get("question_type") != subset:
|
|
316
|
+
continue
|
|
317
|
+
qa = _row_to_qa(row, fmt, fallback_idx=i, open_book=open_book)
|
|
251
318
|
if qa is not None:
|
|
252
319
|
questions.append(qa)
|
|
253
320
|
if limit is not None and len(questions) >= limit:
|
|
254
321
|
break
|
|
322
|
+
# If the file was empty, open_book may still be None — collapse to False.
|
|
323
|
+
if open_book is None:
|
|
324
|
+
open_book = False
|
|
255
325
|
# Pick a sensible default scorer per format if caller didn't override.
|
|
256
326
|
if scorer is None:
|
|
257
327
|
scorer = numeric_match if fmt == "financebench" else exact_match
|
|
@@ -201,6 +201,12 @@ class ModelCard:
|
|
|
201
201
|
"""Variant to feature in the default How-to-run snippets (e.g. `Q5_K_M`).
|
|
202
202
|
Falls back to `Q5_K_M` if present in `variants`, else the first listed
|
|
203
203
|
variant."""
|
|
204
|
+
llama_cpp_example_prompt: Optional[str] = None
|
|
205
|
+
"""User-message string for the templated `llama-cpp-python` snippet.
|
|
206
|
+
Mirrors what the article's "Using this release" section asks the model —
|
|
207
|
+
keep article + HF card in lockstep. When omitted, falls back to a
|
|
208
|
+
neutral placeholder. Multi-line strings (e.g. MCQ-shaped prompts) render
|
|
209
|
+
via JSON-escaped `\\n` so the snippet stays single-line + valid Python."""
|
|
204
210
|
ollama_pull_handle: Optional[str] = None
|
|
205
211
|
transformers_snippet: Optional[str] = None
|
|
206
212
|
lineage_prompt: Optional[str] = None
|
|
@@ -329,7 +335,12 @@ def _render_how_to_run(card: ModelCard) -> list[str]:
|
|
|
329
335
|
out.append(f" n_ctx=4096, n_gpu_layers=99{chat_format_kw},")
|
|
330
336
|
out.append(")")
|
|
331
337
|
out.append("out = llm.create_chat_completion(")
|
|
332
|
-
|
|
338
|
+
example_prompt = (
|
|
339
|
+
card.llama_cpp_example_prompt
|
|
340
|
+
or "Summarize the key idea in one paragraph."
|
|
341
|
+
)
|
|
342
|
+
escaped_prompt = json.dumps(example_prompt, ensure_ascii=False)
|
|
343
|
+
out.append(f' messages=[{{"role": "user", "content": {escaped_prompt}}}],')
|
|
333
344
|
out.append(" temperature=0.0,")
|
|
334
345
|
out.append(")")
|
|
335
346
|
out.append('print(out["choices"][0]["message"]["content"])')
|
|
@@ -525,6 +536,14 @@ class ArtifactManifest:
|
|
|
525
536
|
vertical_eval_name: Optional[str] = None
|
|
526
537
|
"""Display name for the vertical eval (e.g.,
|
|
527
538
|
"FinanceBench (n=50, numeric_match)")."""
|
|
539
|
+
recommended_variant: Optional[str] = None
|
|
540
|
+
"""Article-narrative pick for the "Sweet spot" — the variant the catalog
|
|
541
|
+
page should pin under `recommended_variant:` instead of letting the
|
|
542
|
+
destination's rank-avg picker choose. Mirrors `ModelCard.recommended_variant`
|
|
543
|
+
so the HF README badge and the destination catalog page stay in sync.
|
|
544
|
+
Added in v0.4.2 after cyber-vertical: Q4_K_M topped the bench but its
|
|
545
|
+
worst-in-class perplexity dragged its rank-avg down, so the picker selected
|
|
546
|
+
Q5_K_M. This field lets the article's narrative judgment win."""
|
|
528
547
|
lineage_run_id: Optional[str] = None
|
|
529
548
|
license_tier: str = "free"
|
|
530
549
|
license_commercial_tier: Optional[str] = None
|
|
@@ -564,6 +583,8 @@ class ArtifactManifest:
|
|
|
564
583
|
d["vertical_eval"] = dict(self.vertical_eval)
|
|
565
584
|
if self.vertical_eval_name:
|
|
566
585
|
d["vertical_eval_name"] = self.vertical_eval_name
|
|
586
|
+
if self.recommended_variant:
|
|
587
|
+
d["recommended_variant"] = self.recommended_variant
|
|
567
588
|
if self.lineage_run_id:
|
|
568
589
|
d["lineage_run_id"] = self.lineage_run_id
|
|
569
590
|
d["license"] = {"tier": self.license_tier}
|
|
@@ -813,6 +834,7 @@ def publish_quant(
|
|
|
813
834
|
model_license: Optional[str] = None,
|
|
814
835
|
chat_format: Optional[str] = None,
|
|
815
836
|
recommended_variant: Optional[str] = None,
|
|
837
|
+
llama_cpp_example_prompt: Optional[str] = None,
|
|
816
838
|
) -> PublishResult:
|
|
817
839
|
"""Orchestrate model-card render + manifest write + HF push.
|
|
818
840
|
|
|
@@ -860,6 +882,8 @@ def publish_quant(
|
|
|
860
882
|
chat_format = getattr(quant_report, "chat_format", None)
|
|
861
883
|
if recommended_variant is None:
|
|
862
884
|
recommended_variant = getattr(quant_report, "recommended_variant", None)
|
|
885
|
+
if llama_cpp_example_prompt is None:
|
|
886
|
+
llama_cpp_example_prompt = getattr(quant_report, "llama_cpp_example_prompt", None)
|
|
863
887
|
|
|
864
888
|
# Build the card.
|
|
865
889
|
tag_set: list[str] = [
|
|
@@ -916,6 +940,7 @@ def publish_quant(
|
|
|
916
940
|
hf_repo=hf_repo,
|
|
917
941
|
chat_format=chat_format,
|
|
918
942
|
recommended_variant=recommended_variant,
|
|
943
|
+
llama_cpp_example_prompt=llama_cpp_example_prompt,
|
|
919
944
|
ollama_pull_handle=ollama_pull_handle,
|
|
920
945
|
transformers_snippet=transformers_snippet,
|
|
921
946
|
lineage_prompt=lineage_prompt,
|
|
@@ -951,6 +976,7 @@ def publish_quant(
|
|
|
951
976
|
sustained_load_minutes=sustained,
|
|
952
977
|
vertical_eval=vertical_eval,
|
|
953
978
|
vertical_eval_name=vertical_eval_name,
|
|
979
|
+
recommended_variant=recommended_variant,
|
|
954
980
|
lineage_run_id=lineage_run_id,
|
|
955
981
|
model_license=model_license,
|
|
956
982
|
article=f"articles/{article_slug}/" if article_slug else None,
|
|
@@ -457,6 +457,32 @@ def test_artifact_manifest_carries_vertical_eval_when_set() -> None:
|
|
|
457
457
|
assert "vertical_eval_name:" in yaml_text
|
|
458
458
|
|
|
459
459
|
|
|
460
|
+
def test_artifact_manifest_carries_recommended_variant_when_set() -> None:
|
|
461
|
+
m = ArtifactManifest(
|
|
462
|
+
slug="securityllm-gguf",
|
|
463
|
+
kind="quant",
|
|
464
|
+
artifact_class="gguf",
|
|
465
|
+
base_model="ZySec-AI/SecurityLLM",
|
|
466
|
+
hf_repo="Orionfold/SecurityLLM-GGUF",
|
|
467
|
+
variants=("Q4_K_M", "Q5_K_M"),
|
|
468
|
+
recommended_variant="Q4_K_M",
|
|
469
|
+
)
|
|
470
|
+
d = m.to_dict()
|
|
471
|
+
assert d["recommended_variant"] == "Q4_K_M"
|
|
472
|
+
yaml_text = m.to_yaml()
|
|
473
|
+
assert "recommended_variant: Q4_K_M" in yaml_text
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def test_artifact_manifest_omits_recommended_variant_when_unset() -> None:
|
|
477
|
+
m = ArtifactManifest(
|
|
478
|
+
slug="s", kind="quant", artifact_class="gguf",
|
|
479
|
+
base_model="b", hf_repo="Orionfold/x",
|
|
480
|
+
)
|
|
481
|
+
d = m.to_dict()
|
|
482
|
+
assert "recommended_variant" not in d
|
|
483
|
+
assert "recommended_variant" not in m.to_yaml()
|
|
484
|
+
|
|
485
|
+
|
|
460
486
|
def test_artifact_manifest_yaml_is_parseable_round_trip() -> None:
|
|
461
487
|
yaml_text = ArtifactManifest(
|
|
462
488
|
slug="s",
|
|
@@ -653,6 +679,29 @@ def test_publish_quant_reads_vertical_eval_from_quant_report_duck_typed(
|
|
|
653
679
|
assert "55.0%" in card and "60.0%" in card
|
|
654
680
|
|
|
655
681
|
|
|
682
|
+
def test_publish_quant_threads_recommended_variant_into_card_and_manifest(
|
|
683
|
+
tmp_path: Path,
|
|
684
|
+
) -> None:
|
|
685
|
+
"""`recommended_variant` kwarg flows to the README's How-to-run snippets
|
|
686
|
+
AND to the manifest YAML so the destination catalog pins the same pick."""
|
|
687
|
+
qr = _stub_quant_report(tmp_path / "source")
|
|
688
|
+
result = publish_quant(
|
|
689
|
+
quant_report=qr,
|
|
690
|
+
base_model="ZySec-AI/SecurityLLM",
|
|
691
|
+
repo_name="SecurityLLM-GGUF",
|
|
692
|
+
staging_dir=tmp_path / "stage",
|
|
693
|
+
artifacts_dir=tmp_path / "content" / "artifacts",
|
|
694
|
+
recommended_variant="Q4_K_M",
|
|
695
|
+
dry_run=True,
|
|
696
|
+
)
|
|
697
|
+
card = result.card_path.read_text()
|
|
698
|
+
# The default How-to-run snippet templates against the recommended variant
|
|
699
|
+
assert "Q4_K_M" in card
|
|
700
|
+
# Manifest YAML carries the recommended_variant field
|
|
701
|
+
manifest = result.manifest_path.read_text()
|
|
702
|
+
assert "recommended_variant: Q4_K_M" in manifest
|
|
703
|
+
|
|
704
|
+
|
|
656
705
|
def test_publish_quant_threads_model_license_into_card_and_manifest(
|
|
657
706
|
tmp_path: Path,
|
|
658
707
|
) -> None:
|
|
@@ -217,6 +217,114 @@ class TestFromJsonl:
|
|
|
217
217
|
assert vb.name == "financebench-mini"
|
|
218
218
|
|
|
219
219
|
|
|
220
|
+
# --- Open-book mode (v0.4.1+) -----------------------------------------------
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _fb_row(qid: str, question: str, gold: str, evidence_text: str, **extra) -> dict:
|
|
224
|
+
return {
|
|
225
|
+
"financebench_id": qid,
|
|
226
|
+
"question": question,
|
|
227
|
+
"gold_standard": gold,
|
|
228
|
+
"doc_name": extra.get("doc_name", "ACME_2024_10-K"),
|
|
229
|
+
"company": extra.get("company", "ACME"),
|
|
230
|
+
"doc_period": extra.get("doc_period", "FY2024"),
|
|
231
|
+
"question_type": extra.get("question_type", "metrics-generated"),
|
|
232
|
+
"evidence": [{"evidence_text": evidence_text}] if evidence_text else [],
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class TestOpenBook:
|
|
237
|
+
def test_financebench_auto_open_book(self, tmp_path: Path) -> None:
|
|
238
|
+
# No explicit open_book — default for financebench is True.
|
|
239
|
+
rows = [_fb_row("fb-1", "What was revenue?", "1234", "Revenue: $1,234M")]
|
|
240
|
+
p = _write_jsonl(tmp_path, "fb.jsonl", rows)
|
|
241
|
+
vb = VerticalBench.from_jsonl(p)
|
|
242
|
+
assert len(vb.questions) == 1
|
|
243
|
+
q = vb.questions[0]
|
|
244
|
+
# Evidence is now in the question prompt.
|
|
245
|
+
assert "Revenue: $1,234M" in q.question
|
|
246
|
+
assert "Question: What was revenue?" in q.question
|
|
247
|
+
assert "ACME_2024_10-K" in q.question
|
|
248
|
+
# Expected is unchanged.
|
|
249
|
+
assert q.expected == "1234"
|
|
250
|
+
|
|
251
|
+
def test_explicit_open_book_false_keeps_closed(self, tmp_path: Path) -> None:
|
|
252
|
+
rows = [_fb_row("fb-1", "What was revenue?", "1234", "Revenue: $1,234M")]
|
|
253
|
+
p = _write_jsonl(tmp_path, "fb.jsonl", rows)
|
|
254
|
+
vb = VerticalBench.from_jsonl(p, open_book=False)
|
|
255
|
+
assert vb.questions[0].question == "What was revenue?"
|
|
256
|
+
|
|
257
|
+
def test_open_book_no_evidence_falls_back(self, tmp_path: Path) -> None:
|
|
258
|
+
# A financebench row missing evidence — open_book=True is a no-op.
|
|
259
|
+
rows = [_fb_row("fb-1", "What was revenue?", "1234", "")]
|
|
260
|
+
p = _write_jsonl(tmp_path, "fb.jsonl", rows)
|
|
261
|
+
vb = VerticalBench.from_jsonl(p)
|
|
262
|
+
assert vb.questions[0].question == "What was revenue?"
|
|
263
|
+
|
|
264
|
+
def test_legalbench_open_book_is_noop(self, tmp_path: Path) -> None:
|
|
265
|
+
# No standard evidence field — open_book=True doesn't change the question.
|
|
266
|
+
rows = [{"id": "lb-1", "text": "Is X enforceable?", "answer": "yes"}]
|
|
267
|
+
p = _write_jsonl(tmp_path, "lb.jsonl", rows)
|
|
268
|
+
vb = VerticalBench.from_jsonl(p, open_book=True)
|
|
269
|
+
assert vb.questions[0].question == "Is X enforceable?"
|
|
270
|
+
|
|
271
|
+
def test_open_book_default_off_for_legalbench(self, tmp_path: Path) -> None:
|
|
272
|
+
# Auto-default for non-financebench is False, even if a stray evidence
|
|
273
|
+
# field is present.
|
|
274
|
+
rows = [
|
|
275
|
+
{
|
|
276
|
+
"id": "lb-1",
|
|
277
|
+
"text": "Is X enforceable?",
|
|
278
|
+
"answer": "yes",
|
|
279
|
+
"evidence": [{"evidence_text": "Section 3.1 prohibits X."}],
|
|
280
|
+
}
|
|
281
|
+
]
|
|
282
|
+
p = _write_jsonl(tmp_path, "lb.jsonl", rows)
|
|
283
|
+
vb = VerticalBench.from_jsonl(p)
|
|
284
|
+
# No prepending — only financebench triggers open-book by default.
|
|
285
|
+
assert vb.questions[0].question == "Is X enforceable?"
|
|
286
|
+
|
|
287
|
+
def test_evidence_text_as_strings(self, tmp_path: Path) -> None:
|
|
288
|
+
# Some pre-flattened dumps put evidence as list-of-strings.
|
|
289
|
+
rows = [
|
|
290
|
+
{
|
|
291
|
+
"financebench_id": "fb-1",
|
|
292
|
+
"question": "q",
|
|
293
|
+
"gold_standard": "g",
|
|
294
|
+
"evidence": ["chunk-1", "chunk-2"],
|
|
295
|
+
}
|
|
296
|
+
]
|
|
297
|
+
p = _write_jsonl(tmp_path, "fb.jsonl", rows)
|
|
298
|
+
vb = VerticalBench.from_jsonl(p)
|
|
299
|
+
assert "chunk-1" in vb.questions[0].question
|
|
300
|
+
assert "chunk-2" in vb.questions[0].question
|
|
301
|
+
|
|
302
|
+
def test_subset_filter_financebench(self, tmp_path: Path) -> None:
|
|
303
|
+
rows = [
|
|
304
|
+
_fb_row("fb-1", "q1", "1", "ev-1", question_type="metrics-generated"),
|
|
305
|
+
_fb_row("fb-2", "q2", "2", "ev-2", question_type="domain-relevant"),
|
|
306
|
+
_fb_row("fb-3", "q3", "3", "ev-3", question_type="metrics-generated"),
|
|
307
|
+
]
|
|
308
|
+
p = _write_jsonl(tmp_path, "fb.jsonl", rows)
|
|
309
|
+
vb = VerticalBench.from_jsonl(p, subset="metrics-generated")
|
|
310
|
+
assert len(vb.questions) == 2
|
|
311
|
+
assert {q.qid for q in vb.questions} == {"fb-1", "fb-3"}
|
|
312
|
+
|
|
313
|
+
def test_subset_limit_compose(self, tmp_path: Path) -> None:
|
|
314
|
+
# subset filters before limit applies.
|
|
315
|
+
rows = [
|
|
316
|
+
_fb_row(f"fb-{i}", f"q{i}", str(i), f"ev-{i}", question_type="metrics-generated")
|
|
317
|
+
for i in range(5)
|
|
318
|
+
] + [
|
|
319
|
+
_fb_row(f"fbx-{i}", f"qx{i}", str(i), f"ev-{i}", question_type="other")
|
|
320
|
+
for i in range(5)
|
|
321
|
+
]
|
|
322
|
+
p = _write_jsonl(tmp_path, "fb.jsonl", rows)
|
|
323
|
+
vb = VerticalBench.from_jsonl(p, subset="metrics-generated", limit=3)
|
|
324
|
+
assert len(vb.questions) == 3
|
|
325
|
+
assert all(q.qid.startswith("fb-") for q in vb.questions)
|
|
326
|
+
|
|
327
|
+
|
|
220
328
|
# --- VerticalBench.run -------------------------------------------------------
|
|
221
329
|
|
|
222
330
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|