@dogfood-lab/study-swarm 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +36 -0
- package/PROTOCOL.md +15 -2
- package/README.es.md +45 -33
- package/README.fr.md +44 -32
- package/README.hi.md +53 -41
- package/README.it.md +48 -36
- package/README.ja.md +53 -41
- package/README.md +14 -2
- package/README.pt-BR.md +52 -40
- package/README.zh.md +56 -44
- package/bin/study-swarm.mjs +183 -1
- package/examples/study-swarm-lock.dispatch.md +137 -0
- package/examples/study-swarm-lock.lock.json +62 -0
- package/examples/study-swarm-lock.orchestration.json +369 -0
- package/examples/study-swarm-v1_1.dispatch.md +89 -0
- package/package.json +1 -1
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema": "dispatch.lock/v1",
|
|
3
|
+
"study_swarm_version": "1.2.0",
|
|
4
|
+
"protocol_sha256": "sha256-TPun6PfvCRUx0BI6GgAOlp0P91Mg3gK0DLnpaHQ6jQs=",
|
|
5
|
+
"dispatch_sha256": "sha256-mPdwnlPuRlCtky20zG6l5AFJ8iRAZjHISnoGDUuGSOo=",
|
|
6
|
+
"steps": [
|
|
7
|
+
{
|
|
8
|
+
"question_id": "Q1-replay-manifest",
|
|
9
|
+
"resolved_model": "claude-opus-4-8",
|
|
10
|
+
"prompt_sha256": "sha256-hcdcTxvqJNij0z9ItUPIzQhBWbrVZy3VvY4umPHaM5I=",
|
|
11
|
+
"tool_schema_sha256": "sha256-kUC7s+wAFV4lzwrFKYp36khxCRQZWlWeFDOwhPIqGCk=",
|
|
12
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
13
|
+
"output_sha256": "sha256-k5rkOOyAHGBCVacEk8LPULp+s5YHVzaUyCHJY4tD5jE="
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
"question_id": "Q2-canonicalization",
|
|
17
|
+
"resolved_model": "claude-opus-4-8",
|
|
18
|
+
"prompt_sha256": "sha256-Gj+ri3gWVWxP7M9fv50c0xsgiZ2j3T6kn3A3c3EhSos=",
|
|
19
|
+
"tool_schema_sha256": "sha256-kUC7s+wAFV4lzwrFKYp36khxCRQZWlWeFDOwhPIqGCk=",
|
|
20
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
21
|
+
"output_sha256": "sha256-ymPE5lJqoygN0MpzqftgE1tYlyp7z+poCK2Z1Zj3mi8="
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"question_id": "Q3-provenance-attestation",
|
|
25
|
+
"resolved_model": "claude-opus-4-8",
|
|
26
|
+
"prompt_sha256": "sha256-g5/dW+zJvMObKyqnAdy70pUPtJcGD0DrsjkeEx/AcIY=",
|
|
27
|
+
"tool_schema_sha256": "sha256-kUC7s+wAFV4lzwrFKYp36khxCRQZWlWeFDOwhPIqGCk=",
|
|
28
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
29
|
+
"output_sha256": "sha256-OSUAIhytytKihfFM1Y+p1BllUSTC+KEfb/NX86X+Kc0="
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"question_id": "Q4-llm-determinism",
|
|
33
|
+
"resolved_model": "claude-opus-4-8",
|
|
34
|
+
"prompt_sha256": "sha256-OfQ00IyAulvEOnZ/UpiLp+JTwF4jYUzQRaqCp3zgIbE=",
|
|
35
|
+
"tool_schema_sha256": "sha256-kUC7s+wAFV4lzwrFKYp36khxCRQZWlWeFDOwhPIqGCk=",
|
|
36
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
37
|
+
"output_sha256": "sha256-aFDm9p4/94p97NJg/vWKbYXchv6N1b0swpIfYoep1iw="
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
"question_id": "Q5-tool-schema-drift",
|
|
41
|
+
"resolved_model": "claude-opus-4-8",
|
|
42
|
+
"prompt_sha256": "sha256-qIr9Dl8GmGeikUrgCJ62QQh8FvTNUhvdxL0Ivk/Di3M=",
|
|
43
|
+
"tool_schema_sha256": "sha256-kUC7s+wAFV4lzwrFKYp36khxCRQZWlWeFDOwhPIqGCk=",
|
|
44
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
45
|
+
"output_sha256": "sha256-Y1lsUdKoplgdos0r4faN6UgxXX74D/Se0AVHCiUA9dM="
|
|
46
|
+
}
|
|
47
|
+
],
|
|
48
|
+
"verification": {
|
|
49
|
+
"runner": "roleos verify-citations",
|
|
50
|
+
"runner_source": "role-os local clone E:/AI/role-os",
|
|
51
|
+
"tool": "prism verify --type citations",
|
|
52
|
+
"tool_version": "prism 1.6.0",
|
|
53
|
+
"verifier_model": "mistral-small:24b",
|
|
54
|
+
"verifier_family": "local",
|
|
55
|
+
"caller_family_excluded": "anthropic",
|
|
56
|
+
"verdict": "escalate",
|
|
57
|
+
"receipt_id": "prism-01kwbajx31dj9gcf5xn3cn5ydg",
|
|
58
|
+
"receipt_signature": "272c892124e3bc13a76b2674fa361b1d65aee6a588c74604cf4ae4e7c9440a8ba7888175b9ec1286fe87490121f694f64cd30adc3ffc0e1b31cd3365b7b38901",
|
|
59
|
+
"receipt_chain_sha256": "499b63905064a5e25fd1801c5530504c94742f2183c4d3c8eb545a20cfbb112e"
|
|
60
|
+
},
|
|
61
|
+
"lock_sha256": "sha256-vLSdNNCbyk+Oyp+MRNTamBM5QdJlSUShfkTf5zWcpHs="
|
|
62
|
+
}
|
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
{
|
|
2
|
+
"schema": "study-swarm.orchestration/v1",
|
|
3
|
+
"note": "Harness-emitted record of the Step-2 research agents for study-swarm-lock.dispatch.md. tool_schema pins the StructuredOutput contract each agent was bound to (the load-bearing, capturable surface); a full harness would pin the entire tool array. output_sha256 hashes the returned findings for drift detection (not determinism).",
|
|
4
|
+
"steps": [
|
|
5
|
+
{
|
|
6
|
+
"question_id": "Q1-replay-manifest",
|
|
7
|
+
"resolved_model": "claude-opus-4-8",
|
|
8
|
+
"prompt": "You are a research agent in a STUDY-SWARM (the research-grounded-advisor protocol) grounding the design of a new feature for the open-source repo `dogfood-lab/study-swarm`.\n\nTHE FEATURE — `dispatch.lock.json`: a per-dispatch lockfile that makes a study-swarm research dispatch BYTE-REPLAYABLE by pinning, per step:\n- the RESOLVED model id each research agent actually ran on (e.g. claude-opus-4-8, never an alias like \"opus\"),\n- the SHA-256 of the byte-exact agent prompt,\n- the SHA-256 of the tool JSONSchemas the agent had,\n- the external-verifier run/receipt id (e.g. a prism Ed25519 receipt id) and the receipt chain hash,\n- plus a top-level `lock_sha256` rollup over the whole lock (its content-address).\nThis implements the PIN_PER_STEP workflow standard (heritage: Snakemake 2012, Pegasus 2001).\n\nIMPLEMENTATION CONSTRAINTS (these shape which evidence is useful):\n- The CLI is ZERO-DEPENDENCY, NETWORK-FREE, DETERMINISTIC: SHA-256 via node:crypto, JSON I/O only. It makes NO model calls. The ORCHESTRATION HARNESS supplies the resolved models + byte-exact prompts + verifier run_id; the CLI only canonicalizes + hashes + validates them, and `lock --verify` re-derives the deterministic hashes and FAILS (exit 1) on drift.\n- Honest ceiling: pinning model+prompt+temp does NOT give bit-identical LLM outputs. The lock pins INPUTS byte-exact + records OUTPUT hashes for DRIFT DETECTION — \"replayable inputs + drift-detectable outputs\", NOT \"deterministic replay\".\n\nYOUR JOB: gather SPECIFIC, CITED, RETRIEVED evidence to answer ONE question. HARD RULES:\n- GROUND AT GENERATION TIME: use WebSearch and WebFetch to ACTUALLY RETRIEVE every source you cite THIS session. Cite ONLY sources you actually fetched. 
A claim you cannot ground in a fetched source is DROPPED, not invented.\n- Every finding needs: a one-sentence claim in your own words that MATCHES what the source actually says (do NOT overstate); author(s)/org; year; a RESOLVABLE identifier (arXiv:NNNN.NNNNN, a DOI, an RFC number, or a direct URL to the spec/paper — not a blog summary); the resolvable URL; whether you retrieved it; and a one-sentence DESIGN IMPLICATION for `dispatch.lock.json`.\n- Prefer specificity over breadth: 6-8 well-sourced, RETRIEVED findings beat 20 vague gestures. ~500-600 words of substance.\n- Set retrieved=false for anything you could not actually fetch — those will be dropped.\n\n========\nYOUR QUESTION (Q1-replay-manifest):\nQUESTION: How do reproducible-workflow and build/package systems structure a replay manifest, and how do they detect & surface DRIFT between the lock and a re-run?\nInvestigate (retrieve the actual papers/docs): Snakemake (Koster & Rahmann 2012, Bioinformatics DOI:10.1093/bioinformatics/bts480; and the 2021 F1000Research sustainable-data-analysis update); Pegasus (Deelman et al., Future Generation Computer Systems 2015 / the workflow provenance work); Nextflow (Di Tommaso et al. 2017, Nature Biotechnology DOI:10.1038/nbt.3820); ReproZip (Chirigati et al.); Nix / reproducible builds; and DEPENDENCY LOCKFILES (npm package-lock.json integrity, Cargo.lock, uv.lock, pip --require-hashes / PEP 665 / PEP 658). For each: WHAT is pinned (inputs, versions, content hashes), HOW it is hashed/content-addressed, and HOW drift is detected and surfaced (e.g. an integrity mismatch failing the install/CI). Map each to a concrete `dispatch.lock.json` field or to the `lock --verify` drift-detection behavior.\n\nReturn structured findings. Remember: retrieve-then-cite; drop what you cannot fetch.",
|
|
9
|
+
"tool_schema": {
|
|
10
|
+
"type": "object",
|
|
11
|
+
"additionalProperties": false,
|
|
12
|
+
"required": [
|
|
13
|
+
"question_id",
|
|
14
|
+
"findings",
|
|
15
|
+
"notes"
|
|
16
|
+
],
|
|
17
|
+
"properties": {
|
|
18
|
+
"question_id": {
|
|
19
|
+
"type": "string"
|
|
20
|
+
},
|
|
21
|
+
"findings": {
|
|
22
|
+
"type": "array",
|
|
23
|
+
"items": {
|
|
24
|
+
"type": "object",
|
|
25
|
+
"additionalProperties": false,
|
|
26
|
+
"required": [
|
|
27
|
+
"claim",
|
|
28
|
+
"authors",
|
|
29
|
+
"year",
|
|
30
|
+
"identifier",
|
|
31
|
+
"url",
|
|
32
|
+
"retrieved",
|
|
33
|
+
"design_implication"
|
|
34
|
+
],
|
|
35
|
+
"properties": {
|
|
36
|
+
"claim": {
|
|
37
|
+
"type": "string",
|
|
38
|
+
"description": "One-sentence finding in your own words, matching the source (do not overstate)."
|
|
39
|
+
},
|
|
40
|
+
"authors": {
|
|
41
|
+
"type": "string",
|
|
42
|
+
"description": "Author(s) or org, e.g. \"Koster & Rahmann\" or \"Rundgren et al.\" or \"OpenSSF\"."
|
|
43
|
+
},
|
|
44
|
+
"year": {
|
|
45
|
+
"type": "string"
|
|
46
|
+
},
|
|
47
|
+
"identifier": {
|
|
48
|
+
"type": "string",
|
|
49
|
+
"description": "arXiv:NNNN.NNNNN, a DOI (10.xxxx/...), an RFC number, or a direct URL."
|
|
50
|
+
},
|
|
51
|
+
"url": {
|
|
52
|
+
"type": "string",
|
|
53
|
+
"description": "A resolvable URL the existence oracle can fetch."
|
|
54
|
+
},
|
|
55
|
+
"retrieved": {
|
|
56
|
+
"type": "boolean",
|
|
57
|
+
"description": "true ONLY if you actually fetched this source this session."
|
|
58
|
+
},
|
|
59
|
+
"design_implication": {
|
|
60
|
+
"type": "string",
|
|
61
|
+
"description": "One sentence: implication for dispatch.lock.json."
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
},
|
|
66
|
+
"notes": {
|
|
67
|
+
"type": "string",
|
|
68
|
+
"description": "Coverage gaps, sources you could not fetch, dropped claims."
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
73
|
+
"output_sha256": "sha256-k5rkOOyAHGBCVacEk8LPULp+s5YHVzaUyCHJY4tD5jE="
|
|
74
|
+
},
|
|
75
|
+
{
|
|
76
|
+
"question_id": "Q2-canonicalization",
|
|
77
|
+
"resolved_model": "claude-opus-4-8",
|
|
78
|
+
"prompt": "You are a research agent in a STUDY-SWARM (the research-grounded-advisor protocol) grounding the design of a new feature for the open-source repo `dogfood-lab/study-swarm`.\n\nTHE FEATURE — `dispatch.lock.json`: a per-dispatch lockfile that makes a study-swarm research dispatch BYTE-REPLAYABLE by pinning, per step:\n- the RESOLVED model id each research agent actually ran on (e.g. claude-opus-4-8, never an alias like \"opus\"),\n- the SHA-256 of the byte-exact agent prompt,\n- the SHA-256 of the tool JSONSchemas the agent had,\n- the external-verifier run/receipt id (e.g. a prism Ed25519 receipt id) and the receipt chain hash,\n- plus a top-level `lock_sha256` rollup over the whole lock (its content-address).\nThis implements the PIN_PER_STEP workflow standard (heritage: Snakemake 2012, Pegasus 2001).\n\nIMPLEMENTATION CONSTRAINTS (these shape which evidence is useful):\n- The CLI is ZERO-DEPENDENCY, NETWORK-FREE, DETERMINISTIC: SHA-256 via node:crypto, JSON I/O only. It makes NO model calls. The ORCHESTRATION HARNESS supplies the resolved models + byte-exact prompts + verifier run_id; the CLI only canonicalizes + hashes + validates them, and `lock --verify` re-derives the deterministic hashes and FAILS (exit 1) on drift.\n- Honest ceiling: pinning model+prompt+temp does NOT give bit-identical LLM outputs. The lock pins INPUTS byte-exact + records OUTPUT hashes for DRIFT DETECTION — \"replayable inputs + drift-detectable outputs\", NOT \"deterministic replay\".\n\nYOUR JOB: gather SPECIFIC, CITED, RETRIEVED evidence to answer ONE question. HARD RULES:\n- GROUND AT GENERATION TIME: use WebSearch and WebFetch to ACTUALLY RETRIEVE every source you cite THIS session. Cite ONLY sources you actually fetched. 
A claim you cannot ground in a fetched source is DROPPED, not invented.\n- Every finding needs: a one-sentence claim in your own words that MATCHES what the source actually says (do NOT overstate); author(s)/org; year; a RESOLVABLE identifier (arXiv:NNNN.NNNNN, a DOI, an RFC number, or a direct URL to the spec/paper — not a blog summary); the resolvable URL; whether you retrieved it; and a one-sentence DESIGN IMPLICATION for `dispatch.lock.json`.\n- Prefer specificity over breadth: 6-8 well-sourced, RETRIEVED findings beat 20 vague gestures. ~500-600 words of substance.\n- Set retrieved=false for anything you could not actually fetch — those will be dropped.\n\n========\nYOUR QUESTION (Q2-canonicalization):\nQUESTION: What is the correct way to canonicalize structured (JSON) data so a hash is STABLE across platforms and re-serializations, and how should per-step hashes roll up to one dispatch hash?\nInvestigate (retrieve the actual specs): RFC 8785 JSON Canonicalization Scheme (Rundgren, Jordan & Erdtman 2020) — exactly what it normalizes (property ordering, number serialization per ECMAScript, Unicode/UTF-8, whitespace); JWS/JOSE canonical serialization (RFC 7515); Merkle trees / hash chains (Merkle, CRYPTO 1987, DOI:10.1007/3-540-48184-2_32) for per-step → rollup; and the concrete instability sources a WINDOWS-authored tool must defend against — CRLF vs LF, key/property ordering, Unicode normalization (NFC/NFD), floating-point/number formatting, trailing whitespace, BOM. Map each to how `lock_sha256`, `prompt_sha256`, and `tool_schema_sha256` must be computed so the SAME dispatch hashes IDENTICALLY on Windows, Linux, and macOS.\n\nReturn structured findings. Remember: retrieve-then-cite; drop what you cannot fetch.",
|
|
79
|
+
"tool_schema": {
|
|
80
|
+
"type": "object",
|
|
81
|
+
"additionalProperties": false,
|
|
82
|
+
"required": [
|
|
83
|
+
"question_id",
|
|
84
|
+
"findings",
|
|
85
|
+
"notes"
|
|
86
|
+
],
|
|
87
|
+
"properties": {
|
|
88
|
+
"question_id": {
|
|
89
|
+
"type": "string"
|
|
90
|
+
},
|
|
91
|
+
"findings": {
|
|
92
|
+
"type": "array",
|
|
93
|
+
"items": {
|
|
94
|
+
"type": "object",
|
|
95
|
+
"additionalProperties": false,
|
|
96
|
+
"required": [
|
|
97
|
+
"claim",
|
|
98
|
+
"authors",
|
|
99
|
+
"year",
|
|
100
|
+
"identifier",
|
|
101
|
+
"url",
|
|
102
|
+
"retrieved",
|
|
103
|
+
"design_implication"
|
|
104
|
+
],
|
|
105
|
+
"properties": {
|
|
106
|
+
"claim": {
|
|
107
|
+
"type": "string",
|
|
108
|
+
"description": "One-sentence finding in your own words, matching the source (do not overstate)."
|
|
109
|
+
},
|
|
110
|
+
"authors": {
|
|
111
|
+
"type": "string",
|
|
112
|
+
"description": "Author(s) or org, e.g. \"Koster & Rahmann\" or \"Rundgren et al.\" or \"OpenSSF\"."
|
|
113
|
+
},
|
|
114
|
+
"year": {
|
|
115
|
+
"type": "string"
|
|
116
|
+
},
|
|
117
|
+
"identifier": {
|
|
118
|
+
"type": "string",
|
|
119
|
+
"description": "arXiv:NNNN.NNNNN, a DOI (10.xxxx/...), an RFC number, or a direct URL."
|
|
120
|
+
},
|
|
121
|
+
"url": {
|
|
122
|
+
"type": "string",
|
|
123
|
+
"description": "A resolvable URL the existence oracle can fetch."
|
|
124
|
+
},
|
|
125
|
+
"retrieved": {
|
|
126
|
+
"type": "boolean",
|
|
127
|
+
"description": "true ONLY if you actually fetched this source this session."
|
|
128
|
+
},
|
|
129
|
+
"design_implication": {
|
|
130
|
+
"type": "string",
|
|
131
|
+
"description": "One sentence: implication for dispatch.lock.json."
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
},
|
|
136
|
+
"notes": {
|
|
137
|
+
"type": "string",
|
|
138
|
+
"description": "Coverage gaps, sources you could not fetch, dropped claims."
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
},
|
|
142
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
143
|
+
"output_sha256": "sha256-ymPE5lJqoygN0MpzqftgE1tYlyp7z+poCK2Z1Zj3mi8="
|
|
144
|
+
},
|
|
145
|
+
{
|
|
146
|
+
"question_id": "Q3-provenance-attestation",
|
|
147
|
+
"resolved_model": "claude-opus-4-8",
|
|
148
|
+
"prompt": "You are a research agent in a STUDY-SWARM (the research-grounded-advisor protocol) grounding the design of a new feature for the open-source repo `dogfood-lab/study-swarm`.\n\nTHE FEATURE — `dispatch.lock.json`: a per-dispatch lockfile that makes a study-swarm research dispatch BYTE-REPLAYABLE by pinning, per step:\n- the RESOLVED model id each research agent actually ran on (e.g. claude-opus-4-8, never an alias like \"opus\"),\n- the SHA-256 of the byte-exact agent prompt,\n- the SHA-256 of the tool JSONSchemas the agent had,\n- the external-verifier run/receipt id (e.g. a prism Ed25519 receipt id) and the receipt chain hash,\n- plus a top-level `lock_sha256` rollup over the whole lock (its content-address).\nThis implements the PIN_PER_STEP workflow standard (heritage: Snakemake 2012, Pegasus 2001).\n\nIMPLEMENTATION CONSTRAINTS (these shape which evidence is useful):\n- The CLI is ZERO-DEPENDENCY, NETWORK-FREE, DETERMINISTIC: SHA-256 via node:crypto, JSON I/O only. It makes NO model calls. The ORCHESTRATION HARNESS supplies the resolved models + byte-exact prompts + verifier run_id; the CLI only canonicalizes + hashes + validates them, and `lock --verify` re-derives the deterministic hashes and FAILS (exit 1) on drift.\n- Honest ceiling: pinning model+prompt+temp does NOT give bit-identical LLM outputs. The lock pins INPUTS byte-exact + records OUTPUT hashes for DRIFT DETECTION — \"replayable inputs + drift-detectable outputs\", NOT \"deterministic replay\".\n\nYOUR JOB: gather SPECIFIC, CITED, RETRIEVED evidence to answer ONE question. HARD RULES:\n- GROUND AT GENERATION TIME: use WebSearch and WebFetch to ACTUALLY RETRIEVE every source you cite THIS session. Cite ONLY sources you actually fetched. 
A claim you cannot ground in a fetched source is DROPPED, not invented.\n- Every finding needs: a one-sentence claim in your own words that MATCHES what the source actually says (do NOT overstate); author(s)/org; year; a RESOLVABLE identifier (arXiv:NNNN.NNNNN, a DOI, an RFC number, or a direct URL to the spec/paper — not a blog summary); the resolvable URL; whether you retrieved it; and a one-sentence DESIGN IMPLICATION for `dispatch.lock.json`.\n- Prefer specificity over breadth: 6-8 well-sourced, RETRIEVED findings beat 20 vague gestures. ~500-600 words of substance.\n- Set retrieved=false for anything you could not actually fetch — those will be dropped.\n\n========\nYOUR QUESTION (Q3-provenance-attestation):\nQUESTION: How do software supply-chain frameworks capture STEP-LEVEL provenance, and which parts map to pinning \"model + prompt + tool-schema + verifier receipt\" for one dispatch step?\nInvestigate (retrieve the actual papers/specs): in-toto (Torres-Arias, Afzali, Kuppusamy, Curtmola & Cappos 2019, USENIX Security — the link metadata + layout model); SLSA (the OpenSSF SLSA provenance levels + the provenance predicate schema); Sigstore (Newman, Meyers et al. 2022, ACM CCS DOI:10.1145/3548606.3560596 — keyless signing + Rekor transparency log) and the verifiability-vs-anti-forgery distinction; SCITT and/or C2PA if relevant; and W3C PROV / research-object reproducibility lineage. For each: what a step attestation records (materials/inputs, the step command/predicate, products/outputs, the actor/environment), how steps are chained, and what \"verifiable but not unforgeable\" means for an ephemeral local signing key. Map each to the per-step record SHAPE of `dispatch.lock.json` and to the \"harness EMITS the record, CLI CANONICALIZES+HASHES+VALIDATES it\" separation.\n\nReturn structured findings. Remember: retrieve-then-cite; drop what you cannot fetch.",
|
|
149
|
+
"tool_schema": {
|
|
150
|
+
"type": "object",
|
|
151
|
+
"additionalProperties": false,
|
|
152
|
+
"required": [
|
|
153
|
+
"question_id",
|
|
154
|
+
"findings",
|
|
155
|
+
"notes"
|
|
156
|
+
],
|
|
157
|
+
"properties": {
|
|
158
|
+
"question_id": {
|
|
159
|
+
"type": "string"
|
|
160
|
+
},
|
|
161
|
+
"findings": {
|
|
162
|
+
"type": "array",
|
|
163
|
+
"items": {
|
|
164
|
+
"type": "object",
|
|
165
|
+
"additionalProperties": false,
|
|
166
|
+
"required": [
|
|
167
|
+
"claim",
|
|
168
|
+
"authors",
|
|
169
|
+
"year",
|
|
170
|
+
"identifier",
|
|
171
|
+
"url",
|
|
172
|
+
"retrieved",
|
|
173
|
+
"design_implication"
|
|
174
|
+
],
|
|
175
|
+
"properties": {
|
|
176
|
+
"claim": {
|
|
177
|
+
"type": "string",
|
|
178
|
+
"description": "One-sentence finding in your own words, matching the source (do not overstate)."
|
|
179
|
+
},
|
|
180
|
+
"authors": {
|
|
181
|
+
"type": "string",
|
|
182
|
+
"description": "Author(s) or org, e.g. \"Koster & Rahmann\" or \"Rundgren et al.\" or \"OpenSSF\"."
|
|
183
|
+
},
|
|
184
|
+
"year": {
|
|
185
|
+
"type": "string"
|
|
186
|
+
},
|
|
187
|
+
"identifier": {
|
|
188
|
+
"type": "string",
|
|
189
|
+
"description": "arXiv:NNNN.NNNNN, a DOI (10.xxxx/...), an RFC number, or a direct URL."
|
|
190
|
+
},
|
|
191
|
+
"url": {
|
|
192
|
+
"type": "string",
|
|
193
|
+
"description": "A resolvable URL the existence oracle can fetch."
|
|
194
|
+
},
|
|
195
|
+
"retrieved": {
|
|
196
|
+
"type": "boolean",
|
|
197
|
+
"description": "true ONLY if you actually fetched this source this session."
|
|
198
|
+
},
|
|
199
|
+
"design_implication": {
|
|
200
|
+
"type": "string",
|
|
201
|
+
"description": "One sentence: implication for dispatch.lock.json."
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
},
|
|
206
|
+
"notes": {
|
|
207
|
+
"type": "string",
|
|
208
|
+
"description": "Coverage gaps, sources you could not fetch, dropped claims."
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
},
|
|
212
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
213
|
+
"output_sha256": "sha256-OSUAIhytytKihfFM1Y+p1BllUSTC+KEfb/NX86X+Kc0="
|
|
214
|
+
},
|
|
215
|
+
{
|
|
216
|
+
"question_id": "Q4-llm-determinism",
|
|
217
|
+
"resolved_model": "claude-opus-4-8",
|
|
218
|
+
"prompt": "You are a research agent in a STUDY-SWARM (the research-grounded-advisor protocol) grounding the design of a new feature for the open-source repo `dogfood-lab/study-swarm`.\n\nTHE FEATURE — `dispatch.lock.json`: a per-dispatch lockfile that makes a study-swarm research dispatch BYTE-REPLAYABLE by pinning, per step:\n- the RESOLVED model id each research agent actually ran on (e.g. claude-opus-4-8, never an alias like \"opus\"),\n- the SHA-256 of the byte-exact agent prompt,\n- the SHA-256 of the tool JSONSchemas the agent had,\n- the external-verifier run/receipt id (e.g. a prism Ed25519 receipt id) and the receipt chain hash,\n- plus a top-level `lock_sha256` rollup over the whole lock (its content-address).\nThis implements the PIN_PER_STEP workflow standard (heritage: Snakemake 2012, Pegasus 2001).\n\nIMPLEMENTATION CONSTRAINTS (these shape which evidence is useful):\n- The CLI is ZERO-DEPENDENCY, NETWORK-FREE, DETERMINISTIC: SHA-256 via node:crypto, JSON I/O only. It makes NO model calls. The ORCHESTRATION HARNESS supplies the resolved models + byte-exact prompts + verifier run_id; the CLI only canonicalizes + hashes + validates them, and `lock --verify` re-derives the deterministic hashes and FAILS (exit 1) on drift.\n- Honest ceiling: pinning model+prompt+temp does NOT give bit-identical LLM outputs. The lock pins INPUTS byte-exact + records OUTPUT hashes for DRIFT DETECTION — \"replayable inputs + drift-detectable outputs\", NOT \"deterministic replay\".\n\nYOUR JOB: gather SPECIFIC, CITED, RETRIEVED evidence to answer ONE question. HARD RULES:\n- GROUND AT GENERATION TIME: use WebSearch and WebFetch to ACTUALLY RETRIEVE every source you cite THIS session. Cite ONLY sources you actually fetched. 
A claim you cannot ground in a fetched source is DROPPED, not invented.\n- Every finding needs: a one-sentence claim in your own words that MATCHES what the source actually says (do NOT overstate); author(s)/org; year; a RESOLVABLE identifier (arXiv:NNNN.NNNNN, a DOI, an RFC number, or a direct URL to the spec/paper — not a blog summary); the resolvable URL; whether you retrieved it; and a one-sentence DESIGN IMPLICATION for `dispatch.lock.json`.\n- Prefer specificity over breadth: 6-8 well-sourced, RETRIEVED findings beat 20 vague gestures. ~500-600 words of substance.\n- Set retrieved=false for anything you could not actually fetch — those will be dropped.\n\n========\nYOUR QUESTION (Q4-llm-determinism):\nQUESTION: Can pinning model + prompt + temperature (+ seed) yield reproducible LLM OUTPUTS, or only reproducible INPUTS? Find the strongest EMPIRICAL evidence.\nInvestigate (retrieve the actual sources): nondeterminism even at temperature 0 / fixed seed from floating-point non-associativity + GPU kernel/reduction order + BATCH-SIZE / batching effects (Thinking Machines Lab — He et al. 2025, \"Defeating Nondeterminism in LLM Inference\", thinkingmachines.ai; and the batch-invariant-kernels / vLLM work); provider-side SILENT MODEL DRIFT over time (Chen, Zaharia & Zou 2023, \"How Is ChatGPT's Behavior Changing over Time?\", arXiv:2307.09009); MoE/expert-routing nondeterminism; and any work quantifying output variance under fixed decoding params (e.g. reproducibility-of-LLM-evaluations papers, Atil et al. or similar). This finding JUSTIFIES the honest-ceiling claim: pin INPUTS byte-exact + record OUTPUT hashes for DRIFT DETECTION; do NOT claim \"deterministic replay\". Give the strongest citations for exactly that framing.\n\nReturn structured findings. Remember: retrieve-then-cite; drop what you cannot fetch.",
|
|
219
|
+
"tool_schema": {
|
|
220
|
+
"type": "object",
|
|
221
|
+
"additionalProperties": false,
|
|
222
|
+
"required": [
|
|
223
|
+
"question_id",
|
|
224
|
+
"findings",
|
|
225
|
+
"notes"
|
|
226
|
+
],
|
|
227
|
+
"properties": {
|
|
228
|
+
"question_id": {
|
|
229
|
+
"type": "string"
|
|
230
|
+
},
|
|
231
|
+
"findings": {
|
|
232
|
+
"type": "array",
|
|
233
|
+
"items": {
|
|
234
|
+
"type": "object",
|
|
235
|
+
"additionalProperties": false,
|
|
236
|
+
"required": [
|
|
237
|
+
"claim",
|
|
238
|
+
"authors",
|
|
239
|
+
"year",
|
|
240
|
+
"identifier",
|
|
241
|
+
"url",
|
|
242
|
+
"retrieved",
|
|
243
|
+
"design_implication"
|
|
244
|
+
],
|
|
245
|
+
"properties": {
|
|
246
|
+
"claim": {
|
|
247
|
+
"type": "string",
|
|
248
|
+
"description": "One-sentence finding in your own words, matching the source (do not overstate)."
|
|
249
|
+
},
|
|
250
|
+
"authors": {
|
|
251
|
+
"type": "string",
|
|
252
|
+
"description": "Author(s) or org, e.g. \"Koster & Rahmann\" or \"Rundgren et al.\" or \"OpenSSF\"."
|
|
253
|
+
},
|
|
254
|
+
"year": {
|
|
255
|
+
"type": "string"
|
|
256
|
+
},
|
|
257
|
+
"identifier": {
|
|
258
|
+
"type": "string",
|
|
259
|
+
"description": "arXiv:NNNN.NNNNN, a DOI (10.xxxx/...), an RFC number, or a direct URL."
|
|
260
|
+
},
|
|
261
|
+
"url": {
|
|
262
|
+
"type": "string",
|
|
263
|
+
"description": "A resolvable URL the existence oracle can fetch."
|
|
264
|
+
},
|
|
265
|
+
"retrieved": {
|
|
266
|
+
"type": "boolean",
|
|
267
|
+
"description": "true ONLY if you actually fetched this source this session."
|
|
268
|
+
},
|
|
269
|
+
"design_implication": {
|
|
270
|
+
"type": "string",
|
|
271
|
+
"description": "One sentence: implication for dispatch.lock.json."
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
},
|
|
276
|
+
"notes": {
|
|
277
|
+
"type": "string",
|
|
278
|
+
"description": "Coverage gaps, sources you could not fetch, dropped claims."
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
},
|
|
282
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
283
|
+
"output_sha256": "sha256-aFDm9p4/94p97NJg/vWKbYXchv6N1b0swpIfYoep1iw="
|
|
284
|
+
},
|
|
285
|
+
{
|
|
286
|
+
"question_id": "Q5-tool-schema-drift",
|
|
287
|
+
"resolved_model": "claude-opus-4-8",
|
|
288
|
+
"prompt": "You are a research agent in a STUDY-SWARM (the research-grounded-advisor protocol) grounding the design of a new feature for the open-source repo `dogfood-lab/study-swarm`.\n\nTHE FEATURE — `dispatch.lock.json`: a per-dispatch lockfile that makes a study-swarm research dispatch BYTE-REPLAYABLE by pinning, per step:\n- the RESOLVED model id each research agent actually ran on (e.g. claude-opus-4-8, never an alias like \"opus\"),\n- the SHA-256 of the byte-exact agent prompt,\n- the SHA-256 of the tool JSONSchemas the agent had,\n- the external-verifier run/receipt id (e.g. a prism Ed25519 receipt id) and the receipt chain hash,\n- plus a top-level `lock_sha256` rollup over the whole lock (its content-address).\nThis implements the PIN_PER_STEP workflow standard (heritage: Snakemake 2012, Pegasus 2001).\n\nIMPLEMENTATION CONSTRAINTS (these shape which evidence is useful):\n- The CLI is ZERO-DEPENDENCY, NETWORK-FREE, DETERMINISTIC: SHA-256 via node:crypto, JSON I/O only. It makes NO model calls. The ORCHESTRATION HARNESS supplies the resolved models + byte-exact prompts + verifier run_id; the CLI only canonicalizes + hashes + validates them, and `lock --verify` re-derives the deterministic hashes and FAILS (exit 1) on drift.\n- Honest ceiling: pinning model+prompt+temp does NOT give bit-identical LLM outputs. The lock pins INPUTS byte-exact + records OUTPUT hashes for DRIFT DETECTION — \"replayable inputs + drift-detectable outputs\", NOT \"deterministic replay\".\n\nYOUR JOB: gather SPECIFIC, CITED, RETRIEVED evidence to answer ONE question. HARD RULES:\n- GROUND AT GENERATION TIME: use WebSearch and WebFetch to ACTUALLY RETRIEVE every source you cite THIS session. Cite ONLY sources you actually fetched. 
A claim you cannot ground in a fetched source is DROPPED, not invented.\n- Every finding needs: a one-sentence claim in your own words that MATCHES what the source actually says (do NOT overstate); author(s)/org; year; a RESOLVABLE identifier (arXiv:NNNN.NNNNN, a DOI, an RFC number, or a direct URL to the spec/paper — not a blog summary); the resolvable URL; whether you retrieved it; and a one-sentence DESIGN IMPLICATION for `dispatch.lock.json`.\n- Prefer specificity over breadth: 6-8 well-sourced, RETRIEVED findings beat 20 vague gestures. ~500-600 words of substance.\n- Set retrieved=false for anything you could not actually fetch — those will be dropped.\n\n========\nYOUR QUESTION (Q5-tool-schema-drift):\nQUESTION: How do LLM agent frameworks and tool/function-calling systems pin or version the TOOL/FUNCTION schemas an agent had, so a replay with the same prompt but a CHANGED tool surface is DETECTED? (This is the half PIN explicitly flags as missing.)\nInvestigate (retrieve the actual docs/specs/papers): OpenAI & Anthropic function-calling / tool-use schema definitions (JSON Schema for tool parameters); the Model Context Protocol (MCP) tool definition format and any capability negotiation / version field (modelcontextprotocol.io spec); JSON Schema canonicalization/hashing for API-contract drift; OpenAPI + Pact consumer-driven contract testing and \"schema drift\" detection in API tooling; and any agent-reproducibility / agent-eval work that captures the TOOL ENVIRONMENT as part of a run record. For each: how the tool surface is represented and how a change is surfaced as a failure. Map each to the `tool_schema_sha256` field — exactly WHAT to hash (the canonicalized tool JSONSchemas the agent was given) and HOW a changed tool surface surfaces as a `lock --verify` drift failure.\n\nReturn structured findings. Remember: retrieve-then-cite; drop what you cannot fetch.",
|
|
289
|
+
"tool_schema": {
|
|
290
|
+
"type": "object",
|
|
291
|
+
"additionalProperties": false,
|
|
292
|
+
"required": [
|
|
293
|
+
"question_id",
|
|
294
|
+
"findings",
|
|
295
|
+
"notes"
|
|
296
|
+
],
|
|
297
|
+
"properties": {
|
|
298
|
+
"question_id": {
|
|
299
|
+
"type": "string"
|
|
300
|
+
},
|
|
301
|
+
"findings": {
|
|
302
|
+
"type": "array",
|
|
303
|
+
"items": {
|
|
304
|
+
"type": "object",
|
|
305
|
+
"additionalProperties": false,
|
|
306
|
+
"required": [
|
|
307
|
+
"claim",
|
|
308
|
+
"authors",
|
|
309
|
+
"year",
|
|
310
|
+
"identifier",
|
|
311
|
+
"url",
|
|
312
|
+
"retrieved",
|
|
313
|
+
"design_implication"
|
|
314
|
+
],
|
|
315
|
+
"properties": {
|
|
316
|
+
"claim": {
|
|
317
|
+
"type": "string",
|
|
318
|
+
"description": "One-sentence finding in your own words, matching the source (do not overstate)."
|
|
319
|
+
},
|
|
320
|
+
"authors": {
|
|
321
|
+
"type": "string",
|
|
322
|
+
"description": "Author(s) or org, e.g. \"Koster & Rahmann\" or \"Rundgren et al.\" or \"OpenSSF\"."
|
|
323
|
+
},
|
|
324
|
+
"year": {
|
|
325
|
+
"type": "string"
|
|
326
|
+
},
|
|
327
|
+
"identifier": {
|
|
328
|
+
"type": "string",
|
|
329
|
+
"description": "arXiv:NNNN.NNNNN, a DOI (10.xxxx/...), an RFC number, or a direct URL."
|
|
330
|
+
},
|
|
331
|
+
"url": {
|
|
332
|
+
"type": "string",
|
|
333
|
+
"description": "A resolvable URL the existence oracle can fetch."
|
|
334
|
+
},
|
|
335
|
+
"retrieved": {
|
|
336
|
+
"type": "boolean",
|
|
337
|
+
"description": "true ONLY if you actually fetched this source this session."
|
|
338
|
+
},
|
|
339
|
+
"design_implication": {
|
|
340
|
+
"type": "string",
|
|
341
|
+
"description": "One sentence: implication for dispatch.lock.json."
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
},
|
|
346
|
+
"notes": {
|
|
347
|
+
"type": "string",
|
|
348
|
+
"description": "Coverage gaps, sources you could not fetch, dropped claims."
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
},
|
|
352
|
+
"schema_dialect": "https://json-schema.org/draft/2020-12/schema",
|
|
353
|
+
"output_sha256": "sha256-Y1lsUdKoplgdos0r4faN6UgxXX74D/Se0AVHCiUA9dM="
|
|
354
|
+
}
|
|
355
|
+
],
|
|
356
|
+
"verification": {
|
|
357
|
+
"runner": "roleos verify-citations",
|
|
358
|
+
"runner_source": "role-os local clone E:/AI/role-os",
|
|
359
|
+
"tool": "prism verify --type citations",
|
|
360
|
+
"tool_version": "prism 1.6.0",
|
|
361
|
+
"verifier_model": "mistral-small:24b",
|
|
362
|
+
"verifier_family": "local",
|
|
363
|
+
"caller_family_excluded": "anthropic",
|
|
364
|
+
"verdict": "escalate",
|
|
365
|
+
"receipt_id": "prism-01kwbajx31dj9gcf5xn3cn5ydg",
|
|
366
|
+
"receipt_signature": "272c892124e3bc13a76b2674fa361b1d65aee6a588c74604cf4ae4e7c9440a8ba7888175b9ec1286fe87490121f694f64cd30adc3ffc0e1b31cd3365b7b38901",
|
|
367
|
+
"receipt_chain_sha256": "499b63905064a5e25fd1801c5530504c94742f2183c4d3c8eb545a20cfbb112e"
|
|
368
|
+
}
|
|
369
|
+
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
<!-- study-swarm v1.1.0 · protocol-sha256:4479e7d2d758f42a · created:2026-06-29 -->
|
|
2
|
+
# Study-swarm dispatch: study-swarm-v1_1 (the protocol run on itself)
|
|
3
|
+
|
|
4
|
+
> **Meta-dispatch.** study-swarm v1.0.0 grounds its *central* decision (different-family verifier
|
|
5
|
+
> + retrieval oracle + ensemble diversity) in 6 citations. This dispatch runs the protocol on the
|
|
6
|
+
> **four design questions v1.0.0 leaves answered by "I think…", not "evidence says…"** — the v1.1 surface.
|
|
7
|
+
> Every citation below was gated through Step 4 (retrieval oracle for existence + two different-family
|
|
8
|
+
> groundedness lenses, reasoning-stripped) **before** it informed the architecture. The synthesizer is
|
|
9
|
+
> Claude/Opus; the verifier families are Mistral + IBM Granite + the deterministic arXiv oracle — none of
|
|
10
|
+
> them Claude. Run `study-swarm lint study-swarm-v1_1.dispatch.md` (it passes).
|
|
11
|
+
|
|
12
|
+
## Step 1 — Load-bearing questions
|
|
13
|
+
|
|
14
|
+
Each passes the v1.0.0 test (two real designs hinge on the answer; an adjacent field has measured it; the current spec is silent or hand-wavy):
|
|
15
|
+
|
|
16
|
+
- **A — Groundedness mechanism.** Step 4 stage-2 says only "an NLI-style support check." Should it check the finding sentence *whole*, or **decompose** it into atomic/molecular claims and check each?
|
|
17
|
+
- **B — Generation-time grounding.** Step 2 just "asks for URLs" and catches fabrication downstream by *dropping* findings. Should Step 2 instead **force retrieval at generation time**, or is post-hoc Step-4 verification enough?
|
|
18
|
+
- **C — Aggregation rule.** v1.0.0 mandates "≥3 decorrelated lenses, diversity > count" but never says how to **combine** their verdicts. Disjunction? Majority? An oracle-gated cascade? (The published proof showed union catches traps *but* LLMs false-flag real recent papers.)
|
|
19
|
+
- **D — Calibrated abstention.** The halt table has a hard `CANNOT_CONFIRM` category. Should verdicts instead carry **calibrated confidence** with a tuned **abstention** threshold?
|
|
20
|
+
|
|
21
|
+
## Step 2 — Research dispatch
|
|
22
|
+
|
|
23
|
+
Four parallel research agents (one per question), retrieval-required — a paper an agent could not fetch did not enter the dispatch. **Process note (an ANDON receipt):** lanes C and D first returned schema-valid *placeholder stubs* despite heavy retrieval (19–20 tool calls each); per the protocol's own "a research lane returning placeholders halts that lane" rule, both were **discarded and re-dispatched** with an anti-stub guard (C succeeded; D crashed the output schema and was re-run as a plain-text agent). This is finding **B** happening to the dispatch itself — generation succeeded but emission lost it; a coverage-recovery pass recovered it.
|
|
24
|
+
|
|
25
|
+
## Step 3 — Research grounding
|
|
26
|
+
|
|
27
|
+
<!-- Every finding: author + year + resolvable arXiv id, one-sentence finding, design implication. All gated by Step 4 below before Step 5. Findings are phrased to what the retrieved source supports; precise figures that live in a paper's body but not its abstract were softened to the abstract-grounded claim during Step 4 (noted there). -->
|
|
28
|
+
|
|
29
|
+
1. **(A) Breaking text into atomic facts and scoring the fraction supported exposes partial-support failures a whole-sentence judgment masks — ChatGPT biographies score only ~58%, and an automated estimator tracks human scoring within ~2%.** Min et al. 2023 (arXiv:2305.14251). Implication: stage-2 should score *fraction-of-claims-supported*, because the danger case (a real paper whose finding sentence overstates it) is invisible whole-sentence but surfaces as one unsupported atomic claim.
|
|
30
|
+
2. **(A) Citation support is a ternary state (fully / partially / not), and even the best systems lack complete support ~50% of the time on ELI5 — partial support is the dominant, hardest-to-catch state.** Gao et al. 2023 (arXiv:2305.14627). Implication: the stage-2 verdict space must be ternary, not binary; a "partially supported" finding routes to correct-once/escalate, never auto-pass.
|
|
31
|
+
3. **(A) Automatic attribution evaluation has a hard ceiling — a fine-tuned GPT-3.5 reaches only ~80% macro-F1, and the majority of its errors come from insensitivity to fine-grained information.** Li et al. 2024 (arXiv:2402.15089). Implication: no single LLM judge is a reliable groundedness oracle — this independently re-justifies the ≥3-lens ensemble and an abstain-on-nuance rule.
|
|
32
|
+
4. **(A) Decompose-then-verify scores are sensitive to the decomposition method itself, so the metric must not attribute decomposition error to the text.** Wanner et al. 2024 (arXiv:2403.11903). Implication: pin the decomposer per run (PIN_PER_STEP) and do not score by raw subclaim count.
|
|
33
|
+
5. **(A) Decomposition scores can be inflated by padding with obvious/repetitive subclaims; filtering subclaims by informativeness/uniqueness makes precision substantially more robust.** Jiang et al. 2024 (arXiv:2407.03572). Implication: stage-2 needs an informativeness filter so only the load-bearing claim in a finding gates the verdict — boilerplate earns no support credit, blocking pad-to-pass.
|
|
34
|
+
6. **(A) Decontextualizing atomic claims before verification raises accuracy and almost never flips a true claim to false, so it safely rescues claims naive decomposition would wrongly drop.** Wanner et al. 2024 (arXiv:2412.13175). Implication: decontextualize each claim (resolve referents) before NLI-checking; the near-zero true→false rate makes this safe by default.
|
|
35
|
+
7. **(A) "Molecular" facts — decontextualized + minimal — verify more accurately than fully atomic claims, while over-decontextualization loses error-localizing information.** Gunjal & Durrett 2024 (arXiv:2406.20079). Implication: target *molecular* granularity, not maximal atomicity — the concrete spec for stage-2 (neither whole-sentence nor over-shredded).
|
|
36
|
+
8. **(B) Fine-tuning a model to browse and collect references during generation makes its answers human-checkable and preferred to demonstrator and reference answers.** Nakano et al. 2021 (arXiv:2112.09332). Implication: Step 2 should run agents in a browse-then-cite loop, citing only fetched sources, so each citation is attributable at generation time.
|
|
37
|
+
9. **(B) Training a model to attach supporting evidence per claim and abstain when unsure raises supported-answer rates — but adversarial evaluation shows evidence-backed claims can still be false.** Menick et al. 2022 (arXiv:2203.11147). Implication: keep BOTH a generation-time grounding step AND the Step-4 gate; add abstention to Step 2 (an agent that cannot ground a claim drops it).
|
|
38
|
+
10. **(B) An inline retrieve-and-self-critique loop cuts off-source (ungrounded) generation by roughly an order of magnitude versus comparable instruction-tuned models.** Asai et al. 2023 (arXiv:2310.11511). Implication: a lightweight in-Step-2 "is this in a fetched source?" check eliminates the bulk of fabrication before the gate runs, leaving Step 4 the residual.
|
|
39
|
+
11. **(B) Parametric models fail badly on fast-changing knowledge while search augmentation substantially improves correctness, and both the number and ordering of retrieved evidences matter.** Vu et al. 2023 (arXiv:2310.03214). Implication: for a fast-moving field, recall of recent papers is the worst case — Step 2 must force live retrieval of multiple sources, not the first hit.
|
|
40
|
+
12. **(B) Comparing generation-time vs post-hoc citation, retrieval is the main driver of quality in both, and there is a consistent trade-off: generation-time maximizes precision at the cost of coverage; post-hoc achieves higher coverage at competitive correctness.** Saxena et al. 2025 (arXiv:2509.21557). Implication: do not pick one axis — generation-time grounding floor + post-hoc groundedness ceiling + an explicit coverage-recovery sweep so true-but-hard-to-retrieve findings aren't silently dropped.
|
|
41
|
+
13. **(B) Auditing real LLM/agent citations, a meaningful fraction of URLs are fully hallucinated and more are non-resolving, and citation-heavy "deep research" agents hallucinate at higher rates.** Rao et al. 2026 (arXiv:2604.03173). Implication: keep the deterministic existence oracle even under generation-time grounding (don't trust an agent's claim it fetched a source), and treat citation-heavy agents as higher-risk.
|
|
42
|
+
14. **(B) A retrieval-grounded citation verifier reaches ~89 macro-F1 detecting hallucinated/corrupted citations and outperforms strong web-search LLM baselines, while a reasoning-only judge tops out far lower.** Khajavi et al. 2026 (arXiv:2605.27700). Implication: both the generation step AND the verifier lens must have live source access — a reasoning-only (memory) judge is the weakest configuration.
|
|
43
|
+
15. **(C) A 9-judge panel across 7 model families provides only ~2 effective independent votes, and no aggregation algorithm fixes this because the bottleneck is correlated inputs, not the algorithm.** Kohli 2026 (arXiv:2605.29800). Implication: more LLM lenses cannot fix correlated false-flagging of recent papers — the deterministic oracle is load-bearing precisely because it's the one genuinely decorrelated, non-LLM lens; never stack same-family lenses expecting reliability.
|
|
44
|
+
16. **(C) LLM validators have an agreeableness bias (high true-positive but very low true-negative rate); a tuned minority-veto beats both majority voting and raw disjunction at catching invalid items while bounding over-rejection.** Jain et al. 2025 (arXiv:2510.11822). Implication: the groundedness vote should be a *minority-veto with a tuned threshold n* — the explicit knob trading trap-catch against false-flagging — not disjunction (over-rejects) or majority (misses single-lens catches).
|
|
45
|
+
17. **(C) A small human-labeled calibration set with bias correction beats adding more judges, halving maximum error.** Jain et al. 2025 (arXiv:2510.11822). Implication: maintain a small held-out set of labeled (real/fabricated/misattributed) citations and fit a lightweight bias-correction on the lenses' raw verdicts — cheaper and better than decorrelating yet more families.
|
|
46
|
+
18. **(C) Agreement-based cascading uses inter-model disagreement as the routing/escalation signal and beats single-model-confidence cascades.** Kolawole et al. 2024 (arXiv:2407.02348). Implication: treat lens *disagreement* (oracle confirms existence but groundedness lenses split, especially on a post-cutoff paper) as the trigger to escalate-rather-than-auto-reject — directly bounding over-rejection of genuine recent work.
|
|
47
|
+
19. **(C) LLM judges are systematically overconfident — verbalized confidence overstates accuracy — and a risk-aware confidence fusion makes them more reliable.** Tian et al. 2025 (arXiv:2508.06225). Implication: never trust a lens's raw verbalized confidence for aggregation; down-weight a confident "fabricated" flag on a recent paper relative to the oracle's existence verdict.
|
|
48
|
+
20. **(C) Linear probes on a judge's hidden states give better-calibrated uncertainty than verbalized confidence, with conservative estimates suited to low-false-positive settings.** Radharapu et al. 2025 (arXiv:2512.22245). Implication: where lens internals are reachable, let a lens *abstain* below a calibrated-confidence threshold instead of casting a likely-correlated wrong vote — converting a false-flag into a no-vote that lets the oracle carry existence.
|
|
49
|
+
21. **(C) Aggregators that assume independent judge errors (majority, averaging) gain little or amplify mistakes; explicitly modeling the shared confounder is more reliable.** Zhao et al. 2026 (arXiv:2603.00039). Implication: the aggregation rule must model the training-cutoff blind spot as a shared confounder and discount correlated "fabricated" votes when the un-confounded oracle confirms existence — formalizing why the cascade beats flat voting.
|
|
50
|
+
22. **(D) Training a model to emit "I don't know" as a first-class refusal yields better-calibrated uncertainty than post-hoc thresholding, and the refusal skill generalizes out-of-domain.** Zhang et al. 2023 (arXiv:2311.09677). Implication: keep `CANNOT_CONFIRM` a *first-class* verdict the verifier is instructed to produce — do not collapse the halt table to accept/reject plus a confidence cut.
|
|
51
|
+
23. **(D) Conformal uncertainty gives a finite-sample statistical guarantee on the correctness-coverage rate of the answered set across many models and free-form tasks while keeping prediction sets small.** Wang et al. 2024 (arXiv:2407.00499). Implication: tune the abstention threshold with conformal calibration so the *accepted* citation set carries a provable error bound (e.g. "≤5% of confirmed citations are wrong") with a tunable risk knob.
|
|
52
|
+
24. **(D) Conformal factuality "backs off" to less-specific output and abstains on uncertain sub-claims, giving 80–90% correctness guarantees while retaining most of the output.** Mohri & Hashimoto 2024 (arXiv:2402.10978). Implication: abstention need not be all-or-nothing per citation — partially confirm the supported molecular claims and escalate only the unconfirmable one, preserving coverage.
|
|
53
|
+
25. **(D) Entropy / raw confidence alone is insufficient for safe abstention because models are confidently wrong; combining it with an external correctness signal is required.** Phillips et al. 2026 (arXiv:2603.21172). Implication: gate abstention on *external evidence presence* (was the source fetched, does the retrieved text contain the claim) — not on the verifier's own entropy or verbalized confidence.
|
|
54
|
+
26. **(D) Trust-induced over-reliance is large, and always-on/non-adaptive explanations backfire — only trust-gated, selectively-surfaced counter-explanations reduce inappropriate reliance.** Srinivasan & Thomason 2025 (arXiv:2502.13321). Implication: surface `CANNOT_CONFIRM` *contrastively and selectively* ("I expected to find X and didn't"), never as an always-on confidence bar a human will rubber-stamp.
|
|
55
|
+
27. **(D) Refusal-aware tuning has a documented over-refusal failure mode that must be actively balanced against coverage.** Zhu et al. 2025 (arXiv:2502.05911). Implication: instrument and cap the abstain/escalation rate against a labeled holdout, and treat an abstain-rate spike as its own ANDON trigger — not a success.
|
|
56
|
+
|
|
57
|
+
## Step 4 — External verification
|
|
58
|
+
|
|
59
|
+
**Run against this dispatch's own 27 citations before Step 5 was written.** Synthesizer = Claude/Opus; verifier families = the deterministic arXiv oracle + Mistral (`mistral-small:24b`) + IBM Granite (`granite4.1:30b`), reasoning-stripped (lenses saw only the bare claim + the source title/abstract — never the implications or any synthesizer reasoning).
|
|
60
|
+
|
|
61
|
+
- [x] every citation resolved by retrieval (arXiv/DOI), not model memory — structured arXiv API
|
|
62
|
+
- [x] every finding matches what its source actually claims (groundedness) — two different families vs each abstract
|
|
63
|
+
- [x] >= 3 decorrelated lenses (retrieval oracle + >= 2 different model families) — arXiv oracle + Mistral + Granite
|
|
64
|
+
|
|
65
|
+
**Existence / attribution (retrieval oracle).** All **27/27** papers resolved with correct titles and years. **0 fabricated.** Five attribution corrections the oracle made that no parametric model could: CiteCheck authors `Anonymous → Khajavi et al.` (#14); DnDScore author list trimmed to Wanner, Van Durme & Dredze (#6); R-Tuning year `2024 → 2023` (#22); SConU year `2024 → 2025`; **GRAIT first author `Fang → Zhu` — a real misattribution the research agent flagged itself, corrected once (#27).**
|
|
66
|
+
|
|
67
|
+
**Postdated-paper check.** Six 2025–2026 papers (#12 Saxena, #13 Rao, #14 Khajavi, #15 Kohli, #21 Zhao, #25 Phillips) — which a parametric LLM would false-flag as fabricated — were all **oracle-confirmed real**. This is the existence-must-be-retrieval thesis, executed.
|
|
68
|
+
|
|
69
|
+
**Groundedness (two different-family lenses vs each abstract).** Core qualitative claims **SUPPORTED** by both lenses. Precise figures that live in a paper's *body* but not its *abstract* were correctly flagged PARTIAL/NOT by the lenses and **softened to the abstract-grounded claim** in Step 3 (e.g. #3's "66%" → "the majority"; #6's "33→51.6%"; #7's "74.7 vs 68.7"; #10's "2% vs 18–20%"; #16's veto magnitudes; #26's per-condition deltas). No finding was dropped; none was fabricated; none mis-first-authored after correction.
|
|
70
|
+
|
|
71
|
+
**The dispatch demonstrated its own findings, live:** (a) the lenses flagged exactly the *overstated-number* zone finding **A** is about; (b) Mistral returned a confident `NOT_SUPPORTED` on #21/CARE whose abstract is entirely about confounder modeling — a "confidently-wrong judge" (findings 19, 25) — while Granite was correct, and the **disagreement** triggered adjudication (finding 18); (c) both lenses under-credited material literally present in abstracts (#1 estimator, #9 TruthfulQA, #13 Wayback) — correlated lens noise (findings 15, 21) that only the deterministic oracle is immune to; (d) abstention fired on *evidence absence* (number-not-in-abstract), not model entropy — exactly finding **25**'s prescription. **No verifier was Claude; the protocol did not grade its own homework.**
|
|
72
|
+
|
|
73
|
+
## Step 5 — Architecture (study-swarm v1.1)
|
|
74
|
+
|
|
75
|
+
Each choice traces to findings by number.
|
|
76
|
+
|
|
77
|
+
- **A1 — Stage-2 becomes molecular-claim decomposition, not whole-sentence NLI.** Decompose each finding into *molecular* claims (decontextualized + minimal), informativeness-filter to the load-bearing claim, NLI-check each against the source, and score *fraction-supported*. (findings 1, 5, 6, 7)
|
|
78
|
+
- **A2 — The groundedness verdict is ternary.** Fully / partially / not supported; "partially supported" (the link resolves, the paper is real, the sentence overstates) routes to **correct-once or escalate**, never auto-pass. (findings 2, 3)
|
|
79
|
+
- **A3 — Pin the decomposer; don't score by subclaim count.** The verdict is sensitive to the decomposition method, so the decomposer prompt/model is pinned per run and padding earns no credit. (findings 4, 5)
|
|
80
|
+
- **B1 — Step 2 mandates retrieval-grounded generation.** Agents browse-then-cite, cite only fetched sources, and *drop* (not invent) a claim they cannot ground — a lightweight in-loop "is this in a fetched source?" check. (findings 8, 9, 10, 11, 14)
|
|
81
|
+
- **B2 — …but keep the Step-4 gate and add coverage recovery.** Generation-time grounding maximizes precision at the cost of coverage, so a post-hoc sweep recovers true-but-hard-to-retrieve findings, and the deterministic existence oracle stays even under generation-time grounding. (findings 9, 12, 13)
|
|
82
|
+
- **C1 — The aggregation rule is the cascade.** Existence is gated **authoritatively by the deterministic oracle** (no LLM vote — the only genuinely decorrelated lens); groundedness uses the LLM lenses only. (findings 15, 21)
|
|
83
|
+
- **C2 — Groundedness uses a tuned minority-veto, not disjunction or majority.** The veto threshold `n` is the explicit knob trading trap-catch against over-rejection; a small labeled calibration set + bias correction beats adding lenses. (findings 16, 17)
|
|
84
|
+
- **C3 — Lens disagreement escalates; it never auto-rejects.** When the oracle confirms existence but the groundedness lenses split — especially on a post-cutoff paper — the dispatch escalates to a human rather than rejecting genuine recent work, and confident "fabricated" flags on recent papers are down-weighted. (findings 18, 19, 20, 21)
|
|
85
|
+
- **D1 — `CANNOT_CONFIRM` stays a first-class verdict.** It is *not* collapsed into accept/reject + a confidence cut; the verifier is instructed to produce it. (finding 22)
|
|
86
|
+
- **D2 — Abstention is conformally calibrated and evidence-gated.** The threshold is tuned for a provable accepted-set error bound; abstention triggers on **external evidence absence**, never the verifier's own entropy/verbalized confidence; partial confirmation preserves the supported claims. (findings 23, 24, 25)
|
|
87
|
+
- **D3 — Surface contrastively, and cap the abstain rate as an ANDON.** `CANNOT_CONFIRM` is shown contrastively/selectively (not an always-on confidence bar), and an abstain-rate spike on a labeled holdout is itself a halt signal, not a success. (findings 26, 27)
|
|
88
|
+
|
|
89
|
+
**Net:** the verifier-protected envelope is unchanged in spirit but is now specified where v1.0.0 was silent — *how* groundedness is checked (molecular decomposition, ternary), *how* the lenses are combined (oracle-gated cascade + minority-veto + disagreement-escalation), *when* the research step grounds (generation-time floor + coverage recovery), and *how* abstention is calibrated and surfaced (first-class, conformal, evidence-gated, capped). Every one of these was both retrieval-verified above and demonstrated on this very dispatch.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dogfood-lab/study-swarm",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "Ground design decisions in cited research, then verify every citation with a different model family before it becomes canon — a research-grounded design protocol, with a thin CLI.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"methodology",
|