@dogfood-lab/study-swarm 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,280 @@
1
+ #!/usr/bin/env node
2
+ // study-swarm — thin CLI for the research-grounded design protocol.
3
+ // Zero runtime dependencies. Commands: protocol | new | lint | help | version.
4
+ import { readFileSync, writeFileSync, existsSync, statSync, readdirSync } from 'node:fs';
5
+ import { fileURLToPath } from 'node:url';
6
+ import { dirname, resolve, join } from 'node:path';
7
+ import { createHash } from 'node:crypto';
8
+
9
+ const __dirname = dirname(fileURLToPath(import.meta.url));
10
+ const PKG = JSON.parse(readFileSync(resolve(__dirname, '../package.json'), 'utf8'));
11
+ const VERSION = PKG.version;
12
+ const PROTOCOL_PATH = resolve(__dirname, '../PROTOCOL.md');
13
+
14
+ const HELP = `study-swarm v${VERSION} — ground design decisions in cited research, then verify.
15
+
16
+ USAGE
17
+ study-swarm <command> [args]
18
+
19
+ COMMANDS
20
+ protocol Print the locked protocol (the five steps + halt rules).
21
+ new <slug> Scaffold a dispatch file <slug>.dispatch.md to fill in.
22
+ lint [--json] <path...> Check dispatches' citations against the sourcing standard.
23
+ A <path> may be a file, a directory (linted recursively for
24
+ *.dispatch.md), or "-" to read one dispatch from stdin.
25
+ help Show this help.
26
+ version Print the version.
27
+
28
+ EXIT CODES
29
+ 0 ok / lint clean
30
+ 1 lint found sourcing violations
31
+ 2 usage or runtime error
32
+
33
+ NOTE
34
+ lint checks citation FORM (Step 3: author + year + a resolvable arXiv/DOI/URL,
35
+ no "studies show…" gestures) — it does not judge whether a source is legitimate
36
+ or actually supports the claim. That is Step 4, below.
37
+
38
+ Run a dispatch's model-based verification with: roleos verify-citations <file>
39
+ Docs: https://dogfood-lab.github.io/study-swarm/
40
+ `;
41
+
/**
 * Report a fatal error on stderr and terminate the process.
 *
 * @param {number} code - Exit status handed to process.exit.
 * @param {string} msg - Text printed after the "study-swarm: " prefix.
 */
function fail(code, msg) {
  const line = 'study-swarm: '.concat(msg, '\n');
  process.stderr.write(line);
  process.exit(code);
}
46
+
/**
 * Fingerprint the vendored PROTOCOL.md so a scaffolded dispatch can record which
 * methodology text it was authored against.
 *
 * @returns {string} First 16 hex characters of the file's SHA-256 digest, or the
 *   literal 'unknown' when anything in the computation throws.
 */
function protocolHash() {
  try {
    const bytes = readFileSync(PROTOCOL_PATH);
    const digest = createHash('sha256').update(bytes).digest('hex');
    return digest.slice(0, 16);
  } catch {
    // Best effort only: a missing/unreadable protocol file must not block scaffolding.
    return 'unknown';
  }
}
53
+
/**
 * `protocol` command: print the packaged PROTOCOL.md to stdout.
 * Exits with status 2 (via fail) when the file is missing or cannot be printed.
 */
function cmdProtocol() {
  if (!existsSync(PROTOCOL_PATH)) {
    fail(2, 'PROTOCOL.md not found in package');
  }
  try {
    const text = readFileSync(PROTOCOL_PATH, 'utf8');
    process.stdout.write(text);
  } catch (err) {
    // Prefer the short system error code (e.g. EACCES) when there is one.
    const reason = err && err.code ? err.code : err.message;
    fail(2, `cannot read PROTOCOL.md in package: ${reason}`);
  }
}
59
+
60
+ const template = (slug, stamp) => `<!-- ${stamp} -->
61
+ # Study-swarm dispatch: ${slug}
62
+
63
+ > Fill in each section. Verify citations (Step 4) BEFORE connecting findings to the design (Step 5).
64
+ > Lint the sourcing with: study-swarm lint ${slug}.dispatch.md
65
+
66
+ ## Step 1 — Load-bearing questions
67
+ <!-- 3-5 questions where empirical evidence would change the answer. Fewer is fine if the decision is substantial.
68
+ A question is load-bearing if you can picture two designs hinging on the answer and the honest current
69
+ answer is "I think…", not "evidence says…". Don't manufacture questions to hit a count. -->
70
+ 1.
71
+ 2.
72
+ 3.
73
+
74
+ ## Step 2 — Research dispatch
75
+ <!-- One research agent per question, in parallel. Each returns: title, authors, year, URL, one-sentence finding. -->
76
+
77
+ ## Step 3 — Research grounding
78
+ <!-- One entry per finding (this is what 'lint' checks):
79
+ N. **<finding>.** <Authors> <year> (<arXiv:NNNN.NNNNN | DOI>). <design implication>.
80
+ e.g.: 1. **Contrastive explanations with a predicted human foil improve independent decisions.** Buçinca et al. 2024 (arXiv:2410.04253). Implication: every recommendation carries a "you might think X; I chose Y because…" frame. -->
81
+ 1. **<finding>.** <Authors> <year> (arXiv:____.____). <implication>.
82
+
83
+ ## Step 4 — External verification
84
+ <!-- Different model family, reasoning-stripped. Run: roleos verify-citations ${slug}.dispatch.md
85
+ HALT on fabricated/misattributed; halt-and-escalate if the verifier or oracle is unavailable. -->
86
+ - [ ] every citation resolved by retrieval (arXiv/DOI), not model memory
87
+ - [ ] every finding matches what its source actually claims (groundedness)
88
+ - [ ] >= 3 decorrelated lenses (retrieval oracle + >= 2 different model families)
89
+
90
+ ## Step 5 — Architecture
91
+ <!-- Each load-bearing choice references a finding by number. Citations without a connection are noise. -->
92
+ `;
93
+
/**
 * `new` command: scaffold `<slug>.dispatch.md` in the current directory.
 *
 * The slug is reduced to a single safe filename: any trailing `.dispatch.md`
 * (even if repeated) is stripped, then every character that is not a word
 * character, dot, or hyphen becomes '-'. Path separators therefore never survive,
 * so the file cannot land outside the current directory. Empty and pure-dots
 * slugs ('.', '..') are rejected.
 *
 * The file is created with an exclusive-create flag, so an existing file is never
 * overwritten — even one that appears between the check and the write.
 *
 * @param {string} [slug] - Name for the new dispatch, as typed on the command line.
 */
function cmdNew(slug) {
  if (!slug) fail(2, 'usage: study-swarm new <slug>');

  const stem = String(slug).replace(/(\.dispatch\.md)+$/i, '');
  const safe = stem.replace(/[^\w.\-]/g, '-');
  if (!safe || /^\.+$/.test(safe)) {
    fail(2, `invalid slug "${slug}" — use letters, digits, '.', or '-' (the file stays in the current directory)`);
  }

  const out = `${safe}.dispatch.md`;
  // Provenance stamp: pins the methodology version a dispatch was authored against.
  const stamp = `study-swarm v${VERSION} · protocol-sha256:${protocolHash()} · created:${new Date().toISOString().slice(0, 10)}`;

  try {
    // 'wx' = create exclusively: the write itself fails with EEXIST if the path
    // already exists, closing the gap a separate existsSync() check would leave.
    writeFileSync(out, template(safe, stamp), { encoding: 'utf8', flag: 'wx' });
  } catch (err) {
    if (err && err.code === 'EEXIST') fail(2, `refusing to overwrite existing ${out}`);
    throw err; // any other failure is reported by the caller's error handler
  }

  const note = safe === stem ? '' : ` (slug sanitized to "${safe}")`;
  process.stdout.write(`Created ${out}${note}\nFill it in, then: study-swarm lint ${out}\n`);
}
113
+
// --- lint core ------------------------------------------------------------

// A four-digit year beginning with 19 or 20.
const YEAR = /\b(19|20)\d{2}\b/;
// An identifier: an arXiv id, a DOI, or an http(s) URL.
const ID = /(arxiv:\s*\d{4}\.\d{4,5}|10\.\d{4,9}\/\S+|https?:\/\/\S+)/i;
// Leftovers from the scaffolded template that were never filled in.
const PLACEHOLDER = /arXiv:_{2,}|<finding>|<authors>|<year>|<implication>/i;
// Vague appeals to unnamed research.
const BANNED = /\b(studies show|research suggests|it'?s well[- ]established|well[- ]established that)\b/i;
// An author cite: a capitalized name (Unicode-aware, so "Buçinca" counts), optionally
// followed by "et al.", "&", "and", or further surnames, immediately before the year.
// Accepts "Huang et al. 2023", "Walters & Wilder 2023", "Panickssery, Bowman & Feng 2024";
// flags an author-less finding like "**Foo.** 2024 (arXiv:…)".
const AUTHOR = /\p{Lu}[\p{L}.'’-]+(?:\s*,?\s*(?:&|and|et al\.?|\p{Lu}[\p{L}.'’-]+))*\s+\(?(?:19|20)\d{2}/u;

/**
 * Check the citation form of one dispatch's text. Never exits the process.
 *
 * @param {string} label - Name reported as `file` in the result (a path or "<stdin>").
 * @param {string} raw - Full text of the dispatch.
 * @returns {{file: string, ok: boolean, findingCount: number,
 *   problems: Array<{finding: (number|null), line: (number|null), rule: string, message: string}>,
 *   findings: Array<{finding: number, year: (string|null), identifier: (string|null)}>}}
 *   `ok` is true only when no problem was recorded; `line` values are 1-based.
 */
function lintText(label, raw) {
  const lines = raw.split(/\r?\n/);
  const problems = []; // { finding, line, rule, message }
  const add = (rule, message, line = null, finding = null) => problems.push({ finding, line, rule, message });

  // Find the "Research grounding" heading whose TEXT ends with that phrase (last wins), so a
  // title that merely mentions "research grounding" above the real section can't shadow it.
  let start = -1;
  for (let i = 0; i < lines.length; i++) {
    const h = lines[i].match(/^#{1,6}\s+(.*?)\s*$/);
    if (h && /research grounding$/i.test(h[1])) start = i;
  }
  if (start === -1) {
    add('no-section', 'no "Research grounding" section found — every dispatch needs one (Step 3).');
    return { file: label, ok: false, findingCount: 0, problems, findings: [] };
  }

  // The section runs up to the next heading of any level, or to the end of the text.
  let end = lines.length;
  for (let i = start + 1; i < lines.length; i++) {
    if (/^#{1,6}\s/.test(lines[i])) { end = i; break; }
  }
  const section = lines.slice(start + 1, end);

  // Split into findings (numbered items + continuation lines), ignoring fenced code blocks
  // so a "1." inside a ``` example isn't mistaken for a finding. Track each finding's line:
  // section[idx] is lines[start + 1 + idx], i.e. 1-based line start + 2 + idx.
  const findings = []; // { text, line }
  let cur = null;
  let inFence = false;
  section.forEach((l, idx) => {
    if (/^\s*(```|~~~)/.test(l)) { inFence = !inFence; return; }
    if (inFence) return;
    if (/^\s*\d+\.\s/.test(l)) {
      if (cur) findings.push(cur);
      cur = { text: l, line: start + 2 + idx };
    } else if (cur && l.trim()) {
      cur.text += ' ' + l.trim();
    }
  });
  if (cur) findings.push(cur);

  if (findings.length === 0) add('no-findings', 'Research grounding has no numbered findings.');

  const parsed = [];
  findings.forEach((f, i) => {
    const n = i + 1;
    if (PLACEHOLDER.test(f.text)) add('placeholder', `finding ${n}: still has template placeholders — fill it in.`, f.line, n);
    // Strip every identifier before the year check so digits inside one can't masquerade
    // as a publication year: an arXiv id's YYMM prefix (e.g. 2010 in arXiv:2010.12345),
    // a DOI, or a year-like URL segment (e.g. https://arxiv.org/abs/2010.12345).
    const fNoIds = f.text
      .replace(/arxiv:\s*\d{4}\.\d{4,5}/gi, '')
      .replace(/10\.\d{4,9}\/\S+/g, '')
      .replace(/https?:\/\/\S+/gi, '');
    if (!YEAR.test(fNoIds)) add('missing-year', `finding ${n}: missing a year (spell it out, e.g. "2024" — an arXiv id alone is not a year).`, f.line, n);
    if (!AUTHOR.test(f.text)) add('missing-author', `finding ${n}: missing an author before the year (e.g. "Huang et al. 2023").`, f.line, n);
    const idm = f.text.match(ID);
    if (!idm) add('missing-id', `finding ${n}: missing an identifier (arXiv:NNNN.NNNNN, DOI, or URL).`, f.line, n);
    const ym = fNoIds.match(YEAR);
    // Normalize the identifier: drop inner whitespace and trailing sentence punctuation.
    const ident = idm ? idm[0].replace(/\s+/g, '').replace(/[).,;]+$/, '') : null;
    parsed.push({ finding: n, year: ym ? ym[0] : null, identifier: ident });
  });

  // Banned gesture anywhere in the section (outside fences): a finding STATES its result,
  // it never "studies show…" — a co-located citation doesn't redeem it.
  let fence = false;
  section.forEach((l, idx) => {
    if (/^\s*(```|~~~)/.test(l)) { fence = !fence; return; }
    if (!fence && BANNED.test(l)) {
      add('banned-gesture', `line ${start + 2 + idx}: name the study (author + year + identifier), don't gesture: "${l.trim().slice(0, 56)}"`, start + 2 + idx);
    }
  });

  return { file: label, ok: problems.length === 0, findingCount: findings.length, problems, findings: parsed };
}
192
+
/**
 * Gather every *.dispatch.md file beneath a directory, descending into
 * sub-directories but never into node_modules or .git.
 *
 * @param {string} dir - Directory to search.
 * @returns {string[]} Matching paths (each joined onto `dir`), sorted with the default sort().
 */
function walkDispatches(dir) {
  const found = [];
  const visit = (folder) => {
    readdirSync(folder, { withFileTypes: true }).forEach((entry) => {
      const { name } = entry;
      if (name === '.git' || name === 'node_modules') return;
      const full = join(folder, name);
      if (entry.isDirectory()) {
        visit(full);
        return;
      }
      // The suffix match is case-insensitive.
      if (/\.dispatch\.md$/i.test(name)) found.push(full);
    });
  };
  visit(dir);
  return found.sort();
}
204
+
/**
 * Load one lint target from disk as UTF-8.
 *
 * @param {string} p - Path of the file to read.
 * @returns {{label: string, raw: string}} The path (used as the report label) and the
 *   file's text. A read failure is reported through fail() with exit status 2.
 */
function readTarget(p) {
  let raw;
  try {
    raw = readFileSync(p, 'utf8');
  } catch (err) {
    // Prefer the short system error code (e.g. ENOENT) when there is one.
    const reason = err && err.code ? err.code : err.message;
    fail(2, `cannot read ${p}: ${reason}`);
    return undefined;
  }
  return { label: p, raw };
}
209
+
/**
 * `lint` command: check one or more dispatches and report the outcome.
 *
 * Each path may be a file, a directory (searched recursively for *.dispatch.md),
 * or "-" to read a single dispatch from stdin. With `--json` the result is written
 * to stdout as one JSON line: the bare result object for a single target, otherwise
 * `{ ok, files }`. Without it, clean files are listed on stdout and violations on stderr.
 *
 * The exit status (0 clean, 1 violations) is set through `process.exitCode` and the
 * function then returns, rather than calling process.exit() right after writing:
 * a forced exit can cut off stdout output that has not been flushed yet, which would
 * truncate the JSON payload for a consumer reading it through a pipe. Usage and read
 * errors still terminate immediately with status 2 via fail().
 *
 * @param {string[]} args - Arguments after the `lint` command word.
 */
function cmdLint(args) {
  const json = args.includes('--json');
  const paths = args.filter((a) => a !== '--json');
  if (paths.length === 0) fail(2, 'usage: study-swarm lint [--json] <file|dir|-> [more...]');

  const targets = [];
  for (const p of paths) {
    if (p === '-') {
      let raw;
      // File descriptor 0 is stdin; the read is synchronous.
      try { raw = readFileSync(0, 'utf8'); }
      catch (err) { fail(2, `cannot read stdin: ${err && err.code ? err.code : err.message}`); }
      targets.push({ label: '<stdin>', raw });
      continue;
    }
    if (!existsSync(p)) fail(2, `path not found: ${p}`);
    if (statSync(p).isDirectory()) {
      const files = walkDispatches(p);
      if (files.length === 0) fail(2, `no .dispatch.md files found under ${p}`);
      for (const f of files) targets.push(readTarget(f));
    } else {
      targets.push(readTarget(p));
    }
  }

  const results = targets.map((t) => lintText(t.label, t.raw));
  const anyFail = results.some((r) => !r.ok);

  if (json) {
    const payload = results.length === 1 ? results[0] : { ok: !anyFail, files: results };
    process.stdout.write(JSON.stringify(payload) + '\n');
    process.exitCode = anyFail ? 1 : 0;
    return;
  }

  for (const r of results) {
    if (r.ok) {
      process.stdout.write(`ok ${r.file}: ${r.findingCount} finding(s), all sourced.\n`);
    } else {
      process.stderr.write(`x ${r.file}: ${r.problems.length} sourcing issue(s)\n`);
      for (const pr of r.problems) process.stderr.write(` - ${pr.message}\n`);
    }
  }
  if (!anyFail) {
    process.stdout.write(
      `\nStep 3 (sourcing FORM) is satisfied — this does NOT confirm the citations exist or support the claim.\n` +
      `Run Step 4 (existence + groundedness, a different model family): roleos verify-citations <file>\n`,
    );
  }
  process.exitCode = anyFail ? 1 : 0;
}
259
+
/**
 * Route the command line to the matching command handler.
 * With no command at all, the help text is printed.
 *
 * @param {string[]} argv - CLI arguments: the command word followed by its arguments.
 */
function main(argv) {
  const [cmd, ...rest] = argv;

  if (cmd === 'protocol') return cmdProtocol();
  if (cmd === 'new') return cmdNew(rest[0]);
  if (cmd === 'lint') return cmdLint(rest);

  if (cmd === 'version' || cmd === '--version' || cmd === '-v') {
    return void process.stdout.write(VERSION + '\n');
  }
  if (cmd === undefined || cmd === 'help' || cmd === '--help' || cmd === '-h') {
    return void process.stdout.write(HELP);
  }

  // Anything else is a usage error (exit status 2).
  fail(2, `unknown command "${cmd}". Run "study-swarm help".`);
}
274
+
// Entry point. `--debug` is removed before dispatching; when it was given, an
// unexpected exception is rethrown instead of being reduced to a one-line
// message with exit status 2.
try {
  const cliArgs = process.argv.slice(2);
  main(cliArgs.filter((arg) => arg !== '--debug'));
} catch (err) {
  if (process.argv.includes('--debug')) {
    throw err;
  }
  const text = err && err.message ? err.message : String(err);
  fail(2, text);
}
@@ -0,0 +1,28 @@
1
+ # Copy this into YOUR repo at .github/workflows/dispatches.yml to gate the sourcing
2
+ # of every study-swarm dispatch on each pull request. It is a SAMPLE — it is not an
3
+ # active workflow in the study-swarm repo itself.
4
+ name: study-swarm lint
5
+
6
+ on:
7
+ pull_request:
8
+ paths:
9
+ - '**/*.dispatch.md'
10
+ - '.github/workflows/dispatches.yml'
11
+ workflow_dispatch:
12
+
13
+ concurrency:
14
+ group: ${{ github.workflow }}-${{ github.ref }}
15
+ cancel-in-progress: true
16
+
17
+ jobs:
18
+ lint:
19
+ runs-on: ubuntu-latest
20
+ timeout-minutes: 5
21
+ steps:
22
+ - uses: actions/checkout@v4
23
+ - uses: actions/setup-node@v4
24
+ with:
25
+ node-version: '20'
26
+ # Lint every dispatch under dispatches/ (a file, a dir, or '-' for stdin all work).
27
+ # An exit code of 1 (any sourcing violation) fails the check. Add --json for machine-readable output.
28
+ - run: npx @dogfood-lab/study-swarm@latest lint dispatches/
@@ -0,0 +1,46 @@
1
+ <!-- study-swarm vX.Y.Z · protocol-sha256:<vendored> · a worked, lint-clean reference dispatch -->
2
+ # Study-swarm dispatch: study-swarm-self
3
+
4
+ > A complete, **lint-clean** example dispatch — study-swarm applied to its own
5
+ > central design decision. Run `study-swarm lint examples/study-swarm-self.dispatch.md`
6
+ > (it passes), then read it as a model for what a filled-in dispatch looks like end to end.
7
+
8
+ ## Step 1 — Load-bearing questions
9
+
10
+ <!-- Each is load-bearing: two real designs hinge on the answer, and the honest prior is "I think", not "evidence says". -->
11
+
12
+ 1. When an LLM makes a substantial design call, can the *same* model reliably verify its own citations, or does the verifier have to be a separate model?
13
+ 2. Is confirming a cited paper *exists* enough, or must "the source supports this claim" be checked as a separate axis?
14
+ 3. Does adding *more* verifiers improve coverage, or does the diversity of the verifiers matter more than their count?
15
+
16
+ ## Step 2 — Research dispatch
17
+
18
+ <!-- One research agent per question, in parallel; each returned paper titles + authors + years + URLs + a one-sentence finding, web-retrieval required (no recall-only citations). -->
19
+
20
+ Three parallel agents, scoped to empirical evidence (not opinion), word-capped, "specificity over breadth — 6–8 well-sourced findings beat 20 vague gestures." Their citations (below) were then resolved against arXiv/Crossref before any informed the design.
21
+
22
+ ## Step 3 — Research grounding
23
+
24
+ 1. **LLMs struggle to self-correct without external feedback, and can degrade after self-correction.** Huang et al. 2023 (arXiv:2310.01798). Implication: the verifier cannot be the generator itself — an external check is required (answers Q1).
25
+ 2. **Autoregressive LLMs cannot self-verify; pair the generator with an external model-based verifier.** Kambhampati et al. 2024 (arXiv:2402.01817). Implication: the architecture is generator + separate verifier, not self-critique (answers Q1).
26
+ 3. **An LLM judge's self-recognition correlates *linearly* with its self-preference bias.** Panickssery, Bowman & Feng 2024 (arXiv:2404.13076). Implication: the verifier must be a *different model family*, since partial blinding of a same-family judge does not remove the bias (answers Q1).
27
+ 4. **18–55% of LLM-generated citations are fabricated, and many real ones carry bibliographic errors.** Walters & Wilder 2023 (doi:10.1038/s41598-023-41032-5). Implication: existence must be established by *retrieval* (resolve the arXiv/DOI), never by the model's recall (answers Q2).
28
+ 5. **Cited links resolve >94% of the time, yet only 39–77% of the content actually supports the claim.** Onweller et al. 2026 (arXiv:2605.06635). Implication: groundedness is a distinct axis from existence — "the link resolves" is not "the paper says this" (answers Q2).
29
+ 6. **Decorrelated verifiers (pairwise ρ ∈ [0.05, 0.25]) beat any single one via submodular coverage.** Rajan 2025 (arXiv:2511.16708). Implication: spend the budget on *lens diversity* (a retrieval oracle + ≥2 different families), not on more copies of one judge (answers Q3).
30
+
31
+ ## Step 4 — External verification
32
+
33
+ <!-- This dispatch's own citations were gated this way before Step 5 was written. -->
34
+
35
+ - [x] every citation resolved by retrieval (arXiv/DOI), not model memory — arXiv API + OpenAlex + Crossref
36
+ - [x] every finding matches what its source actually claims (groundedness) — checked against each abstract
37
+ - [x] >= 3 decorrelated lenses (retrieval oracle + >= 2 different model families) — oracle + Mistral + IBM Granite, reasoning-stripped
38
+
39
+ Result: all six citations VERIFIED (existence + attribution + groundedness). Two blind traps seeded into a sibling set — a misattribution and a fabricated paper — were caught by the *union* of the two families, not either alone.
40
+
41
+ ## Step 5 — Architecture
42
+
43
+ - The verifier is a **different model family** from the synthesizer, run reasoning-stripped. (findings 1, 2, 3)
44
+ - Verification is **two-stage per citation**: a retrieval oracle confirms existence, then a groundedness lens confirms the source supports the claim. (findings 4, 5)
45
+ - The verifier is an **ensemble of decorrelated lenses** (retrieval oracle + ≥2 different families), because diversity — not count — drives coverage. (finding 6)
46
+ - On a non-clean verdict the finding **halts** (fabricated → dropped; misattributed → corrected once; unavailable → escalate), never silently proceeds. (findings 1, 4)
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@dogfood-lab/study-swarm",
3
- "version": "0.5.0",
4
- "description": "Ground design decisions in cited research, then verify every citation with a different model family before it becomes canon — a research-grounded design protocol.",
3
+ "version": "1.0.0",
4
+ "description": "Ground design decisions in cited research, then verify every citation with a different model family before it becomes canon — a research-grounded design protocol, with a thin CLI.",
5
5
  "keywords": [
6
6
  "methodology",
7
7
  "llm",
@@ -22,7 +22,19 @@
22
22
  "bugs": {
23
23
  "url": "https://github.com/dogfood-lab/study-swarm/issues"
24
24
  },
25
+ "type": "module",
26
+ "bin": {
27
+ "study-swarm": "bin/study-swarm.mjs"
28
+ },
29
+ "engines": {
30
+ "node": ">=18"
31
+ },
32
+ "scripts": {
33
+ "verify": "node scripts/smoke.mjs"
34
+ },
25
35
  "files": [
36
+ "bin/",
37
+ "examples/",
26
38
  "README.md",
27
39
  "README.ja.md",
28
40
  "README.zh.md",