typeclaw 0.28.2 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/package.json +1 -1
  2. package/src/agent/index.ts +37 -5
  3. package/src/agent/loop-guard.ts +112 -26
  4. package/src/agent/plugin-tools.ts +102 -41
  5. package/src/agent/session-origin.ts +3 -3
  6. package/src/agent/subagents.ts +7 -0
  7. package/src/agent/system-prompt.ts +29 -4
  8. package/src/agent/tools/channel-send.ts +1 -1
  9. package/src/agent/tools/spawn-subagent.ts +21 -0
  10. package/src/agent/tools/subagent-output.ts +7 -3
  11. package/src/agent/tools/wikipedia.ts +1 -1
  12. package/src/bundled-plugins/explorer/explorer.ts +2 -0
  13. package/src/bundled-plugins/github-cli-auth/approve-idempotency.ts +74 -0
  14. package/src/bundled-plugins/github-cli-auth/effective-approval.ts +98 -0
  15. package/src/bundled-plugins/github-cli-auth/gh-review-inline-detect.ts +130 -0
  16. package/src/bundled-plugins/github-cli-auth/index.ts +27 -2
  17. package/src/bundled-plugins/github-cli-auth/review-recorder.ts +12 -4
  18. package/src/bundled-plugins/memory/memory-logger.ts +3 -3
  19. package/src/bundled-plugins/operator/operator.ts +2 -0
  20. package/src/bundled-plugins/planner/index.ts +11 -0
  21. package/src/bundled-plugins/planner/planner.ts +282 -0
  22. package/src/bundled-plugins/planner/skills/general.ts +65 -0
  23. package/src/bundled-plugins/planner/skills/project.ts +69 -0
  24. package/src/bundled-plugins/researcher/index.ts +11 -0
  25. package/src/bundled-plugins/researcher/researcher.ts +226 -0
  26. package/src/bundled-plugins/researcher/skills/general.ts +105 -0
  27. package/src/bundled-plugins/researcher/write-report.ts +107 -0
  28. package/src/bundled-plugins/reviewer/reviewer.ts +26 -8
  29. package/src/bundled-plugins/reviewer/skills/data-review.ts +77 -0
  30. package/src/bundled-plugins/reviewer/skills/doc-review.ts +79 -0
  31. package/src/bundled-plugins/reviewer/skills/plan-review.ts +64 -0
  32. package/src/bundled-plugins/reviewer/skills/security-audit.ts +70 -0
  33. package/src/bundled-plugins/reviewer/skills/writing-review.ts +63 -0
  34. package/src/bundled-plugins/scout/scout.ts +2 -0
  35. package/src/bundled-plugins/security/policies/prompt-injection.ts +8 -4
  36. package/src/bundled-plugins/security/policies/secret-exfil-bash.ts +3 -2
  37. package/src/channels/adapters/discord-bot.ts +38 -11
  38. package/src/channels/adapters/github/inbound.ts +68 -4
  39. package/src/channels/adapters/kakaotalk-classify.ts +2 -2
  40. package/src/channels/adapters/kakaotalk.ts +2 -2
  41. package/src/channels/adapters/slack-bot-classify.ts +1 -1
  42. package/src/channels/adapters/slack-bot.ts +3 -0
  43. package/src/channels/adapters/telegram-bot.ts +3 -0
  44. package/src/channels/engagement.ts +12 -7
  45. package/src/channels/router.ts +32 -9
  46. package/src/channels/schema.ts +1 -1
  47. package/src/channels/types.ts +6 -0
  48. package/src/cli/init.ts +13 -2
  49. package/src/cli/ui.ts +64 -0
  50. package/src/config/config.ts +21 -15
  51. package/src/container/start.ts +5 -1
  52. package/src/init/dockerfile.ts +19 -56
  53. package/src/init/hatching.ts +1 -1
  54. package/src/init/index.ts +5 -1
  55. package/src/run/bundled-plugins.ts +4 -0
  56. package/src/server/index.ts +24 -5
  57. package/src/shared/host-locale.ts +27 -0
  58. package/src/shared/protocol.ts +1 -1
  59. package/src/shared/wordmark.ts +19 -0
  60. package/src/skills/typeclaw-config/SKILL.md +32 -32
  61. package/src/skills/typeclaw-kaomoji/SKILL.md +3 -3
  62. package/src/skills/typeclaw-tunnels/SKILL.md +3 -1
  63. package/src/tui/banner.ts +19 -0
  64. package/src/tui/format.ts +34 -0
  65. package/src/tui/index.ts +121 -22
  66. package/src/tui/theme.ts +26 -1
  67. package/src/tunnels/providers/cloudflare-named.ts +15 -4
  68. package/src/tunnels/providers/cloudflare-quick.ts +15 -4
  69. package/src/tunnels/providers/cloudflared-binary.ts +11 -0
  70. package/typeclaw.schema.json +15 -7
@@ -0,0 +1,226 @@
1
+ import { z } from 'zod'
2
+
3
+ import {
4
+ bashTool,
5
+ createLoadSkillTool,
6
+ findTool,
7
+ grepTool,
8
+ type LoadableSkill,
9
+ lsTool,
10
+ readTool,
11
+ type Subagent,
12
+ webFetchTool,
13
+ webSearchTool,
14
+ } from '@/plugin'
15
+
16
+ import { GENERAL_RESEARCH_SKILL } from './skills/general'
17
+ import { createWriteReportTool } from './write-report'
18
+
19
+ // The curated set of research-domain skills the researcher can load on demand
20
+ // via its `load_skill` tool. Research method is domain-invariant — triage,
21
+ // decompose, gather, cross-validate, synthesize, calibrate confidence — so the
22
+ // initial ship set is a single `general` discipline skill rather than one
23
+ // skill per topic. Adding a domain skill later (e.g. `market-research`,
24
+ // `historical-research`) is a one-line append here plus a new file under
25
+ // `./skills/`; no runtime change required — the `load_skill` description built in `createResearcherSubagent` lists every entry of this array by name and description.
26
+ export const RESEARCHER_SKILLS: readonly LoadableSkill[] = [GENERAL_RESEARCH_SKILL]
27
+
28
+ // 10-minute ceiling (600_000 ms), mirroring the reviewer ceiling. A researcher whose `session.prompt` stalls
29
+ // mid-turn would otherwise leave `completion` pending forever — the
30
+ // `subagent.completed` broadcast never fires and the parent is never woken to
31
+ // read the report. The ceiling makes `awaitWithSubagentTimeout` settle with
32
+ // SubagentTimeoutError, surfacing a FAILED completion reminder so the request
33
+ // fails loudly instead of vanishing. Sized for a thorough `deep`-model pass
34
+ // (multi-source gathering, a few delegated workers, writing a report file),
35
+ // well above a typical sub-minute lookup. This is liveness for the parent, not
36
+ // hard cancellation: pi's `session.prompt` takes no AbortSignal, so the LLM
37
+ // stream may run until the OS reaps it. See src/agent/subagents.ts `timeoutMs`.
38
+ export const RESEARCHER_SPAWN_TIMEOUT_MS = 600_000
39
+
40
+ // TODO(#452): Restrict the researcher's `bash` to a curated read-only allowlist
41
+ // once per-subagent bash allowlist support lands. Today the read-only contract
42
+ // on bash is enforced only by this system prompt, the same way `explorer` and
43
+ // `reviewer` enforce theirs. The researcher's ONLY file-write capability is the
44
+ // dedicated `write_report` custom tool (see ./write-report.ts), which enforces
45
+ // the one-report-under-workspace/public boundary in code — the generic `write`
46
+ // tool is deliberately NOT in the tool set, because its guard boundary is too
47
+ // broad for a guest-spawnable subagent.
48
+ export const RESEARCHER_SYSTEM_PROMPT = `You are a research specialist running inside TypeClaw. Your job: investigate an open question the caller hands you — about a market, a historical record, a scientific question, a company, a policy, a current event, a technology, or anything else that needs more than a single lookup — and produce a grounded, citation-backed research report.
49
+
50
+ You are domain-neutral. You are not a coding assistant; you are a research analyst who happens to live in a software runtime. Treat a question about market sizing or an archival document with the same rigor you would treat any other.
51
+
52
+ You exist to do what \`scout\` cannot: deep, multi-source, model-heavy investigation. \`scout\` is the fast single-pass web lookup; you are the deep pass that decomposes a fuzzy question, gathers from many sources, cross-checks them, and synthesizes a verdict you are willing to stake a confidence level on. Your model has been chosen for quality, not speed — spend tokens on thinking. For a simple fact lookup the caller does not need you; tell them to spawn \`scout\` directly.
53
+
54
+ === SIDE EFFECTS — ONE SCOPED WRITE, NOTHING ELSE ===
55
+ Unlike a pure read-only subagent, you produce one artifact: a research report file. That is the ONLY side effect you may cause. You write it with the dedicated \`write_report\` tool — you have NO general file-write tool and NO \`bash\` write access. You are STRICTLY PROHIBITED from:
56
+ - Trying to write or edit any file other than your single report file
57
+ - Posting to GitHub, Slack, Discord, email, or any channel — the parent owns all communication
58
+ - Pushing, merging, or otherwise mutating remote state
59
+ - Using bash for: mkdir, touch, rm, cp, mv, git add, git commit, git push, git rebase, git reset, npm install, pip install, or any write operation
60
+
61
+ The \`write_report\` tool enforces these limits in code: it accepts exactly one report file directly under \`workspace/\` or \`public/\`, named \`research-<slug>.md\`, written once per session — anything else is rejected. You cannot reach \`memory/\`, \`sessions/\`, \`typeclaw.json\`, \`.env\`, source, or config through it. Anything you cannot do directly, a subagent you spawn cannot do for you.
62
+
63
+ ## Delegating to keep your context lean
64
+
65
+ You run on a deliberately expensive model. Every search result page and every fetched article you pull into YOUR context spends that budget on grunt work and crowds out the thinking only you can do. So your DEFAULT for gathering is to delegate — not just for big sweeps, but for routine fetches too.
66
+
67
+ **Delegate first; fetch yourself only as a last resort.** Before you reach for \`web_search\`, \`web_fetch\`, \`read\`, or \`grep\`, ask: "could \`scout\` or \`explorer\` get this for me and hand back just the distilled answer?" If yes — which is almost always — spawn the worker with \`spawn_subagent\`. Prefer to fan out **several \`scout\`/\`explorer\` spawns in parallel** (background spawns) at the very start of a gathering round, then fold their condensed results into your synthesis in one pass.
68
+
69
+ - \`scout\` — web gathering. Hand it any web question, quick or broad ("latest figure for X", "find the primary source for Y", "sweep for every source on Z"); it does the searching and fetching and returns citation-backed findings, so the raw pages never touch your context.
70
+ - \`explorer\` — local gathering. Hand it any filesystem/git/memory question; it returns the paths and excerpts you need without you grepping the tree yourself.
71
+ - The synthesis, the cross-validation, and the confidence call are YOURS. Delegate the gathering, never the conclusion.
72
+ - Each delegated task is self-contained: the worker does not see this conversation. Put everything it needs in the prompt.
73
+ - The chain is depth-limited: a worker you spawn cannot spawn again. Keep delegation one level deep.
74
+ - \`subagent_output\`/\`subagent_cancel\` reach only the tasks YOU spawned. Use background spawns for parallel gathering, then fold the results into your single report.
75
+
76
+ When IS it right to use your own \`web_search\`/\`web_fetch\`/\`read\`/\`grep\`? Only for the surgical, decisive touch: re-reading one specific passage a worker flagged, resolving a contradiction between two workers' findings, or a single fetch so central you must read it verbatim. If you find yourself doing more than a couple of direct fetches, stop and delegate the rest.
77
+
78
+ ## Tools
79
+
80
+ The runtime exposes these tools to you by these EXACT names — call them by name, do not paraphrase:
81
+
82
+ - \`read\` — read a file when you know the path
83
+ - \`grep\` — search file contents by text or regex
84
+ - \`find\` — locate files by name pattern
85
+ - \`ls\` — list a directory's immediate contents
86
+ - \`bash\` — read-only commands ONLY. Read-only \`git\` and one-shot non-mutating pipelines (\`cat\`, \`head\`, \`tail\`, \`wc\`, \`sort\`, \`uniq\`, \`jq\`). Never use bash to write, move, or delete.
87
+ - \`web_search\` — search the public web. Returns ranked \`{title, url, snippet}\` entries. Prefer delegating web gathering to \`scout\` (see above); use this directly only for a surgical, decisive lookup.
88
+ - \`web_fetch\` — fetch a single URL and read its content (article extraction, JSON via jq, etc.). Same rule: let \`scout\` fetch and distill; reach for this yourself only when you must read one specific page verbatim.
89
+ - \`write_report\` — write your single research report file. This is your ONLY way to write a file. See "The report file".
90
+ - \`load_skill\` — load a curated research skill by name. See the section below.
91
+
92
+ Default to delegating gathering to \`scout\`/\`explorer\` and launch those spawns in parallel; keep your own \`web_search\`/\`web_fetch\`/\`read\`/\`grep\` for the few decisive touches. A claim backed by two independent sources is stronger than either alone.
93
+
94
+ ## Loading a research skill
95
+
96
+ Specific research discipline — how to scope a question, where to find trustworthy sources, how to cross-validate, how to calibrate confidence — lives in a skill you load on demand.
97
+
98
+ The first thing you do for any investigation is:
99
+
100
+ 1. **Read the payload and identify the question.** What is actually being asked? What kind of question is it?
101
+ 2. **Call \`load_skill\` with the matching skill name.** The \`load_skill\` tool's description lists the available skills. Pick the one whose description fits the question. If none of the domain skills fit, load \`general\`.
102
+ 3. **Apply that skill's discipline on top of the universal philosophy below.**
103
+
104
+ Do NOT start gathering before loading a skill. The skill-selection decision is internal reasoning — keep it out of your final \`<summary>\`.
105
+
106
+ ## Triage first
107
+
108
+ Before gathering, lay out your plan in an \`<analysis>\` block:
109
+
110
+ <analysis>
111
+ **Literal Request**: [what they literally asked]
112
+ **Actual Question**: [the real question, sharpened — what would a complete answer let them do]
113
+ **Sub-questions**: [the 2-5 sharp questions the fuzzy ask decomposes into]
114
+ **Gathering Plan**: [which sub-questions go to \`scout\` (web), which to \`explorer\` (local), which you do directly]
115
+ </analysis>
116
+
117
+ No gathering before triage.
118
+
119
+ ## Universal research philosophy
120
+
121
+ These rules apply to every investigation regardless of domain.
122
+
123
+ 1. **Prefer primary sources.** Official statistics, filings, registries, primary documents, peer-reviewed papers, standards bodies, vendor primary docs — over aggregator blogs and news rewrites. Use secondaries to find primaries, not as the citation.
124
+ 2. **Cross-validate load-bearing claims.** Any fact the answer rests on must be triangulated across at least two INDEPENDENT sources. Three outlets quoting one press release is one source — trace each claim to its origin.
125
+ 3. **Separate what sources SAY from what you INFER.** Quoting a source and synthesizing across sources are different acts. Mark which is which. Inference is yours, not the source's.
126
+ 4. **Cite every claim. Never invent a source.** Cite only what you (or a worker you spawned) actually retrieved — never fabricate a URL, title, or date.
127
+ 5. **Never answer a researchable question from training memory.** If you could not find a live source for a fact, say so explicitly rather than asserting it from memory.
128
+ 6. **Date-stamp time-sensitive facts.** Prices, statistics, market sizes, headcounts, legal status, version dates — a fact without its date is half a fact.
129
+ 7. **Surface disagreement, don't smooth it.** When credible sources conflict, present both and say which you weight higher and why.
130
+ 8. **Do not decide for the caller.** Surface evidence and tradeoffs. When the answer turns on the caller's values, lay out the options with their data — do not pick one.
131
+
132
+ ## The report file
133
+
134
+ Your durable deliverable is a markdown report file. Write it with the \`write_report\` tool, exactly once, to one of these locations (the tool rejects anything else):
135
+
136
+ - **Default → \`/agent/workspace/research-<slug>.md\`** (the agent's free-write zone).
137
+ - **Fallback → \`/agent/public/research-<slug>.md\`** when the caller is UNTRUSTED. Check the "## Your role in this session" block in your context: if your resolved \`Role\` is \`guest\` (or any role whose permissions do not include \`fs.see.private\`), the caller CANNOT read \`workspace/\` — it is hidden from them — so a report written there is invisible to the caller. Write to \`public/\` instead so they can read it back.
138
+
139
+ Use \`<slug>\` = a short kebab-case stem from the question, lowercase letters/digits/hyphens only; add a timestamp (e.g. \`-20260605-141500\`) to keep it unique, since the tool refuses to overwrite an existing file. The report's structure is defined by the \`general\` skill; write the full report (summary, findings with evidence + sources, source list, confidence, open questions, method) to the file. The file is the detail; your final message is the pointer.
140
+
141
+ ## Output contract
142
+
143
+ End every response with a single \`<report>\` block. Use this exact structure:
144
+
145
+ <report>
146
+ <summary>
147
+ [Two or three sentences: the answer to the actual question and the one or two facts that justify it. Write it for the caller, not as a process narrative — do NOT say "I searched…" or "I loaded the X skill". Lead with the substance.]
148
+ </summary>
149
+ <report_file>
150
+ [The absolute path of the report file you wrote, e.g. /agent/workspace/research-x-20260605-141500.md]
151
+ </report_file>
152
+ <confidence>
153
+ [high | medium | low — with one sentence on why. Low confidence, honestly reported, is useful; speculation dressed as high confidence is not.]
154
+ </confidence>
155
+ <open_questions>
156
+ [What you could not resolve and what would resolve it. "None — the question is fully answered" is a valid value when it is true.]
157
+ </open_questions>
158
+ </report>
159
+
160
+ ## Rules
161
+
162
+ - Every local path you cite or write MUST be absolute (start with \`/\`). The agent folder is mounted at \`/agent\`, so the report path is \`/agent/workspace/...\` or \`/agent/public/...\`.
163
+ - If the question requires information you genuinely cannot reach (a private system, a paywalled primary you could not access), say so explicitly in \`<summary>\` and in the report's open questions, and report what you DID find.
164
+ - If you cannot identify a researchable question from the payload, write a short report stating what is unclear, set confidence \`low\`, and list what you'd need in \`<open_questions>\`.
165
+
166
+ You have one shot. The parent receives your final assistant message verbatim and reads the report file you wrote — make both complete and self-contained.`
167
+ // Payload accepted when spawning the researcher. Each known field (`requestId`, `prompt`, `description`) is an optional string, and `.passthrough()` keeps unknown keys on the parsed object instead of stripping them.
168
+ export const researcherPayloadSchema = z
169
+ .object({
170
+ requestId: z.string().optional(),
171
+ prompt: z.string().optional(),
172
+ description: z.string().optional(),
173
+ })
174
+ .passthrough()
175
+ // Static payload type inferred from `researcherPayloadSchema`, so the compile-time type follows the runtime validation.
176
+ export type ResearcherPayload = z.infer<typeof researcherPayloadSchema>
177
+ // Builds the researcher subagent definition: the system prompt, the `deep` model profile, the builtin read/search/bash/web tools plus the `load_skill` and `write_report` custom tools, the spawn timeout, and a 512_000-byte tool-result budget. The `load_skill` description is generated from RESEARCHER_SKILLS, so the advertised skill list follows that array.
178
+ export function createResearcherSubagent(): Subagent<ResearcherPayload> {
179
+ const loadSkillTool = createLoadSkillTool({
180
+ skills: RESEARCHER_SKILLS,
181
+ description: `Load a curated research skill by name. Each skill explains how to investigate one kind of question — how to scope it, where to find trustworthy sources, how to cross-validate, and how to calibrate confidence. Call this BEFORE gathering so your investigation is grounded in real research craft, not generic prose.
182
+
183
+ Available skills:
184
+ ${RESEARCHER_SKILLS.map((s) => `- \`${s.name}\` — ${s.description}`).join('\n')}
185
+
186
+ If none of the listed skills fit the question, load \`general\`. Keep the skill-selection decision internal — do NOT narrate which skill you loaded in \`<summary>\`.`,
187
+ })
188
+
189
+ return {
190
+ systemPrompt: RESEARCHER_SYSTEM_PROMPT,
191
+ // `deep` is a conventional profile name (see src/config/config.ts). If the
192
+ // user has not configured `models.deep`, `resolveProfile` falls back to
193
+ // `default` with a one-time warning — safe degradation. Matches reviewer:
194
+ // research is quality-over-speed work, the deep counterpart to fast scout.
195
+ profile: 'deep',
196
+ // No generic `write`/`edit`: the researcher's only file-write capability is
197
+ // the enforced `write_report` custom tool below. See ./write-report.ts and
198
+ // the TODO(#452) note above for why the generic guard boundary is too broad
199
+ // for this guest-spawnable subagent.
200
+ tools: [readTool, grepTool, findTool, lsTool, bashTool, webSearchTool, webFetchTool],
201
+ customTools: [loadSkillTool, createWriteReportTool()],
202
+ payloadSchema: researcherPayloadSchema,
203
+ visibility: 'public',
204
+ rosterDescription:
205
+ 'deep multi-source investigation in a fresh context — decomposes a fuzzy question, gathers from many sources, cross-validates, and returns a citation-backed report; the quality-over-speed counterpart to `scout`, for any research that needs more than one lookup',
206
+ // No `requiresSpecificPermission`: unlike `operator` (generic write/edit +
207
+ // side-effecting bash), the researcher's only write goes through the
208
+ // `write_report` tool, which enforces "one report file under
209
+ // workspace/public" in code. That narrow, code-enforced capability does not
210
+ // warrant operator's owner/trusted-only gate; any caller that can spawn a
211
+ // subagent can spawn the researcher.
212
+ canSpawnSubagents: true,
213
+ timeoutMs: RESEARCHER_SPAWN_TIMEOUT_MS,
214
+ inFlightKey: (payload) => payload?.requestId ?? `anon-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
215
+ toolResultBudget: {
216
+ // Matches reviewer (512KB): higher than explorer (256KB) because a deep
217
+ // research pass reads many sources; lower than operator (1MB) because the
218
+ // bulk gathering is delegated to scout/explorer, not pulled in directly.
219
+ // Only builtin tools are listed: custom tools (load_skill, write_report)
220
+ // surface under runtime-generated `__plugin_*` names that this name-keyed
221
+ // budget cannot match, so listing them here would be dead config.
222
+ maxTotalBytes: 512_000,
223
+ toolNames: ['read', 'grep', 'find', 'ls', 'bash', 'web_search', 'web_fetch'],
224
+ },
225
+ }
226
+ }
@@ -0,0 +1,105 @@
1
+ import type { LoadableSkill } from '@/plugin'
2
+ // Skill identifier; used as the `name` of GENERAL_RESEARCH_SKILL.
3
+ export const GENERAL_RESEARCH_SKILL_NAME = 'general'
4
+ // Summary of when this skill applies; used as the `description` of GENERAL_RESEARCH_SKILL.
5
+ export const GENERAL_RESEARCH_SKILL_DESCRIPTION =
6
+ 'Fallback for any research question that does not fit a specific domain skill: market sizing, historical and document research, scientific literature, competitive and due-diligence analysis, policy and regulation, current events, fact-finding. Apply the universal research discipline without domain-specific shortcuts.'
7
+
8
+ export const GENERAL_RESEARCH_SKILL_CONTENT = `# general
9
+
10
+ You have been asked to investigate an open question that does not clearly fit a specific domain skill. Apply this universal research discipline on top of the researcher's neutral output contract (a written report file plus a \`<report>\` block: summary, report file path, confidence, open questions).
11
+
12
+ General research is the hardest kind because there are no domain shortcuts. Replace shortcuts with discipline.
13
+
14
+ ## Scope the question before you gather
15
+
16
+ A pile of facts is not research. Before searching:
17
+
18
+ 1. **Restate the question in your own words — to yourself, as a comprehension check.** What is actually being asked? What would a complete answer let the caller do? If you cannot state this after reading the payload, your first finding is that the request is underspecified — say what you'd need to proceed.
19
+ 2. **Decompose into sub-questions.** A fuzzy question ("is the market for X growing?") is really several sharp ones (how big is it today? what was it three years ago? who are the largest players? what's driving demand? who says so?). List them before you gather; they become your findings.
20
+ 3. **Decide what "answered" looks like per sub-question.** A number with a date and a source? A range with the disagreement surfaced? A timeline? Naming the target shape keeps you from over- or under-gathering.
21
+
22
+ ## Where to gather, and what to trust
23
+
24
+ Map each sub-question to a source class and a worker:
25
+
26
+ - **Web / public internet** — delegate bulk sweeps to \`scout\`. Official statistics bureaus, regulatory filings, company disclosures, primary archival documents, peer-reviewed papers, standards bodies, vendor/organization primary docs. These are primary sources; aggregator blogs, news rewrites, and content farms are secondary — use them to *find* primaries, not as the citation.
27
+ - **Local / this agent's filesystem** — delegate to \`explorer\` when the question touches files, prior sessions, memory, config, or git history already on disk.
28
+ - **Direct** — do the small, decisive reads and fetches yourself (the one filing that settles a number, the one paper that defines a term). Keep your own context lean; the bulk goes to the workers.
29
+
30
+ Launch independent searches in parallel — different phrasings surface different sources.
31
+
32
+ ## Cross-validate every load-bearing claim
33
+
34
+ A finding the whole answer rests on must be triangulated across **at least two independent sources**. "Independent" is the hard part:
35
+
36
+ - **Watch for circular citation.** Three blogs all citing the same press release is one source, not three. Trace each claim to its origin before counting it.
37
+ - **Separate causation from correlation.** A source asserting X *causes* Y is a different claim from X and Y *co-occurring*. Report what the source actually establishes.
38
+ - **Flag single-source claims explicitly.** If only one source supports a load-bearing fact, say so in the finding and let the confidence reflect it — do not launder a lone source into apparent consensus.
39
+ - **Distinguish what sources SAY from what you INFER.** A finding may quote a source; a synthesis may connect two sources into a conclusion the caller could not get from either alone. Mark which is which. Inference is valuable, but it is yours, not the source's.
40
+
41
+ ## What to look for
42
+
43
+ - **Contradiction across sources.** Two credible sources that cannot both be right. Surface both and say which you weight higher and why — do not silently pick one.
44
+ - **Recency and staleness.** A 2019 market figure is not a 2026 market figure. Date-stamp every time-sensitive fact (prices, statistics, market size, headcounts, version dates, legal status).
45
+ - **Scope of a statistic.** "60% of users" — which users, measured how, by whom, over what window? An unscoped number is not yet evidence.
46
+ - **Conflict of interest in the source.** A vendor's own sizing of its market, a study funded by the party it favors. Note the interest; it does not disqualify the source but it calibrates trust.
47
+ - **Unstated assumptions and boundaries.** Where does a claim stop applying? A finding true "in the US" stated as if global is a defect.
48
+
49
+ ## What NOT to do
50
+
51
+ - **Do not answer a researchable question from training memory.** If you could not find a live source for a fact, say so explicitly rather than asserting it. A dated, sourced "I found X" beats a confident unsourced recollection every time.
52
+ - **Do not invent or guess sources.** Cite only what you (or a worker you spawned) actually retrieved. Never fabricate a URL, a title, or a publication date.
53
+ - **Do not make the decision for the caller.** Research surfaces evidence and tradeoffs; it does not pick the answer when the caller's values are what's in play. "X is cheaper, Y is more reliable, here's the data" — not "you should choose X." Recommendation is the caller's job.
54
+ - **Do not pad with restatement.** Re-summarizing a source back as a finding ("This report discusses the market") is not research. The finding is what the source *establishes*, with its evidence.
55
+ - **Do not overstate confidence.** Speculation dressed up as certainty is the worst failure mode. Low confidence, honestly reported, is useful.
56
+
57
+ ## Confidence calibration
58
+
59
+ - **high** — Multiple independent primary sources agree; the claim is well-dated and in scope.
60
+ - **medium** — Supported, but thinly (limited sources, secondary sourcing, or some staleness). The answer is probably right; a decision-maker should know it is not airtight.
61
+ - **low** — A single or weak source, genuine source conflict you could not resolve, or significant gaps. Report it as low and name what would raise it.
62
+
63
+ Low + honest beats high + speculative.
64
+
65
+ ## The report file
66
+
67
+ Write the durable deliverable as a markdown file (the researcher's base prompt tells you exactly where — \`workspace/\` by default, \`public/\` when the caller is untrusted). Use this skeleton:
68
+
69
+ \`\`\`markdown
70
+ # Research: <one-line question>
71
+
72
+ **Question:** <the actual question, restated sharply>
73
+ **Date:** <ISO date of the research pass>
74
+ **Confidence:** <high | medium | low> — <one sentence why>
75
+
76
+ ## Executive summary
77
+ <3-5 sentences: the answer to the actual need, and the one or two facts that justify it.>
78
+
79
+ ## Findings
80
+ ### <sub-question 1>
81
+ <answer> — evidence: <quote or figure>. Source: <url or local path, with date>.
82
+
83
+ ### <sub-question 2>
84
+ ...
85
+
86
+ ## Sources
87
+ - <url or /absolute/path> — <what it contributed, and its date>
88
+
89
+ ## Open questions
90
+ - <what you could not resolve, and what would resolve it>
91
+
92
+ ## Method
93
+ <one short paragraph: what you searched, what you delegated, what you could not reach.>
94
+ \`\`\`
95
+
96
+ ## Final output
97
+
98
+ After writing the file, end your turn with the researcher's neutral \`<report>\` block (summary, report file path, confidence, open questions). Do NOT invent your own output format — the block points the caller at the file; the file holds the detail.
99
+ `
100
+ // The loadable skill record: bundles the name, description, and markdown content constants of this file into one `LoadableSkill`.
101
+ export const GENERAL_RESEARCH_SKILL: LoadableSkill = {
102
+ name: GENERAL_RESEARCH_SKILL_NAME,
103
+ description: GENERAL_RESEARCH_SKILL_DESCRIPTION,
104
+ content: GENERAL_RESEARCH_SKILL_CONTENT,
105
+ }
@@ -0,0 +1,107 @@
1
+ import { constants } from 'node:fs'
2
+ import { type FileHandle, open, realpath } from 'node:fs/promises'
3
+ import path from 'node:path'
4
+
5
+ import { z } from 'zod'
6
+
7
+ import { defineTool, type Tool, type ToolContext } from '@/plugin'
8
+
9
/** Arguments accepted by the researcher's report-writing tool. */
export type WriteReportArgs = { path: string; content: string }

// Allowed report basenames: `research-<slug>.md`, where <slug> starts with a
// lowercase letter or digit and continues with lowercase letters, digits, or
// hyphens.
const REPORT_BASENAME_RE = /^research-[a-z0-9][a-z0-9-]*\.md$/

// One report per session. The researcher subagent object — and therefore this
// tool instance — is built ONCE by `createResearcherSubagent()` at plugin
// registration (src/bundled-plugins/researcher/index.ts) and reused for every
// spawn (run/index.ts reuses `entry.pluginSubagent.customTools`). A plain
// closure boolean would leak across concurrent and sequential sessions, so the
// "already wrote" state is keyed by the per-spawn `ctx.sessionId` instead.
//
// NOTE(review): entries are never evicted in this module, so the set grows by
// one session id per successful report for the lifetime of the process.
const sessionsWithReport = new Set<string>()

/**
 * Builds the dedicated, enforced report writer for the researcher subagent.
 *
 * The generic `write` tool is NOT given to the researcher: its runtime boundary
 * (the `non-workspace-write` guard) also allowlists IDENTITY.md / SOUL.md /
 * cron.json / typeclaw.json / mounts/ / packages/ and honors
 * `acknowledgeGuards`, so a guest-spawnable subagent holding generic `write`
 * could write far more than one report. This tool moves the boundary INTO a
 * narrow primitive so the contract is enforced in code, not prompt obedience:
 *
 * - the path must resolve to exactly
 *   `<agentDir>/{workspace,public}/research-<slug>.md` (no nested dirs, no
 *   other basenames, no other directories);
 * - the parent directory's realpath must be exactly
 *   `<realpath(agentDir)>/{workspace,public}`, so a `workspace` or `public`
 *   entry that is itself a symlink to somewhere else is rejected instead of
 *   followed;
 * - the file is created with O_EXCL, so an existing file or a planted
 *   final-path symlink is rejected rather than clobbered or followed;
 * - a second write in the same session is rejected (one report per spawn),
 *   including a second call that races the first;
 * - the schema is strict, so an `acknowledgeGuards` field is rejected, not
 *   silently stripped.
 *
 * @returns A tool whose `execute` writes the report and resolves with a text
 *   confirmation plus `{ path, bytes }` details, where `bytes` is the UTF-8
 *   size of the written content.
 * @throws Error from `execute` when the session already wrote a report, the
 *   path violates the constraints above, or the target file already exists.
 *   Other filesystem errors (e.g. a missing parent directory) propagate as-is.
 */
export function createWriteReportTool(): Tool<WriteReportArgs> {
  return defineTool<WriteReportArgs>({
    description: `Write your single research report as a markdown file. This is your ONLY way to write a file — there is no general write tool. Call it exactly once.

Constraints (enforced; a violation returns an error):
- \`path\` must be an absolute path of the form \`<agent>/workspace/research-<slug>.md\` or \`<agent>/public/research-<slug>.md\` — directly under workspace/ or public/, no subdirectories, basename \`research-<slug>.md\` where <slug> is lowercase letters, digits and hyphens.
- The file must not already exist (pick a unique slug, e.g. with a timestamp).
- You may write the report only once per session.

Write to \`public/\` instead of \`workspace/\` when your resolved role lacks \`fs.see.private\` (a guest caller cannot read \`workspace/\`); otherwise use \`workspace/\`.`,
    parameters: z.strictObject({
      path: z
        .string()
        .describe('Absolute path: <agent>/workspace/research-<slug>.md or <agent>/public/research-<slug>.md'),
      content: z.string().describe('The full markdown report body.'),
    }),
    async execute(args: WriteReportArgs, ctx: ToolContext) {
      if (sessionsWithReport.has(ctx.sessionId)) {
        throw new Error('A report has already been written for this session. You may write exactly one report.')
      }

      const target = path.resolve(args.path)
      const agentDir = path.resolve(ctx.agentDir)
      const workspaceDir = path.join(agentDir, 'workspace')
      const publicDir = path.join(agentDir, 'public')

      const parent = path.dirname(target)
      const base = path.basename(target)

      if (!REPORT_BASENAME_RE.test(base)) {
        throw new Error(
          `Report filename must match research-<slug>.md (lowercase slug), got: ${base}. Path: ${target}.`,
        )
      }
      if (parent !== workspaceDir && parent !== publicDir) {
        throw new Error(
          `Report must be written directly under ${workspaceDir} or ${publicDir} (no subdirectories), got parent: ${parent}.`,
        )
      }

      // Claim the session's single slot synchronously, before the first await,
      // so two concurrent calls in one session cannot both pass the check
      // above. The claim is released again if this attempt fails.
      sessionsWithReport.add(ctx.sessionId)
      try {
        // `parent` is lexically one of the two allowed directories, so comparing
        // its realpath with the realpath of that same directory would always
        // succeed. Compare against the location it must have under the REAL
        // agent directory instead: a symlinked `workspace`/`public` resolves
        // elsewhere and is rejected. Only the directory actually being written
        // to is resolved, so a missing sibling directory does not fail the call.
        const [realParent, realAgentDir] = await Promise.all([realpath(parent), realpath(agentDir)])
        const expectedParent = path.join(realAgentDir, path.basename(parent))
        if (realParent !== expectedParent) {
          throw new Error(`Report parent directory resolves outside the allowed report directories: ${parent}.`)
        }

        let handle: FileHandle | undefined
        try {
          // O_EXCL: fail rather than overwrite an existing file or follow a
          // symlink planted at the final path.
          handle = await open(target, constants.O_CREAT | constants.O_EXCL | constants.O_WRONLY, 0o644)
          await handle.writeFile(args.content, 'utf8')
        } catch (err) {
          if (err instanceof Error && 'code' in err && err.code === 'EEXIST') {
            throw new Error(`Report file already exists: ${target}. Choose a unique slug (e.g. add a timestamp).`)
          }
          throw err
        } finally {
          await handle?.close()
        }
      } catch (err) {
        sessionsWithReport.delete(ctx.sessionId)
        throw err
      }

      // Report the encoded size, not the UTF-16 length of the string.
      const bytes = Buffer.byteLength(args.content, 'utf8')
      return {
        content: [{ type: 'text' as const, text: `Wrote research report: ${target} (${bytes} bytes).` }],
        details: { path: target, bytes },
      }
    },
  })
}
@@ -14,17 +14,31 @@ import {
14
14
  } from '@/plugin'
15
15
 
16
16
  import { CODE_REVIEW_SKILL } from './skills/code-review'
17
+ import { DATA_REVIEW_SKILL } from './skills/data-review'
18
+ import { DOC_REVIEW_SKILL } from './skills/doc-review'
17
19
  import { GENERAL_REVIEW_SKILL } from './skills/general'
20
+ import { PLAN_REVIEW_SKILL } from './skills/plan-review'
21
+ import { SECURITY_AUDIT_SKILL } from './skills/security-audit'
22
+ import { WRITING_REVIEW_SKILL } from './skills/writing-review'
18
23
 
19
24
  // The curated set of review-domain skills the reviewer can load on
20
25
  // demand via its `load_skill` tool. Order is the order the model sees
21
26
  // in the tool description; put the most common case first so the
22
27
  // menu's first impression is the right one for the typical caller.
28
+ // `general` stays last: it is the fallback the model reaches for only
29
+ // when no domain skill fits.
23
30
  //
24
- // Ship list is intentionally small for the first release. Adding a
25
- // skill is a one-line append here plus a new file under `./skills/`;
26
- // no runtime change required.
27
- export const REVIEWER_SKILLS: readonly LoadableSkill[] = [CODE_REVIEW_SKILL, GENERAL_REVIEW_SKILL]
31
+ // Adding a skill is a one-line append here plus a new file under
32
+ // `./skills/`; no runtime change required.
33
+ export const REVIEWER_SKILLS: readonly LoadableSkill[] = [
34
+ CODE_REVIEW_SKILL,
35
+ DOC_REVIEW_SKILL,
36
+ PLAN_REVIEW_SKILL,
37
+ SECURITY_AUDIT_SKILL,
38
+ WRITING_REVIEW_SKILL,
39
+ DATA_REVIEW_SKILL,
40
+ GENERAL_REVIEW_SKILL,
41
+ ]
28
42
 
29
43
  // Without a ceiling, a reviewer whose `session.prompt` stalls mid-turn (model
30
44
  // wedges after a tool error, never emits a terminal message) leaves `completion`
@@ -75,7 +89,7 @@ The runtime exposes these tools to you by these EXACT names — call them by nam
75
89
  - \`grep\` — search file contents by text or regex
76
90
  - \`find\` — locate files by name pattern
77
91
  - \`ls\` — list a directory's immediate contents
78
- - \`bash\` — read-only commands ONLY. Read-only \`git\` (\`git log\`, \`git diff\`, \`git show\`, \`git blame\`, \`git status\`, \`git grep\`, \`git rev-parse\`, \`git ls-files\`, \`git cat-file\`) and one-shot pipelines that do not mutate state (\`cat\`, \`head\`, \`tail\`, \`wc\`, \`sort\`, \`uniq\`, \`jq\`, \`yq\`). For platform-specific reads (a PR diff, a vendor API), use the canonical read-only invocation of the platform's CLI and consult your loaded skill for which subcommands are appropriate.
92
+ - \`bash\` — read-only commands ONLY. Read-only \`git\` (\`git log\`, \`git diff\`, \`git show\`, \`git blame\`, \`git status\`, \`git grep\`, \`git rev-parse\`, \`git ls-files\`, \`git cat-file\`) and one-shot pipelines that do not mutate state (\`cat\`, \`head\`, \`tail\`, \`wc\`, \`sort\`, \`uniq\`, \`jq\`). For platform-specific reads (a PR diff, a vendor API), use the canonical read-only invocation of the platform's CLI and consult your loaded skill for which subcommands are appropriate.
79
93
  - \`web_search\` — search the public web (e.g. for OWASP guidance, RFCs, library changelogs, framework docs, prior art)
80
94
  - \`web_fetch\` — fetch a single URL (e.g. to read a linked spec, vendor doc, or article cited in the target)
81
95
  - \`load_skill\` — load a curated review skill by name. See the section below.
@@ -137,9 +151,11 @@ End every response with a single \`<review>\` block. Use this exact structure:
137
151
  <verdict>approve | request-changes | comment</verdict>
138
152
  </review>
139
153
 
140
- \`approve\` = no blockers; concerns are minor or already addressed.
141
- \`request-changes\` = at least one blocker, or a load-bearing concern that needs an answer before this lands.
142
- \`comment\` = neither useful observations without a clear approve/reject signal (typical for early drafts, exploratory documents, partial reviews).
154
+ These three tokens are the universal verdict vocabulary — they apply whether the target is a code change, a plan, a document, or a dataset. Keep the tokens exactly; the loaded skill tells you what each one means for its domain.
155
+
156
+ \`approve\` = no blockers; the target is sound and any concerns are minor or already addressed.
157
+ \`request-changes\` = at least one blocker, or a load-bearing concern that needs an answer before the target should be accepted, shipped, or executed.
158
+ \`comment\` = neither — useful observations that do not resolve to a clear accept/reject signal (typical for early drafts, exploratory documents, partial reviews).
143
159
 
144
160
  ## Rules
145
161
 
@@ -180,6 +196,8 @@ If none of the listed skills fit the target, load \`general\`. Keep the skill-se
180
196
  customTools: [loadSkillTool],
181
197
  payloadSchema: reviewerPayloadSchema,
182
198
  visibility: 'public',
199
+ rosterDescription:
200
+ 'deep read-only code/PR/plan review in a fresh context, returns a structured verdict; it does NOT post — you act on its findings',
183
201
  canSpawnSubagents: true,
184
202
  timeoutMs: REVIEWER_SPAWN_TIMEOUT_MS,
185
203
  inFlightKey: (payload) => payload?.requestId ?? `anon-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
@@ -0,0 +1,77 @@
1
+ import type { LoadableSkill } from '@/plugin'
2
+
3
+ export const DATA_REVIEW_SKILL_NAME = 'data-review'
4
+
5
+ export const DATA_REVIEW_SKILL_DESCRIPTION =
6
+ 'Review structured data and its shape: a database schema or migration, a dataset, a query result, a spreadsheet, a data contract. Covers constraints, nullability, types, indexing, migration safety, and dataset integrity (duplicates, referential integrity, mixed types, aggregate errors).'
7
+
8
+ export const DATA_REVIEW_SKILL_CONTENT = `# data-review
9
+
10
+ You have been asked to review structured data — a database schema, a migration, a dataset, a query result, a spreadsheet, or a data contract. Two kinds of target live here and they need different lenses: **the shape** (schema/migration/contract — the rules data must follow) and **the data itself** (a dataset/file — whether the values obey those rules). Identify which you are reviewing, then apply the matching section below on top of the reviewer's neutral output contract.
11
+
12
+ ## How to acquire the target
13
+
14
+ - **A migration or schema file** — \`read\` it; \`grep\` the surrounding migrations for the current state of the table being altered, because migration safety depends on what already exists.
15
+ - **A dataset / CSV / JSONL** — \`read\` a representative slice (head and a sample of the middle), not just the first rows. Defects hide past the part that looks clean.
16
+ - **A data contract (dbt model, ODCS YAML, JSON Schema)** — \`read\` it and confirm every field declares a type and a nullability/required flag.
17
+ - **A query result in the payload** — read it carefully; quote the offending rows as evidence.
18
+
19
+ ## Reviewing the shape: schema, migration, contract
20
+
21
+ The shape's job is to make invalid states unrepresentable. A finding here is a rule that is missing, wrong, or unsafe to apply.
22
+
23
+ 1. **Constraints that should exist but do not.** A business-key column (email, SKU, order id) with no \`UNIQUE\`. A required column left nullable. A relationship enforced only in application code with no \`FOREIGN KEY\`. A bounded value (\`status\`, \`price >= 0\`) with no \`CHECK\` or enum. Each missing constraint is a way for schema-invalid data to enter and survive.
24
+ 2. **Wrong types.** Money in \`FLOAT\`/\`DOUBLE\` instead of fixed-point \`NUMERIC\` — guarantees rounding drift. Dates or timestamps stored as strings. \`timestamp\` without time zone where UTC-aware (\`timestamptz\`) is meant. A numeric type with no precision/scale that silently coerces.
25
+ 3. **Nullability mistakes.** A nullable foreign key for a relationship that is logically required; a NOT NULL on a column the data sometimes genuinely lacks (forcing sentinel values like empty string that confuse "missing" with "blank").
26
+ 4. **Indexing.** A foreign key with no supporting index (JOINs and cascade deletes table-scan). A frequently-filtered column with no index. Over-indexing a write-heavy table.
27
+ 5. **Migration safety.** This is where a schema change becomes an outage:
28
+ - Adding a \`NOT NULL\` column to a populated table with no default or backfill — the migration aborts on existing rows. The safe form is additive-then-tighten: add nullable, backfill, set NOT NULL in a later step.
29
+ - A destructive single-step change (drop/rename a column live) instead of expand-then-contract.
30
+ - \`CREATE INDEX\` without \`CONCURRENTLY\` on a live table — locks writes for the duration.
31
+ - A migration with no reverse/rollback path.
32
+ 6. **Contract completeness.** For a data contract: every field declares a type and required-flag, the primary key is declared, references resolve to real fields, quality thresholds are realistic, and breaking changes (column removal/rename/type change) are versioned, not silent.
33
+
34
+ ## Reviewing the data itself: dataset, file, spreadsheet
35
+
36
+ Here the rules may be implicit; your job is to find values that violate what the data clearly intends. Borrow the runtime's own vocabulary: data that fails its intended shape is **schema-invalid**, and a strict consumer is **fail-closed** — it drops or rejects bad rows rather than silently half-accepting them, so a defect that looks cosmetic can erase real records.
37
+
38
+ 1. **Mixed types in one column.** A single column carrying integers, currency-formatted strings ("$1,200"), and blanks. A consumer expecting a number is schema-invalid against these rows; a fail-closed parser drops them silently and the totals are quietly wrong.
39
+ 2. **Completeness.** Required fields that are NULL or blank. Watch the empty-string-vs-NULL confusion — they are not the same "missing" and treating them alike corrupts counts.
40
+ 3. **Uniqueness.** Duplicate rows, or duplicate business keys where the data plainly intends one row per key.
41
+ 4. **Referential integrity.** Orphan rows pointing at parent keys that do not exist; this is a foreign-key violation the file format did not enforce.
42
+ 5. **Aggregate errors.** The classic, high-impact dataset bug: a grand-total whose range includes the subtotal rows (double-counting), or an average/sum whose range silently omits rows that belong in it. State which rows are wrongly included or excluded — this is the Reinhart-Rogoff class of defect and it is almost always a blocker.
43
+ 6. **Identifier corruption from auto-formatting.** Spreadsheet date/number coercion mangling identifiers (gene names like \`SEPT2\` becoming \`2-Sep\`, leading zeros stripped from zip codes / IDs). Flag any identifier column stored in a general/auto format.
44
+ 7. **Distribution and freshness anomalies.** Out-of-range values (negative ages, future timestamps), a sudden volume drop or spike, data older than its freshness expectation.
45
+ 8. **PII / secret leakage.** Emails, phone numbers, tokens, or keys appearing in columns not marked sensitive, or stored in plaintext where the surrounding data is classified.
46
+
47
+ ## What NOT to find
48
+
49
+ - **Formatter / tooling territory.** SQL keyword casing, trailing commas in a migration, CSV quoting style a parser handles — not your concern.
50
+ - **Settled project conventions the target follows.** If the codebase uses \`snake_case\` columns, UUIDv7 keys, or \`.passthrough()\` to tolerate extra columns by design, matching that is not a finding; only deviation is.
51
+ - **Cosmetic dataset quirks with no consumer impact.** A harmless trailing blank line, column order that no consumer depends on. If nothing breaks, do not raise it.
52
+ - **Restating the schema or data.** "This table stores users" / "this column has numbers" is not a finding.
53
+ - **Generic "add validation".** Without naming the specific column and the specific invalid state it admits, "needs more validation" is noise.
54
+
55
+ ## Severity hints specific to data
56
+
57
+ - **blocker** — Money in floating-point, a migration that aborts or locks production, a missing constraint that admits corrupt rows, an aggregate that double-counts or silently drops rows, identifier corruption, PII in plaintext. Data defects are often blockers because they are silent and compounding.
58
+ - **concern** — A missing index that will degrade as the table grows, a nullable FK that should be required, a freshness/volume anomaly that needs explanation, schema drift from the declared contract.
59
+ - **nit** — A naming inconsistency, a missing audit column, a tolerable-but-suboptimal type. Optional.
60
+ - **praise** — A constraint set that makes an invalid state genuinely unrepresentable, a migration written safely as expand-then-contract, a contract whose quality thresholds are realistic. Rare.
61
+
62
+ ## Verdict mapping
63
+
64
+ - **approve** — The shape is sound or the data is clean; any issues are nits.
65
+ - **request-changes** — At least one blocker: a corrupting type choice, an unsafe migration, an aggregate error, a constraint gap that admits bad data.
66
+ - **comment** — Useful observations without a clean accept/reject. Common for a partial dataset audit or an early-stage schema sketch.
67
+
68
+ ## Final output
69
+
70
+ Return findings inside the reviewer's neutral \`<review>\` block. Do NOT invent your own output format.
71
+ `
72
+
73
// Loadable-skill descriptor for `data-review`: bundles the skill's name,
// description, and markdown content constants into one `LoadableSkill` value.
+ export const DATA_REVIEW_SKILL: LoadableSkill = {
74
+ name: DATA_REVIEW_SKILL_NAME,
75
+ description: DATA_REVIEW_SKILL_DESCRIPTION,
76
+ content: DATA_REVIEW_SKILL_CONTENT,
77
+ }