npm - typeclaw - Versions diffs - 0.28.1 → 0.29.0 - Mend

typeclaw 0.28.1 → 0.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)

package/package.json +1 -1
package/src/agent/index.ts +37 -5
package/src/agent/loop-guard.ts +112 -26
package/src/agent/plugin-tools.ts +102 -41
package/src/agent/session-origin.ts +3 -3
package/src/agent/subagents.ts +7 -0
package/src/agent/system-prompt.ts +29 -4
package/src/agent/tools/channel-reply.ts +1 -0
package/src/agent/tools/channel-send.ts +2 -1
package/src/agent/tools/spawn-subagent.ts +21 -0
package/src/agent/tools/subagent-output.ts +7 -3
package/src/agent/tools/wikipedia.ts +1 -1
package/src/bundled-plugins/explorer/explorer.ts +2 -0
package/src/bundled-plugins/github-cli-auth/approve-idempotency.ts +74 -0
package/src/bundled-plugins/github-cli-auth/effective-approval.ts +98 -0
package/src/bundled-plugins/github-cli-auth/gh-review-inline-detect.ts +130 -0
package/src/bundled-plugins/github-cli-auth/index.ts +27 -2
package/src/bundled-plugins/github-cli-auth/review-recorder.ts +12 -4
package/src/bundled-plugins/memory/memory-logger.ts +3 -3
package/src/bundled-plugins/operator/operator.ts +2 -0
package/src/bundled-plugins/planner/index.ts +11 -0
package/src/bundled-plugins/planner/planner.ts +282 -0
package/src/bundled-plugins/planner/skills/general.ts +65 -0
package/src/bundled-plugins/planner/skills/project.ts +69 -0
package/src/bundled-plugins/researcher/index.ts +11 -0
package/src/bundled-plugins/researcher/researcher.ts +226 -0
package/src/bundled-plugins/researcher/skills/general.ts +105 -0
package/src/bundled-plugins/researcher/write-report.ts +107 -0
package/src/bundled-plugins/reviewer/reviewer.ts +29 -11
package/src/bundled-plugins/reviewer/skills/data-review.ts +77 -0
package/src/bundled-plugins/reviewer/skills/doc-review.ts +79 -0
package/src/bundled-plugins/reviewer/skills/general.ts +1 -1
package/src/bundled-plugins/reviewer/skills/plan-review.ts +64 -0
package/src/bundled-plugins/reviewer/skills/security-audit.ts +70 -0
package/src/bundled-plugins/reviewer/skills/writing-review.ts +63 -0
package/src/bundled-plugins/scout/scout.ts +2 -0
package/src/bundled-plugins/security/policies/prompt-injection.ts +8 -4
package/src/bundled-plugins/security/policies/secret-exfil-bash.ts +3 -2
package/src/channels/adapters/discord-bot.ts +38 -11
package/src/channels/adapters/github/inbound.ts +74 -9
package/src/channels/adapters/github/index.ts +36 -11
package/src/channels/adapters/github/reconcile-open-prs.ts +306 -0
package/src/channels/adapters/github/review-state.ts +71 -2
package/src/channels/adapters/kakaotalk-classify.ts +2 -2
package/src/channels/adapters/kakaotalk.ts +2 -2
package/src/channels/adapters/slack-bot-classify.ts +1 -1
package/src/channels/adapters/slack-bot.ts +3 -0
package/src/channels/adapters/telegram-bot.ts +3 -0
package/src/channels/engagement.ts +12 -7
package/src/channels/github-rereview-guard.ts +32 -8
package/src/channels/github-review-claim.ts +53 -6
package/src/channels/router.ts +44 -9
package/src/channels/schema.ts +4 -3
package/src/channels/types.ts +17 -6
package/src/cli/init.ts +13 -2
package/src/cli/ui.ts +64 -0
package/src/config/config.ts +21 -15
package/src/container/start.ts +5 -1
package/src/init/dockerfile.ts +19 -56
package/src/init/hatching.ts +1 -1
package/src/init/index.ts +5 -1
package/src/run/bundled-plugins.ts +4 -0
package/src/server/index.ts +24 -5
package/src/shared/host-locale.ts +27 -0
package/src/shared/protocol.ts +1 -1
package/src/shared/wordmark.ts +19 -0
package/src/skills/typeclaw-channel-github/SKILL.md +1 -1
package/src/skills/typeclaw-config/SKILL.md +32 -32
package/src/skills/typeclaw-kaomoji/SKILL.md +3 -3
package/src/skills/typeclaw-tunnels/SKILL.md +3 -1
package/src/tui/banner.ts +19 -0
package/src/tui/format.ts +34 -0
package/src/tui/index.ts +121 -22
package/src/tui/theme.ts +26 -1
package/src/tunnels/providers/cloudflare-named.ts +15 -4
package/src/tunnels/providers/cloudflare-quick.ts +15 -4
package/src/tunnels/providers/cloudflared-binary.ts +11 -0
package/typeclaw.schema.json +15 -7

package/src/bundled-plugins/reviewer/skills/doc-review.ts ADDED

@@ -0,0 +1,79 @@
+import type { LoadableSkill } from '@/plugin'
+export const DOC_REVIEW_SKILL_NAME = 'doc-review' // Skill identifier; supplied as the `name` field of DOC_REVIEW_SKILL.
+export const DOC_REVIEW_SKILL_DESCRIPTION = // One-sentence scope summary; supplied as the `description` field of DOC_REVIEW_SKILL.
+  'Review a document written to inform or instruct a reader. Covers purpose and audience fit, completeness for the stated job, accuracy of examples and claims, navigability, staleness, terminology consistency, and accessibility — for any kind of document, with a scoped lens for technical docs when the target is one.'
+export const DOC_REVIEW_SKILL_CONTENT = `# doc-review
+You have been asked to review a document — anything written to inform or instruct a reader. Do not assume a kind. The craft below is universal and applies whatever the document turns out to be; the technical-docs section near the end is one specialization you apply only when the target is in fact developer documentation. Read the target, let it tell you what it is, and review it on its own terms. Apply all of this on top of the reviewer's neutral output contract (severity-tagged findings, evidence quotes, suggestions, verdict).
+## How to acquire the target
+- **A file path** — \`read\` it. \`ls\` the surrounding directory to see how this page fits the larger set; a document is reviewed in the context of the set it belongs to.
+- **A URL** — \`web_fetch\` it. If it is a private site the fetch cannot reach, say so in \`<summary>\` and review what the payload provided.
+- **A PR or diff that touches docs** — \`gh pr diff <n>\` for the changed pages; \`read\` the surrounding sections the diff did not touch, because a doc edit is judged against the whole page's flow, not the hunk alone.
+- **A doc set / directory** — \`ls\` and \`grep\` for the navigation or index file; a finding about findability needs the table of contents, not just one page.
+- **Verify external claims.** If the document cites a law, a standard, a price, an SLA, a statistic, or a linked source, check it with \`web_search\`/\`web_fetch\` before letting it stand.
+## State the document's job before reading for defects
+Every document exists to let a specific reader do or understand a specific thing. Before looking for problems, answer two questions and hold them while you read: **Who is this for?** and **What should they be able to do or know after reading it?** Most documentation defects are a mismatch between the page and the answer to one of those. A finding is strongest when it names which reader the document fails and why. This is your private grounding — keep the restatement out of \`<summary>\`.
+## What to look for
+These apply to any document:
+1. **Purpose / audience fit.** The document is pitched at the wrong reader: jargon and unexplained acronyms for a lay audience, or hand-holding for an expert one; a policy written for lawyers handed to new hires. Name the mismatch.
+2. **Completeness for the stated job.** A missing step, an undocumented edge case, a process that stops before the reader's actual goal, a policy that does not say what happens on violation. The gap is a finding when it leaves the reader unable to finish what the document promised.
+3. **Accuracy of examples and claims.** Anything the document asserts as fact must be correct: a quoted figure, a referenced rule or standard, a worked example, a screenshot, a sample. A wrong example or an unsupported load-bearing claim is a real defect regardless of document type — for code docs this means samples that do not run; for a policy it means a cited regulation that says something else.
+4. **Missing prerequisites or assumptions.** The document assumes access, a prior step, a role, a tool, or background the reader does not have and is never told to get. State the assumption the reader cannot meet.
+5. **Navigability / findability.** A page unreachable from the index, no clear next step where the reader needs one, no anchor for the thing a reader will search for, a long document with no structure to scan. Hard-to-navigate is a defect when it blocks the reader from reaching the part they need.
+6. **Broken or stale cross-references.** Links that 404, "see the section below" pointing at a section that no longer exists, references to a renamed page or a superseded policy version.
+7. **Staleness.** Content that describes an older state than the one in force: an old release's flags, a deprecated process, a price or date that has moved, a screenshot of a UI that changed. Cite the current value against the stale one.
+8. **Terminology / consistency.** The same concept called several names with no statement they are the same ("workspace" / "project" / "folder"; "member" / "user" / "seat"). Pick the canonical term and flag the drift.
+9. **Accessibility.** Images with no alt text, heading levels that skip (h1 → h3), meaning carried by color alone, bare "click here" link text. Real findings, not nits, when they block a reader using assistive tech.
+## When the target is technical documentation
+Developer docs have a failure mode worth naming explicitly: the page is the wrong *type* for what the reader needs. When reviewing technical docs, classify the page against the four Diátaxis modes, because the right content for one is wrong for another:
+- **Tutorial** — learning-oriented. A guaranteed-to-succeed lesson for a newcomer. Concrete, linear, no detours.
+- **How-to guide** — task-oriented. Steps to achieve one stated goal for someone who already knows the basics.
+- **Reference** — information-oriented. Dry, complete, accurate description of the API/CLI/config. No teaching.
+- **Explanation** — understanding-oriented. The "why" and the trade-offs. No step-by-step.
+A page that mixes modes — a tutorial padded with architecture explanation, a reference that drifts into opinion — fails the reader who came for one of them. For technical docs, also hold examples to a higher bar: a code sample is a *claim that it runs*, so trace it against the real CLI/API surface (\`read\` the source, \`gh pr diff\`, the changelog) and cite any sample that uses a removed flag, a wrong import, or a renamed subcommand. This Diátaxis lens and runnable-sample check do NOT apply to non-technical documents — do not force a policy or an onboarding page into a "tutorial vs reference" frame.
+## What NOT to find
+- **Formatter / linter territory.** Trailing whitespace, line length, fenced-block language tags, table alignment. Assume a docs linter ran.
+- **House-style the page follows.** Second person, sentence-case headings, "e.g." vs "for example" — if the document is consistent with its house style, that is not a finding. Only the deviation is.
+- **Restating the document as a finding.** "This page documents the start command" / "this policy covers expenses" is not a review.
+- **Rewriting for taste.** A sentence you would have phrased differently but that reads clearly for its reader is not a finding. Clarity is the bar, not your preferred cadence.
+- **Generic "add more examples" / "make it clearer".** Without naming the specific step, field, or passage that is under-documented or unclear, it is noise.
+## Severity hints specific to docs
+- **blocker** — An example or claim that is factually wrong and will lead the reader astray (a sample that fails for everyone, a cited rule that says the opposite). A prerequisite gap that strands the reader at step one. An audience mismatch so severe the intended reader cannot use the document at all.
+- **concern** — A stale reference that still mostly works but will mislead, a missing prerequisite for a later step, a completeness gap that blocks an edge case, terminology drift that will confuse a newcomer, an accessibility defect that degrades but does not block.
+- **nit** — A single awkward sentence, a missing "next step" link, a minor terminology wobble in an aside. Optional.
+- **praise** — A document that genuinely lands its reader: a tutorial that reaches a working state, a policy that is unambiguous on the hard case, an explanation that makes a difficult concept click. Rare.
+## Verdict mapping
+- **approve** — Publishable. The document serves its reader; any gaps are nits.
+- **request-changes** — At least one blocker: a wrong example or claim, an audience mismatch that defeats the purpose, a prerequisite gap that strands the reader.
+- **comment** — Useful observations without a clean accept/reject. Common for an early draft or a partial review of a large doc set.
+## Final output
+Return findings inside the reviewer's neutral \`<review>\` block. Do NOT invent your own output format.
+`
+export const DOC_REVIEW_SKILL: LoadableSkill = { // Bundles the three doc-review constants into one LoadableSkill value.
+  name: DOC_REVIEW_SKILL_NAME,
+  description: DOC_REVIEW_SKILL_DESCRIPTION,
+  content: DOC_REVIEW_SKILL_CONTENT, // Markdown guidance text held in the template literal.
+}

package/src/bundled-plugins/reviewer/skills/general.ts CHANGED

@@ -20,7 +20,7 @@ You have been asked to review something that does not clearly fit a specific dom
 A general review is the hardest because there are no domain shortcuts. Replace shortcuts with discipline:
-1. **State the target's purpose in your own words.** What is the artifact trying to achieve? Who is it for? Put this in \`<summary>\`. If you cannot state it after reading, that itself is a finding — the artifact does not communicate its purpose.
+1. **State the target's purpose in your own words — to yourself, as a comprehension check.** What is the artifact trying to achieve? Who is it for? If you cannot state it after reading, that itself is a finding — the artifact does not communicate its purpose. This is your private grounding, not summary copy: keep the restatement out of \`<summary>\`, which stays a terse verdict justification per the output contract.
 2. **Identify the load-bearing claims.** What does the artifact assert that, if wrong, would invalidate the whole thing? List them mentally before looking for issues.
 3. **Stress-test the load-bearing claims.** For each one: is the evidence sufficient? Are the assumptions stated? Are the counter-arguments addressed?
 4. **Stress-test the boundaries.** Where does the artifact's argument or design stop applying? Does it acknowledge that boundary, or does it overgeneralize?

package/src/bundled-plugins/reviewer/skills/plan-review.ts ADDED

@@ -0,0 +1,64 @@
+import type { LoadableSkill } from '@/plugin'
+export const PLAN_REVIEW_SKILL_NAME = 'plan-review' // Skill identifier; supplied as the `name` field of PLAN_REVIEW_SKILL.
+export const PLAN_REVIEW_SKILL_DESCRIPTION = // One-sentence scope summary; supplied as the `description` field of PLAN_REVIEW_SKILL.
+  'Review a plan, RFC, design doc, PRFAQ, or task breakdown. Covers problem framing, measurable success criteria, alternatives considered, reversibility (one-way vs two-way doors), risk and dependency analysis, and RFC-2119 requirement-keyword discipline.'
+export const PLAN_REVIEW_SKILL_CONTENT = `# plan-review
+You have been asked to review a plan — an RFC, a design doc, a PRFAQ, a roadmap, a todo breakdown, or any document that proposes a course of action. Apply this guidance on top of the reviewer's neutral output contract (severity-tagged findings, evidence quotes, suggestions, verdict).
+## How to acquire the target
+- **A file path** — \`read\` it. \`ls\` the directory for a template or sibling RFCs that establish the expected shape; deviation from an established plan template is itself worth noting.
+- **A URL or doc** — \`web_fetch\` it. If it is a private doc the fetch cannot reach, say so in \`<summary>\` and review what the payload provided.
+- **A PR that adds a design doc** — \`gh pr diff <n>\`, then \`read\` the linked issue or prior discussion if one is referenced; a plan is judged partly on whether it answers the question that prompted it.
+- **An inline plan in the payload** — read it carefully and quote from it when forming evidence.
+## What to look for
+1. **Problem framing.** Does the plan state the problem before the solution? Who feels the pain, and what does "solved" look like for them? A plan that opens with the solution and never names the problem cannot be evaluated — that gap is the first finding.
+2. **Measurable success criteria.** Goals must be checkable. "Improve performance" is unverifiable; "P95 latency under 200ms on the checkout path" is. Flag every load-bearing goal that has no metric, threshold, or acceptance condition.
+3. **Alternatives considered.** A serious proposal names the approaches it rejected and why. A plan that presents one path as if it were the only path has hidden its reasoning — ask for the alternatives, because the rejected ones are where the real trade-off lives.
+4. **Reversibility — one-way vs two-way doors.** Identify the decisions that are hard or impossible to undo: public API contracts, on-disk schema changes, data migrations, anything external parties will depend on. A plan that makes a one-way-door decision without acknowledging it as irreversible has under-weighted its own risk. This is frequently the single most important finding.
+5. **Risk and dependency analysis.** External dependencies, blocking teams, legal or compliance constraints, the order in which steps must land. A plan whose step 3 silently depends on a team that has not agreed is carrying unpriced risk.
+6. **Scope boundaries.** What is explicitly in scope and out of scope? A plan that conflates several unrelated changes, or whose title promises A but whose body spends half its bytes on B, has a scope problem — either the scope is wrong or the framing is.
+7. **Requirement-keyword discipline (RFC-2119).** If the plan uses MUST / SHOULD / MAY, are they used in their precise senses, or interchangeably? A "SHOULD" that is actually a "MUST" will be implemented as optional and bite later. Flag normative keywords whose strength does not match their intent.
+8. **Rollback / recovery.** For a plan that changes a running system, how is it undone if it fails, and how long does that take? Absence of a rollback story is a finding when the change is risky enough to need one.
+## Review every plan as a first review — do not guess its maturity
+You are almost never told whether a plan is a first draft, a final RFC, or something between. Do NOT guess, and do NOT let the absence of that signal bias you — neither toward over-blocking (treating a sketch as a contract) nor toward over-softening (treating a serious proposal as throwaway). Review what is actually on the page, every time, as if you are seeing it fresh with no prior history. This neutrality is the point: a plan's verdict should come from the plan, not from an assumption about its stage you had to invent.
+In practice:
+- **Judge the idea, not the polish.** A plan can be early and still sound, or finished and still wrong. Your findings target whether the *approach* holds up — internal consistency, reversibility, measurable success, acknowledged alternatives — not how complete the document looks.
+- **Missing context is missing context, not a defect.** A plan reviewed cold will omit things a real org would supply: who owns it, the deadline, the budget, the constraint that rules out option B. Do NOT raise each absence as its own blocker — that is exactly the generic-review noise the contract forbids. Fold what you would genuinely need into a single \`comment\`-level finding: "To judge this as ready-to-execute I'd need the owning team, a success metric, and the rollback constraint." One finding, not ten.
+- **An unfilled section is only a finding if its absence breaks the idea.** A plan with no rollback section is not automatically blocked — unless the plan's viability *depends* on a rollback that may be impossible, in which case the gap is the finding and you say why. Empty-by-stage is not the same as flawed. Test each gap: does this missing piece change whether the approach is sound, or is it just not written yet?
+- **Real flaws are still blockers, regardless of stage.** Reviewing cold does not mean reviewing soft. An internal contradiction, a one-way-door decision the plan does not acknowledge as irreversible, a success criterion that is unmeasurable *as written*, or a recommendation with no alternatives considered — these are flaws in the idea itself. Raise them at full severity whether the plan is draft or final.
+- **State your footing in \`<summary>\`, once.** Open with one clause naming what you could and could not assess: "Reviewed on its own terms; no constraints or finality were stated, so the verdict reflects the idea as written, not its fit to an unstated bar." This keeps the review honest about the context it lacked instead of pretending to a certainty it does not have — without guessing at a maturity label. Keep this to one clause; it is grounding, not a process narration.
+## Severity hints specific to plans
+- **blocker** — A load-bearing flaw in the approach: an internal contradiction, a one-way-door decision treated as reversible, a goal that cannot be verified as written, a plan whose central mechanism cannot work. The kind of problem that makes executing the plan a mistake.
+- **concern** — A weakness that should be answered before commitment: a missing alternative that undercuts the recommendation, an unpriced dependency, a scope ambiguity that will mislead implementers, a normative keyword whose strength is wrong.
+- **nit** — A small clarity or structure issue, a section that could be tightened, a stage-normal gap worth a one-line mention.
+- **praise** — A non-obvious risk surfaced and handled, a reversibility analysis done honestly, a success metric that is genuinely measurable. Rare.
+## Verdict mapping
+- **approve** — The idea holds and the gaps are stage-normal. No load-bearing flaw in the approach.
+- **request-changes** — At least one blocker: a flaw in the approach that needs an answer before this should be committed to.
+- **comment** — Useful observations that do not resolve to a clean accept/reject. Common when reviewing a plan cold, where your job is to surface what is unverified rather than to gate it.
+## Final output
+Return findings inside the reviewer's neutral \`<review>\` block. Do NOT invent your own output format.
+`
+export const PLAN_REVIEW_SKILL: LoadableSkill = { // Bundles the three plan-review constants into one LoadableSkill value.
+  name: PLAN_REVIEW_SKILL_NAME,
+  description: PLAN_REVIEW_SKILL_DESCRIPTION,
+  content: PLAN_REVIEW_SKILL_CONTENT, // Markdown guidance text held in the template literal.
+}

package/src/bundled-plugins/reviewer/skills/security-audit.ts ADDED

@@ -0,0 +1,70 @@
+import type { LoadableSkill } from '@/plugin'
+export const SECURITY_AUDIT_SKILL_NAME = 'security-audit' // Skill identifier; supplied as the `name` field of SECURITY_AUDIT_SKILL.
+export const SECURITY_AUDIT_SKILL_DESCRIPTION = // One-sentence scope summary; supplied as the `description` field of SECURITY_AUDIT_SKILL.
+  'Audit code or configuration through a threat-model lens: injection, broken access control, SSRF, insecure deserialization, secrets exposure, path traversal, TOCTOU, and cryptographic failures. Maps findings to OWASP/CWE and reasons about exploitability, not style.'
+export const SECURITY_AUDIT_SKILL_CONTENT = `# security-audit
+You have been asked to audit a target for security defects. This is not a general code review with a security flavor — it is an adversarial read. Assume an attacker controls every input the target does not prove it controls, and ask what they can make happen. Apply this on top of the reviewer's neutral output contract (severity-tagged findings, evidence quotes, suggestions, verdict).
+## How to acquire the target
+- **A PR or diff** — \`gh pr diff <n>\` for the change; then \`read\` the surrounding code, because a vulnerability often lives in the interaction between the changed line and an untouched caller.
+- **A file or module** — \`read\` it, then \`grep\` for the entry points: where does external input enter, and where does it reach a sink (a shell, a query, a file path, a deserializer, an outbound request)?
+- **Config / infra** — \`read\` the manifest, Dockerfile, CI workflow, or IaC. Misconfiguration is a vulnerability class of its own (default credentials, over-broad permissions, secrets in plaintext).
+- **Verify with primary sources.** When you cite a class (OWASP A03, CWE-89, an RFC), confirm the current definition with \`web_search\`/\`web_fetch\` before asserting it. Cite by identifier.
+## Trace input to sink
+A security finding is a *path*: untrusted input → (insufficient validation) → dangerous sink. Name both ends and the missing control between them. A finding that only says "this looks unsafe" without tracing the path is not actionable. For each entry point, follow the data: where does it go, what touches it on the way, and what does it reach?
+## What to look for
+Prioritize by exploitability, roughly in this order:
+1. **Injection (CWE-78/89/79/90).** Untrusted input concatenated into a shell command, SQL/NoSQL query, LDAP filter, or HTML sink without parameterization or escaping. OS-command injection via string-interpolated \`bash\` is the highest-value catch.
+2. **Broken access control (OWASP A01).** Missing authorization checks, IDOR (a user can read/write another user's object by changing an ID), endpoints that trust a client-supplied role, path-based bypass.
+3. **SSRF (OWASP A10 / CWE-918).** The server fetches a user-supplied URL with no allowlist, letting an attacker reach internal services or cloud metadata endpoints (\`169.254.169.254\`). Flag any outbound request whose destination is attacker-influenced.
+4. **Insecure deserialization / data-integrity (OWASP A08).** Untrusted bytes fed to a deserializer that can instantiate arbitrary types; unsigned updates; a pipeline that trusts input it did not verify.
+5. **Cryptographic failures (OWASP A02).** Secrets at rest in plaintext, weak or broken hashes (MD5/SHA1 for passwords), missing TLS on sensitive transit, hardcoded keys, predictable tokens.
+6. **Secrets exposure.** API keys, tokens, or passwords in logs, error messages, committed config, or echoed in responses. A stack trace returned to the client is an information-disclosure finding.
+7. **Path traversal (CWE-22).** User input builds a filesystem path without canonicalization, allowing \`../\` escape out of the intended directory.
+8. **TOCTOU (CWE-367).** A check (file exists, permission ok) separated from the use by a window an attacker can exploit to swap the target.
+9. **Authentication weaknesses (OWASP A07).** No brute-force protection, session fixation, missing re-auth on sensitive actions, tokens that never expire.
+## Severity via exploitability (CVSS-style reasoning)
+Anchor severity to *what an attacker gains and how easily*, not to how the code reads:
+- **blocker** — Exploitable now with serious impact: remote code execution, auth bypass, injection reachable from an unauthenticated path, secret disclosure. CVSS roughly High/Critical (7.0+). Do not ship.
+- **concern** — A real weakness that requires a precondition (authenticated attacker, user interaction, an unlikely-but-possible input) or whose impact is bounded. CVSS roughly Medium (4.0–6.9).
+- **nit** — Defense-in-depth hardening with no demonstrated exploit path: a missing security header, a slightly-too-broad scope that is not currently reachable. Optional.
+- **praise** — A non-obvious control done right: input correctly parameterized at a tricky sink, an allowlist that closes an SSRF that an obvious implementation would have left open. Rare.
+For blocker and concern findings, state the attack in one sentence: who, with what access, can make what happen. That sentence is what separates a security finding from a style opinion.
+## What NOT to find
+- **Style and formatting.** Linter territory. A security audit is not the place for naming or spacing.
+- **Performance without a security angle.** A slow loop is not a security finding unless it is a denial-of-service vector you can demonstrate.
+- **Theoretical issues with no reachable path.** "This *could* be unsafe if someone later calls it with attacker input" — only raise it if such a caller exists or is plausible. Name the path or drop the finding; un-anchored "could be exploited" is the security flavor of generic review noise.
+- **Re-flagging controls that are present.** If validation, escaping, or an allowlist already guards the sink, that is not a finding — and if it is done well, it may be a \`praise\`.
+## Verdict mapping
+- **approve** — No exploitable finding. Any issues are defense-in-depth nits.
+- **request-changes** — At least one blocker, or a concern serious enough to answer before this lands.
+- **comment** — Observations without a clear gate: a partial audit of a large surface, or hardening advice on code that has no demonstrated vulnerability.
+## Final output
+Return findings inside the reviewer's neutral \`<review>\` block. Do NOT invent your own output format.
+`
+export const SECURITY_AUDIT_SKILL: LoadableSkill = { // Bundles the three security-audit constants into one LoadableSkill value.
+  name: SECURITY_AUDIT_SKILL_NAME,
+  description: SECURITY_AUDIT_SKILL_DESCRIPTION,
+  content: SECURITY_AUDIT_SKILL_CONTENT, // Markdown guidance text held in the template literal.
+}

package/src/bundled-plugins/reviewer/skills/writing-review.ts ADDED

@@ -0,0 +1,63 @@
+import type { LoadableSkill } from '@/plugin'
+// Identifier of the writing-review skill; used as the `name` of WRITING_REVIEW_SKILL.
+export const WRITING_REVIEW_SKILL_NAME = 'writing-review'
+// One-sentence summary of what the skill reviews; used as the `description`
+// of WRITING_REVIEW_SKILL.
+export const WRITING_REVIEW_SKILL_DESCRIPTION =
+  'Review prose for an audience: a blog post, an announcement, marketing copy, an email, an essay. Covers clarity, audience fit, lede placement, claim-evidence support, tone consistency, and jargon — the editorial craft beyond grammar.'
+// Full prompt body of the writing-review skill, written as Markdown inside a
+// template literal. Backticks that belong to the Markdown are escaped (\`) so
+// they do not terminate the literal. Used as the `content` of WRITING_REVIEW_SKILL.
+export const WRITING_REVIEW_SKILL_CONTENT = `# writing-review
+You have been asked to review a piece of writing meant for a reader — a blog post, a launch announcement, marketing copy, an email, an essay. You are an editor, not a proofreader: grammar and spelling are the floor, not the job. Apply this on top of the reviewer's neutral output contract (severity-tagged findings, evidence quotes, suggestions, verdict).
+## How to acquire the target
+- **A file or inline text** — \`read\` it (or read the payload) in full before forming any finding. Prose is judged as a whole; a paragraph that works alone can still break the piece's flow.
+- **A URL** — \`web_fetch\` it. If a published page, also note whether the lede survives the reader's first screen.
+- **Verify factual claims.** If the piece asserts a number, a comparison, a "first/fastest/only", or a cited source, check it with \`web_search\`/\`web_fetch\` before letting it stand. An unsupported superlative is the most common defect in persuasive writing.
+## Read for the reader, not for yourself
+Before looking for defects, answer two questions and hold them while you read: **Who is this for?** and **What should they do or believe after reading?** Most writing failures are a mismatch between the prose and the answer to one of those. A finding is strong when it names which reader the passage fails and why.
+## What to look for
+1. **Buried lede.** The most important thing — the news, the point, the ask — should arrive early (inverted pyramid). If the reader must wade through throat-clearing to find why the piece exists, the lede is buried. Name where the real lede currently sits.
+2. **Audience mismatch.** Jargon and unexplained acronyms for a general audience; over-explanation for an expert one. The register should match who is reading.
+3. **Unsupported claims.** Every load-bearing assertion needs backing. "The fastest runtime", "customers love it", "the industry standard" — without a benchmark, a quote, or a source, these are assertions the reader has no reason to believe. Flag the claim and say what evidence it needs.
+4. **Tone inconsistency.** A piece that starts formal and drifts casual, or whose brand voice wobbles, loses the reader's trust. Point at the shift.
+5. **Clarity / muddy thinking.** Sentences the reader must re-read: ambiguous pronouns, a clause whose subject is lost, a paragraph that says three things and lands none. Unclear prose is usually unclear thinking — point at the sentence.
+6. **Undefined jargon.** A term or acronym used before it is defined, with no gloss and no link. First use should orient the reader.
+7. **Terminology drift.** The same thing named three ways ("dashboard" / "console" / "control panel") confuses; pick one and flag the rest.
+8. **Structure and flow.** Ideas that do not build, missing transitions, a piece that ends without telling the reader what to do next when it clearly wants them to act.
+## What NOT to find
+- **Taste dressed as error.** A sentence you would have phrased differently but that reads clearly and serves the audience is not a finding. "I prefer shorter paragraphs" is not a defect.
+- **Valid style/dialect choices.** British vs American spelling, the Oxford comma, em-dash vs parentheses — when the piece is internally consistent and the house style permits it, leave it.
+- **Grammar a proofreader owns.** A genuine grammar error is fair, but do not pad the review with comma surgery; your value is editorial, not mechanical.
+- **Restating the piece.** "This post announces the new feature" is not a review.
+- **Generic "make it clearer".** Without pointing at the specific passage that is unclear, "could be clearer" is noise.
+## Severity hints specific to writing
+- **blocker** — A factual claim that is verifiably wrong, an audience mismatch so severe the intended reader cannot use the piece, a lede so buried the piece fails its purpose. The kind of problem that means this should not publish as-is.
+- **concern** — An unsupported load-bearing claim, a tone break that undercuts trust, a structural gap that loses the reader partway. Should fix before publishing.
+- **nit** — A single muddy sentence, a minor terminology wobble, a missing transition. Optional; the author can decline.
+- **praise** — A passage that makes a complex thing plain, a lede that lands, a claim backed cleanly with evidence. Rare; call out writing that earns the reader's trust.
+## Verdict mapping
+- **approve** — Ready for its reader. Any issues are nits the author can take or leave.
+- **request-changes** — At least one blocker: a wrong claim, a buried lede that defeats the purpose, an audience mismatch.
+- **comment** — Useful observations without a clean accept/reject. Common for an early draft where the author wants direction more than a gate.
+## Final output
+Return findings inside the reviewer's neutral \`<review>\` block. Do NOT invent your own output format.
+`
+/**
+ * Loadable skill record for the writing-review skill: bundles the name,
+ * description, and prompt content constants declared above into one
+ * `LoadableSkill`.
+ */
+export const WRITING_REVIEW_SKILL: LoadableSkill = {
+  name: WRITING_REVIEW_SKILL_NAME,
+  description: WRITING_REVIEW_SKILL_DESCRIPTION,
+  content: WRITING_REVIEW_SKILL_CONTENT,
+}

package/src/bundled-plugins/scout/scout.ts CHANGED Viewed

@@ -85,6 +85,8 @@ export function createScoutSubagent(): Subagent<ScoutPayload> {
     tools: [webSearchTool, webFetchTool],
     payloadSchema: scoutPayloadSchema,
     visibility: 'public',
+    rosterDescription:
+      'fast single-pass web lookup in a fresh context — searches and fetches, returns citation-backed findings without the raw pages touching your context',
     inFlightKey: (payload) => payload?.requestId ?? `anon-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
     toolResultBudget: {
       maxTotalBytes: 512_000,

package/src/bundled-plugins/security/policies/prompt-injection.ts CHANGED Viewed

@@ -429,10 +429,14 @@ const GIT_EXFIL_VERBS = [
 const GIT_EXFIL_PATTERNS: ReadonlyArray<RegExp> = [
   new RegExp(`(?:${GIT_EXFIL_VERBS})`, 'i'),
-  // Korean shorthand for "do it" / "go ahead" right after a git command,
-  // which is the breach idiom ("...git push origin main ㄱㄱ").
-  /git\s+push[\s\S]{0,40}(?:\u{3131}\u{3131}|ㄱㄱ|\u{ACE0}\u{ACE0}|\u{C5B4}\u{C11C}|\u{ACA9}\u{ACA9})/iu,
-  // "back up to github" / "백업 해줘" framings often dressed as a benign
+  // Urgency shorthand ("do it" / "go ahead" / "now") right after a git command,
+  // which is the breach idiom ("...git push origin main <go-go>"). Multilingual:
+  // the urgency token is what flips a benign-looking push into a "just run it"
+  // exfil nudge, so it must catch the idiom across the major channel languages,
+  // not Korean alone. Tokens are tight imperative go-aheads to avoid matching
+  // ordinary trailing words (e.g. "git push then go to lunch" must NOT match).
+  /git\s+push[\s\S]{0,40}(?:\u{3131}\u{3131}|ㄱㄱ|\u{ACE0}\u{ACE0}|\u{C5B4}\u{C11C}|\u{ACA9}\u{ACA9}|go\s?go|asap|do\s+it\s+now|just\s+do\s+it|right\s+now|行け行け|早く|赶紧|快点|赶快|hazlo\s+ya|dale\s+ya)/iu,
+  // "back up to github" / Korean "백업 해줘" (back it up) framings often dressed as a benign
   // request - if the same message also names a credential or `.env`, the
   // SECRET_DEMAND_PATTERNS already fires; this catches the standalone
   // "push to my backup repo" framing that doesn't mention secrets.

package/src/bundled-plugins/security/policies/secret-exfil-bash.ts CHANGED Viewed

@@ -50,8 +50,9 @@ const DANGEROUS_COMMAND_PATTERNS: ReadonlyArray<{ pattern: RegExp; label: string
   { pattern: /set\s+-o\s+posix[\s\S]{0,40}(?:^|[\s;|&(`])set(?:[\s;|&)`]|$)/m, label: 'set -o posix; set (env dump)' },
   {
     // jq/yq read+emit arbitrary files just like cat (e.g. `jq . .env`,
-    // `yq '.x' .env`) and both ship in the container baseline, so they are
-    // first-class .env exfil vectors and must be gated here, not just the
+    // `yq '.x' .env`). `jq` ships in the container baseline; `yq` no longer
+    // does, but a user can re-add it via `docker.file.append`, so both stay
+    // gated here as first-class .env exfil vectors — not just the
     // pager/dumper family.
     pattern: /(cat|less|more|head|tail|bat|xxd|od|hexdump|strings|jq|yq)\s+[^\n;|&`]*\.env(\s|$|[;|&`])/,
     label: 'reading .env file',

package/src/channels/adapters/discord-bot.ts CHANGED Viewed

@@ -493,16 +493,27 @@ function discordFailureForStatus(status: number): MembershipResolverFailure {
   return { kind: 'transient' }
 }
+// Discord message type for THREAD_STARTER_MESSAGE — the first message in a
+// thread created from an existing message. `referenced_message` is also
+// populated for type 19 (REPLY) and 23 (CONTEXT_MENU_COMMAND), so the opener
+// fallback in mapDiscordMessage (which compares `msg.type` against this
+// constant) must gate on this type alone, not on the field's presence.
+const DISCORD_MESSAGE_TYPE_THREAD_STARTER = 21
 type DiscordRawHistoryMessage = {
   id: string
   channel_id: string
+  type?: number
   author: { id: string; username?: string; global_name?: string | null; bot?: boolean }
   content: string
   timestamp: string
-  message_reference?: { message_id?: string }
+  message_reference?: { message_id?: string; channel_id?: string }
   attachments?: DiscordFile[]
   embeds?: DiscordGatewayEmbed[]
   sticker_items?: DiscordGatewayStickerItem[]
+  // A thread started from an existing message has a type-21 starter whose
+  // top-level content/author are empty; the real opener lives only here.
+  // `null` = referenced message deleted; absent = API did not resolve it.
+  referenced_message?: DiscordRawHistoryMessage | null
 }
 // Discord treats threads as separate channels with their own snowflake ids,
@@ -565,7 +576,18 @@ export function createDiscordHistoryCallback(deps: {
 }
 function mapDiscordMessage(msg: DiscordRawHistoryMessage, botUserId: string | null): ChannelHistoryMessage {
-  const isBot = msg.author.bot === true || (botUserId !== null && msg.author.id === botUserId)
+  // A thread started from an existing message exposes that opener only as the
+  // type-21 starter's `referenced_message` — the starter itself has empty
+  // content and a bot/system author. Without this, the agent never sees the
+  // message the thread was created from. Take the opener's author and body
+  // (the live inbound path does the equivalent via enrichDiscordMessageReferences),
+  // while keeping the starter's own id/timestamp so dedup against the triggering
+  // message and chronological ordering stay correct.
+  const opener = msg.referenced_message ?? undefined
+  const isThreadStarter = msg.type === DISCORD_MESSAGE_TYPE_THREAD_STARTER
+  const source = isThreadStarter && opener !== undefined && bodyOf(msg) === '' ? opener : msg
+  const isBot = source.author.bot === true || (botUserId !== null && source.author.id === botUserId)
   const ts = Date.parse(msg.timestamp)
   // The REST history fetch bypasses the inbound classifier, so attachments,
   // embeds, and stickers on already-posted messages (e.g. an image on a thread
@@ -573,17 +595,12 @@ function mapDiscordMessage(msg: DiscordRawHistoryMessage, botUserId: string | nu
   // otherwise they are silently dropped and look_at_channel_attachment can
   // never resolve them. Mirror the classifier's splitInbound: bake placeholders
   // into text and carry the structured attachments so the router can resolve ids.
-  const attachments = describeDiscordMedia(msg)
-  const text =
-    attachments.length === 0
-      ? msg.content
-      : msg.content === ''
-        ? attachments.map(renderPlaceholder).join('\n')
-        : `${msg.content}\n${attachments.map(renderPlaceholder).join('\n')}`
+  const attachments = describeDiscordMedia(source)
+  const text = bodyOf(source)
   return {
     externalMessageId: msg.id,
-    authorId: msg.author.id,
-    authorName: msg.author.global_name ?? msg.author.username ?? msg.author.id,
+    authorId: source.author.id,
+    authorName: source.author.global_name ?? source.author.username ?? source.author.id,
     text,
     ts: Number.isFinite(ts) ? ts : 0,
     isBot,
@@ -592,6 +609,13 @@ function mapDiscordMessage(msg: DiscordRawHistoryMessage, botUserId: string | nu
   }
 }
+/**
+ * Renders the text body of a raw history message: its `content`, followed by
+ * one placeholder line per media item reported by `describeDiscordMedia`.
+ *
+ * @param msg - Raw Discord history message to render.
+ * @returns `msg.content` unchanged when there is no media; the placeholders
+ *   alone when `content` is empty; otherwise the content, a newline, then the
+ *   newline-joined placeholders.
+ */
+function bodyOf(msg: DiscordRawHistoryMessage): string {
+  const attachments = describeDiscordMedia(msg)
+  // No media: the body is the content as-is (possibly the empty string).
+  if (attachments.length === 0) return msg.content
+  const placeholders = attachments.map(renderPlaceholder).join('\n')
+  // Skip the separating newline when there is no content to separate from.
+  return msg.content === '' ? placeholders : `${msg.content}\n${placeholders}`
+}
 function clampLimit(requested: number, max: number): number {
   if (!Number.isFinite(requested) || requested <= 0) return max
   return Math.min(Math.floor(requested), max)
@@ -1006,6 +1030,7 @@ export function createDiscordBotAdapter(options: DiscordBotAdapterOptions): Disc
       options.router.registerReaction('discord-bot', reactionCallback)
       options.router.registerRemoveReaction('discord-bot', removeReactionCallback)
       options.router.registerTyping('discord-bot', typingCallback)
+      options.router.setTypingCapability('discord-bot', true)
       options.router.registerChannelNameResolver('discord-bot', channelResolver)
       options.router.registerSelfIdentity('discord-bot', selfIdentityResolver)
       options.router.registerHistory('discord-bot', historyCallback)
@@ -1023,6 +1048,7 @@ export function createDiscordBotAdapter(options: DiscordBotAdapterOptions): Disc
         options.router.unregisterReaction('discord-bot', reactionCallback)
         options.router.unregisterRemoveReaction('discord-bot', removeReactionCallback)
         options.router.unregisterTyping('discord-bot', typingCallback)
+        options.router.setTypingCapability('discord-bot', false)
         options.router.unregisterChannelNameResolver('discord-bot', channelResolver)
         options.router.unregisterSelfIdentity('discord-bot', selfIdentityResolver)
         options.router.unregisterHistory('discord-bot', historyCallback)
@@ -1043,6 +1069,7 @@ export function createDiscordBotAdapter(options: DiscordBotAdapterOptions): Disc
       options.router.unregisterReaction('discord-bot', reactionCallback)
       options.router.unregisterRemoveReaction('discord-bot', removeReactionCallback)
       options.router.unregisterTyping('discord-bot', typingCallback)
+      options.router.setTypingCapability('discord-bot', false)
       options.router.unregisterChannelNameResolver('discord-bot', channelResolver)
       options.router.unregisterSelfIdentity('discord-bot', selfIdentityResolver)
       options.router.unregisterHistory('discord-bot', historyCallback)

package/src/channels/adapters/github/inbound.ts CHANGED Viewed

@@ -4,6 +4,7 @@ import type { GithubReviewOn } from '@/channels/schema'
 import type { InboundMessage } from '@/channels/types'
 import type { GithubAuthContext } from './auth'
+import { GITHUB_API_BASE, githubJsonHeaders } from './auth-pat'
 import { removeRequestedReviewer } from './decoy-reviewer'
 import type { DeliveryDedup } from './dedup'
 import { isGithubEventAllowed } from './event-allowlist'
@@ -94,10 +95,12 @@ export function createGithubWebhookHandler(options: GithubWebhookHandlerOptions)
     }
     const teamIsBotMember = await resolveTeamMembership(event, payload, options)
+    const reviewCommentParent = await resolveReviewCommentParent(event, payload, selfId, selfLogin, options)
     const classified = classifyGithubInbound(event, payload, selfLogin, {
       teamIsBotMember,
       authType: options.authType?.() ?? 'pat',
       reviewOn: options.reviewOn?.() ?? 'review_requested',
+      ...(reviewCommentParent !== null ? { reviewCommentParent } : {}),
     })
     if (classified === null) return ok()
@@ -286,7 +289,12 @@ export function classifyGithubInbound(
   event: string,
   payload: Record<string, unknown>,
   selfLogin: string | null,
-  options?: { teamIsBotMember?: boolean; authType?: 'pat' | 'app'; reviewOn?: GithubReviewOn },
+  options?: {
+    teamIsBotMember?: boolean
+    authType?: 'pat' | 'app'
+    reviewOn?: GithubReviewOn
+    reviewCommentParent?: ReviewCommentParent
+  },
 ): InboundMessage | null {
   const repository = readRepository(payload)
   if (repository === null) return null
@@ -326,7 +334,10 @@ export function classifyGithubInbound(
     const number = readNumber(pr, 'number')
     const id = readNumber(comment, 'id')
     if (number === null || id === null) return null
-    const root = readNumber(comment, 'in_reply_to_id') ?? id
+    const parentId = readNumber(comment, 'in_reply_to_id')
+    const root = parentId ?? id
+    const parent =
+      parentId !== null && options?.reviewCommentParent?.parentId === parentId ? options.reviewCommentParent : null
     return buildInbound(
       { ...base, chat: `pr:${number}`, thread: String(root) },
       comment.body,
@@ -335,6 +346,12 @@ export function classifyGithubInbound(
       mention,
       comment.created_at,
       { kind: 'pr-review-comment', owner: repository.owner, repo: repository.name, commentId: id },
+      false,
+      {
+        suppressSticky: true,
+        replyToBotMessageId: parent?.isSelf === true ? String(parent.parentId) : null,
+        replyToOtherMessageId: parent?.isSelf === false ? String(parent.parentId) : null,
+      },
     )
   }
@@ -411,6 +428,13 @@ export function classifyGithubInbound(
     // the PR is non-draft once ready — preserving "review when no longer draft".
     const isOpenLike = action === 'opened' || action === 'ready_for_review'
     if (isOpenLike && reviewOn === 'opened') {
+      // Draft opened under `review.on: "opened"`: skip cleanly (null wakes no
+      // session) and wait for the `ready_for_review` trigger. Must NOT fall
+      // through to the awareness path below, where a multi-collaborator repo
+      // silently `observed`s it — a draft whose `ready_for_review` delivery is
+      // later lost would then never get reviewed. `review_requested`
+      // on a draft is unaffected: it returns above via classifyReviewRequest.
+      if (readBoolean(pr, 'draft') === true) return null
       const trigger = classifyOpenedReviewTrigger({
         payload,
         pr,
@@ -460,6 +484,7 @@ export function classifyGithubInbound(
       review.submitted_at,
       null,
       !hasBody,
+      { suppressSticky: true },
     )
   }
@@ -502,6 +527,14 @@ type ReviewRequestInput = {
   teamIsBotMember: boolean | undefined
 }
+// Result of resolving the comment a review comment replies to: the parent
+// comment's id, and whether `isSelfAuthor` matched the parent's author
+// against the handler's own id/login.
+type ReviewCommentParent = { isSelf: boolean; parentId: number }
+// Optional extras accepted by buildInbound.
+type BuildInboundOptions = {
+  // When `true`, buildInbound sets `suppressSticky: true` on the inbound
+  // message; otherwise the field is left off entirely.
+  suppressSticky?: boolean
+  // Reply target authored by the bot; buildInbound defaults it to `null`.
+  replyToBotMessageId?: string | null
+  // Reply target authored by someone else; when absent or `null`,
+  // buildInbound falls back to the key's own `replyToOtherMessageId`.
+  replyToOtherMessageId?: string | null
+}
 // A GitHub App can never be a `requested_reviewer` — that field only holds
 // real user accounts, and the App actor (`slug[bot]`) is not one. The
 // supported workaround is a decoy user account named after the App that an
@@ -644,12 +677,6 @@ function classifyOpenedReviewTrigger(input: OpenedReviewTriggerInput): InboundMe
   const decoyLogin = resolveDecoyReviewerLogin(selfLogin, authType)
   if (sender.login === selfLogin || (decoyLogin !== null && sender.login === decoyLogin)) return null
-  // A draft PR is work-in-progress, so the automatic `opened` path skips it: null
-  // here drops to awareness-only context (like a non-`opened` reviewOn) instead of
-  // waking a review. An explicit `review_requested` still triggers on a draft via
-  // classifyReviewRequest, preserving "skip until explicitly requested".
-  if (readBoolean(pr, 'draft') === true) return null
   const title = readString(pr, 'title') ?? `#${number}`
   const head = readString(readRecord(pr.head), 'ref')
   const baseRef = readString(readRecord(pr.base), 'ref')
@@ -700,6 +727,7 @@ function buildInbound(
   rawTs: unknown,
   reactionTarget: GithubReactionTarget | null,
   synthesizedAwareness = false,
+  options?: BuildInboundOptions,
 ): InboundMessage | null {
   if (user === null) return null
   const text = typeof rawText === 'string' ? rawText : ''
@@ -714,6 +742,8 @@ function buildInbound(
   // that handle is the author, never a third-party mention of the bot, so the
   // body-text mention heuristic must not fire on it.
   const isBotMention = !synthesizedAwareness && textMentionsBot(text, mention)
+  const replyToBotMessageId = options?.replyToBotMessageId ?? null
+  const replyToOtherMessageId = options?.replyToOtherMessageId ?? key.replyToOtherMessageId
   return {
     ...key,
     text,
@@ -723,7 +753,9 @@ function buildInbound(
     authorName: user.login,
     authorIsBot: user.type === 'Bot',
     isBotMention,
-    replyToBotMessageId: null,
+    ...(options?.suppressSticky === true ? { suppressSticky: true } : {}),
+    replyToBotMessageId,
+    replyToOtherMessageId,
     ts: typeof rawTs === 'string' ? Date.parse(rawTs) || 0 : 0,
   }
 }
@@ -790,6 +822,39 @@ async function resolveTeamMembership(
   }
 }
+/**
+ * Looks up the parent of a review-comment reply so the classifier can tell
+ * whether the reply targets a comment authored by the bot itself.
+ *
+ * Best-effort: every failure path (a different event, no `in_reply_to_id`,
+ * no repository, no `authToken` option, a non-OK response, an unparseable
+ * body, a missing user, or a thrown error) resolves to `null` instead of
+ * rejecting.
+ *
+ * @param event - Webhook event name; only `pull_request_review_comment` is handled.
+ * @param payload - Webhook payload; `comment.in_reply_to_id` and the repository are read from it.
+ * @param selfId - The handler's own account id, forwarded to `isSelfAuthor`.
+ * @param selfLogin - The handler's own login, forwarded to `isSelfAuthor`.
+ * @param options - Handler options; supplies `authToken` and an optional `fetchImpl`.
+ * @returns The parent comment id plus whether its author is the bot, or
+ *   `null` when that cannot be determined.
+ */
+async function resolveReviewCommentParent(
+  event: string,
+  payload: Record<string, unknown>,
+  selfId: string | null,
+  selfLogin: string | null,
+  options: GithubWebhookHandlerOptions,
+): Promise<ReviewCommentParent | null> {
+  if (event !== 'pull_request_review_comment') return null
+  const comment = readRecord(payload.comment)
+  const parentId = readNumber(comment, 'in_reply_to_id')
+  // No `in_reply_to_id`: the comment is not a reply, so there is no parent to fetch.
+  if (parentId === null) return null
+  const repository = readRepository(payload)
+  if (repository === null) return null
+  const authToken = options.authToken
+  // Without a token provider the lookup is skipped rather than attempted unauthenticated.
+  if (authToken === undefined) return null
+  try {
+    const token = await authToken({ repoSlug: `${repository.owner}/${repository.name}` })
+    // Use the injected fetch when provided; otherwise the global `fetch`.
+    const fetchImpl = options.fetchImpl ?? fetch
+    const response = await fetchImpl(
+      `${GITHUB_API_BASE}/repos/${repository.owner}/${repository.name}/pulls/comments/${parentId}`,
+      { headers: githubJsonHeaders(token) },
+    )
+    if (!response.ok) return null
+    // A body that fails to parse as JSON becomes `null`; presumably
+    // readRecord/readUser then yield `null` and the guard below returns.
+    const raw = (await response.json().catch(() => null)) as unknown
+    const user = readUser(readRecord(raw)?.user)
+    if (user === null) return null
+    return { parentId, isSelf: isSelfAuthor(user, selfId, selfLogin) }
+  } catch {
+    // Token retrieval or network failure: treat the parent as unknown.
+    return null
+  }
+}
 function readRepository(payload: Record<string, unknown>): { owner: string; name: string } | null {
   const repository = readRecord(payload.repository)
   const owner = readRecord(repository?.owner)