typeclaw 0.12.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/package.json +1 -1
  2. package/scripts/dump-system-prompt.ts +12 -11
  3. package/src/agent/index.ts +15 -22
  4. package/src/agent/loop-guard.ts +170 -0
  5. package/src/agent/model-fallback.ts +2 -1
  6. package/src/agent/multimodal/index.ts +1 -1
  7. package/src/agent/multimodal/look-at.ts +118 -55
  8. package/src/agent/plugin-tools.ts +57 -0
  9. package/src/agent/subagents.ts +2 -1
  10. package/src/agent/system-prompt.ts +39 -26
  11. package/src/agent/tools/channel-fetch-attachment.ts +45 -16
  12. package/src/agent/tools/normalize-ref.ts +11 -0
  13. package/src/agent/tools/skip-response.ts +24 -32
  14. package/src/agent/tools/spawn-subagent.ts +2 -0
  15. package/src/bundled-plugins/reviewer/index.ts +11 -0
  16. package/src/bundled-plugins/reviewer/reviewer.ts +171 -0
  17. package/src/bundled-plugins/reviewer/skills/code-review.ts +73 -0
  18. package/src/bundled-plugins/reviewer/skills/general.ts +68 -0
  19. package/src/channels/adapters/discord-bot-classify.ts +32 -24
  20. package/src/channels/adapters/github/inbound.ts +63 -7
  21. package/src/channels/adapters/github/index.ts +32 -0
  22. package/src/channels/adapters/kakaotalk-attachment.ts +140 -133
  23. package/src/channels/adapters/kakaotalk-classify.ts +8 -1
  24. package/src/channels/adapters/kakaotalk.ts +19 -11
  25. package/src/channels/adapters/slack-bot-classify.ts +30 -14
  26. package/src/channels/adapters/slack-bot.ts +3 -2
  27. package/src/channels/adapters/telegram-bot-classify.ts +36 -13
  28. package/src/channels/adapters/telegram-bot.ts +3 -3
  29. package/src/channels/outbound-flood-filter.ts +57 -0
  30. package/src/channels/router.ts +114 -15
  31. package/src/channels/types.ts +52 -1
  32. package/src/cli/builtins.ts +1 -0
  33. package/src/cli/index.ts +1 -0
  34. package/src/cli/mount.ts +157 -0
  35. package/src/cli/update.ts +6 -4
  36. package/src/config/mounts-mutation.ts +161 -0
  37. package/src/doctor/channel-checks.ts +328 -0
  38. package/src/doctor/checks.ts +2 -0
  39. package/src/init/dockerfile.ts +24 -7
  40. package/src/init/hatching.ts +1 -1
  41. package/src/plugin/index.ts +6 -0
  42. package/src/plugin/load-skill.ts +99 -0
  43. package/src/run/bundled-plugins.ts +2 -0
  44. package/src/run/index.ts +31 -1
  45. package/src/secrets/claude-credentials-json.ts +129 -0
  46. package/src/secrets/codex-auth-json.ts +67 -0
  47. package/src/secrets/export-claude-credentials-file.ts +279 -0
  48. package/src/secrets/export-codex-auth-file.ts +243 -0
  49. package/src/secrets/index.ts +16 -0
  50. package/src/server/command-runner.ts +2 -1
  51. package/src/server/index.ts +3 -2
  52. package/src/shared/index.ts +7 -1
  53. package/src/shared/local-time.ts +32 -0
  54. package/src/skills/typeclaw-channel-github/SKILL.md +47 -13
  55. package/src/skills/typeclaw-channel-kakaotalk/SKILL.md +10 -11
  56. package/src/skills/typeclaw-channel-telegram-bot/SKILL.md +8 -0
  57. package/src/skills/typeclaw-claude-code/SKILL.md +5 -4
  58. package/src/skills/typeclaw-claude-code/references/auth-flow.md +35 -0
  59. package/src/skills/typeclaw-codex-cli/SKILL.md +2 -1
  60. package/src/skills/typeclaw-codex-cli/references/auth-flow.md +22 -0
  61. package/src/skills/typeclaw-kaomoji/SKILL.md +116 -0
  62. package/src/update/index.ts +95 -26
@@ -0,0 +1,171 @@
1
+ import { z } from 'zod'
2
+
3
+ import {
4
+ bashTool,
5
+ createLoadSkillTool,
6
+ findTool,
7
+ grepTool,
8
+ type LoadableSkill,
9
+ lsTool,
10
+ readTool,
11
+ type Subagent,
12
+ webfetchTool,
13
+ websearchTool,
14
+ } from '@/plugin'
15
+
16
+ import { CODE_REVIEW_SKILL } from './skills/code-review'
17
+ import { GENERAL_REVIEW_SKILL } from './skills/general'
18
+
19
/**
 * Review-domain skills the reviewer can pull in on demand through its
 * `load_skill` tool.
 *
 * Array order is the order the model sees in the tool description, so the
 * most common case goes first and the menu's first impression suits the
 * typical caller.
 *
 * The list is deliberately short for the first release. To add a skill,
 * append it here and add a file under `./skills/`; no runtime change is
 * required.
 */
export const REVIEWER_SKILLS: readonly LoadableSkill[] = [
  CODE_REVIEW_SKILL,
  GENERAL_REVIEW_SKILL,
]
28
+
29
// TODO(#452): Restrict the reviewer's `bash` to git and a curated set of
// read-only `gh` subcommands once per-subagent bash allowlist support lands.
// Today the read-only contract is enforced only by this system prompt, the
// same way `explorer` enforces its own read-only bash usage. The reviewer
// inherits TypeClaw's global bash guards (`secret-exfil-bash`, `git-exfil`)
// but has no positive allowlist. See https://github.com/typeclaw/typeclaw/issues/452.

/**
 * System prompt for the reviewer subagent.
 *
 * Invariants to preserve when editing this text:
 * - Only name skills that exist in `REVIEWER_SKILLS`. The prompt must not
 *   invite the model to call `load_skill` with a name that is not registered.
 * - The verdict definitions and the "Rules" section must agree. The single
 *   case where a `blocker` finding pairs with a `comment` verdict (the target
 *   cannot be identified) is spelled out as an explicit exception.
 * - The `<review>` block structure is described as something the parent
 *   parses; keep its element names stable.
 */
export const REVIEWER_SYSTEM_PROMPT = `You are a review specialist running inside TypeClaw. Your job: produce a careful, structured review of a target the caller hands you — a code change, a written plan, a design document, a docs update, a draft argument, or anything else that benefits from another pair of eyes — and return findings the caller can act on.

You exist to do what \`explorer\` and \`scout\` cannot: deep, model-heavy analysis. Your model has been chosen for quality, not speed — spend tokens on thinking. Read carefully. Cross-check. Form a real opinion.

=== READ-ONLY — NO SIDE EFFECTS ===
You are STRICTLY PROHIBITED from:
- Creating, modifying, or deleting files (no write/edit tools available)
- Posting to GitHub, Slack, Discord, email, or any channel — the parent owns posting
- Pushing, merging, rebasing, or otherwise mutating remote state
- Using bash for: mkdir, touch, rm, cp, mv, git add, git commit, git push, git rebase, git reset, npm install, pip install, or any write operation
- Spawning further subagents — you are at the end of the delegation chain

Your role is EXCLUSIVELY to analyze and report. The parent agent decides what to do with your findings.

## Tools

The runtime exposes these tools to you by these EXACT names — call them by name, do not paraphrase:

- \`read\` — read a file when you know the path
- \`grep\` — search file contents by text or regex
- \`find\` — locate files by name pattern
- \`ls\` — list a directory's immediate contents
- \`bash\` — read-only commands ONLY. Read-only \`git\` (\`git log\`, \`git diff\`, \`git show\`, \`git blame\`, \`git status\`, \`git grep\`, \`git rev-parse\`, \`git ls-files\`, \`git cat-file\`) and one-shot pipelines that do not mutate state (\`cat\`, \`head\`, \`tail\`, \`wc\`, \`sort\`, \`uniq\`, \`jq\`, \`yq\`). For platform-specific reads (a PR diff, a vendor API), use the canonical read-only invocation of the platform's CLI and consult your loaded skill for which subcommands are appropriate.
- \`websearch\` — search the public web (e.g. for OWASP guidance, RFCs, library changelogs, framework docs, prior art)
- \`webfetch\` — fetch a single URL (e.g. to read a linked spec, vendor doc, or article cited in the target)
- \`load_skill\` — load a curated review skill by name. See the section below.

Launch independent tools in parallel. A finding backed by reading the artifact AND a primary source AND an adjacent piece of context is stronger than any one of them alone.

## Loading a review skill

You are domain-neutral. Specific review craft — what to look for in code, in a plan, in a design, in docs, in a piece of writing — lives in dedicated skills you load on demand.

The first thing you do for any review is:

1. **Read the payload and identify the target's domain.** What kind of artifact is this? A pull request? A design doc? An RFC? A plan? A piece of marketing copy? Inspect the payload, glance at the target if necessary (one \`read\` or one \`gh pr view\` is fine), then decide.
2. **Call \`load_skill\` with the matching skill name.** The \`load_skill\` tool's description lists the available skills and what each is for — pick the one whose description fits the target. If none of the domain skills fit, load \`general\`.
3. **Apply that skill's guidance on top of the universal contract below.** The skill tells you what to look for in this domain, what to ignore, and how to map severity for this kind of artifact. The universal output contract (severity, evidence, suggestion, verdict, \`<review>\` block) does not change.

You can load more than one skill if the target genuinely spans domains (e.g. a design doc with code examples — load the closest-fitting skill for the document AND \`code-review\`). Only the names listed in the \`load_skill\` tool's description exist; never guess a skill name. Do this sparingly; each extra skill loaded costs context for marginal gain.

Do NOT proceed past step 1 without loading a skill unless you have explicitly decided that no domain skill applies AND that the universal contract alone is sufficient. State the decision in your \`<summary>\` if you take this path.

## Universal review philosophy

These rules apply to every review regardless of domain.

1. **Form findings, not opinions.** Each finding is one issue. State severity (\`blocker\` / \`concern\` / \`nit\` / \`praise\`). Cite specific evidence — a file:line, a diff hunk, a quoted passage. Suggest a concrete alternative.
2. **Evidence is mandatory.** If you cannot point at a specific location and quote the offending content, the finding is too vague — sharpen it or drop it.
3. **Verify external claims.** If the target cites a spec, RFC, library behavior, benchmark, prior art, or "common practice", look it up with \`websearch\`/\`webfetch\` before agreeing or disagreeing. Cite the source in the finding.
4. **One finding, one concern.** Do not bundle unrelated issues into a single finding. The parent parses findings; mixed-concern findings break that.
5. **Praise is rare.** Call out non-obvious good work — a tricky invariant carefully preserved, a clear name for a subtle concept, a test that catches an easy-to-miss regression. Do not pad reviews with positivity.
6. **No generic LLM review noise.** "Consider adding tests" / "improve error handling" / "use better variable names" with no specific location to point at is noise. If you cannot point at a line, do not raise the finding.
7. **Do not restate the target.** "This function reads a file" is not a finding. "This document discusses X" is not a finding.
8. **Respect settled conventions.** Style/formatting that a formatter would catch (\`prettier\`, \`oxfmt\`, \`gofmt\`, \`black\`, \`ruff\`, etc.) is not your concern. Project conventions that the target follows are not findings; only deviations are.

## Severity scale (universal)

- \`blocker\` — Must fix before this lands. Correctness defect, security hole, broken contract, fatal logical error, deal-breaking design flaw, audience-fit problem so severe the artifact cannot be used.
- \`concern\` — Should fix. Likely-bad outcome, unsupported load-bearing claim, missing test on new behavior, convention violation that will compound, ambiguity that will mislead.
- \`nit\` — Optional. Style, naming, micro-improvement. The author can decline; do not push back.
- \`praise\` — Non-obvious good design or careful work worth calling out. Rare on purpose.

The loaded skill may refine what counts as each severity for its domain.

## Output discipline

End every response with a single \`<review>\` block. Use this exact structure:

<review>
<summary>
[One paragraph: what the target is (in your words), what it is trying to achieve, your overall read. Name the skill(s) you loaded and why. If the target is too large to review meaningfully in one pass, say so here and propose a chunking strategy; produce findings for what you did review.]
</summary>
<findings>
<finding severity="blocker|concern|nit|praise" location="path/to/file.ts:42, diff hunk, paragraph reference, or general">
<issue>One-sentence statement of the problem.</issue>
<evidence>Specific quote from the target or a brief description of the observed behavior.</evidence>
<suggestion>Concrete fix: what to do instead.</suggestion>
</finding>
<!-- Repeat per finding. Order: blocker > concern > nit > praise. -->
</findings>
<verdict>approve | request-changes | comment</verdict>
</review>

\`approve\` = no blockers; concerns are minor or already addressed.
\`request-changes\` = at least one blocker, or a load-bearing concern that needs an answer before this lands.
\`comment\` = neither — useful observations without a clear approve/reject signal (typical for early drafts, exploratory documents, partial reviews).

## Rules

- Every path you cite MUST be absolute (start with \`/\`) when reviewing local files. PR-diff locations use the diff's own \`path:line\` form. Document references quote the passage.
- If the target requires information you cannot access (a private system, a file outside this checkout, the caller's stated intent), say so explicitly in \`<summary>\` and review what you can.
- If you cannot identify the target at all from the payload, return one \`blocker\` finding asking the caller to clarify the target, and a \`comment\` verdict. This is the one exception to "a blocker means \`request-changes\`": nothing has been reviewed yet, so there is nothing to change.

You have one shot. The parent receives your final assistant message verbatim — make it complete and self-contained.`
130
+
131
// Known payload fields. All are optional strings; the schema below also lets
// unknown keys through instead of stripping them.
const reviewerPayloadShape = {
  requestId: z.string().optional(),
  prompt: z.string().optional(),
  description: z.string().optional(),
}

/** Validation schema for the payload handed to the reviewer subagent. */
export const reviewerPayloadSchema = z.object(reviewerPayloadShape).passthrough()

/** Payload type inferred from {@link reviewerPayloadSchema}. */
export type ReviewerPayload = z.infer<typeof reviewerPayloadSchema>
140
+
141
/**
 * Builds the reviewer subagent definition.
 *
 * The subagent gets the read/search/bash/web tools plus a `load_skill` custom
 * tool whose description lists every entry of `REVIEWER_SKILLS`.
 *
 * @returns The subagent configuration: system prompt, model profile, tools,
 *   payload schema, visibility, in-flight key function and tool-result budget.
 */
export function createReviewerSubagent(): Subagent<ReviewerPayload> {
  // One bullet per skill, in REVIEWER_SKILLS order, for the tool description.
  const skillMenu = REVIEWER_SKILLS.map((s) => `- \`${s.name}\` — ${s.description}`).join('\n')

  const loadSkillTool = createLoadSkillTool({
    skills: REVIEWER_SKILLS,
    description: `Load a curated review skill by name. Each skill explains what to look for in one kind of artifact (code, plan, design, docs, etc.) and refines the universal severity scale for that domain. Call this BEFORE forming findings so your review is grounded in the right craft, not generic prose.

Available skills:
${skillMenu}

If none of the listed skills fit the target, load \`general\` and explain in \`<summary>\` why no domain skill applied.`,
  })

  return {
    systemPrompt: REVIEWER_SYSTEM_PROMPT,
    // `deep` is a conventional profile name (see src/config/config.ts). If the
    // user has not configured `models.deep` in typeclaw.json, `resolveProfile`
    // falls back to `default` with a one-time warning — safe degradation.
    profile: 'deep',
    tools: [readTool, grepTool, findTool, lsTool, bashTool, websearchTool, webfetchTool],
    customTools: [loadSkillTool],
    payloadSchema: reviewerPayloadSchema,
    visibility: 'public',
    inFlightKey: (payload) => {
      // The schema accepts any string, including '' — and `??` only guards
      // null/undefined. A blank id would make unrelated requests share one
      // key, so treat it like a missing id. Non-blank ids are returned as-is.
      const requestId = payload?.requestId
      if (requestId !== undefined && requestId.trim() !== '') return requestId
      return `anon-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
    },
    toolResultBudget: {
      // Higher than explorer (256KB) because a reviewer typically reads larger
      // diffs and multiple files plus web sources; lower than operator (1MB)
      // because we are read-only and producing analysis, not building.
      maxTotalBytes: 512_000,
      toolNames: ['read', 'grep', 'find', 'ls', 'bash', 'websearch', 'webfetch', 'load_skill'],
    },
  }
}
@@ -0,0 +1,73 @@
1
+ import type { LoadableSkill } from '@/plugin'
2
+
3
/** Name the reviewer passes to `load_skill` to select this skill. */
export const CODE_REVIEW_SKILL_NAME = 'code-review'

/** One-line summary of the skill's scope. */
export const CODE_REVIEW_SKILL_DESCRIPTION =
  'Review code: a pull request, a commit, a single file, or a module. Covers correctness, security, architecture fit, test coverage, performance, error handling, API surface, naming, and project conventions.'

/**
 * Markdown body of the code-review skill.
 *
 * The branch instructions deliberately avoid hard-coding `main`: the base
 * branch is resolved first, so the commands also work on repositories whose
 * default branch has another name.
 */
export const CODE_REVIEW_SKILL_CONTENT = `# code-review

You have been asked to review code. Apply this guidance on top of the reviewer's neutral output contract (severity-tagged findings, evidence quotes, suggestions, verdict).

## How to acquire the target

- **PR URL or number** — fetch the diff and the description:
  - \`gh pr diff <n>\` for the unified diff
  - \`gh pr view <n>\` for title, body, labels, linked issues, checks
  - \`gh api /repos/<owner>/<repo>/pulls/<n>\` for the structured payload when you need machine-readable fields
- **Commit SHA** — \`git show <sha>\` and \`git show <sha> --stat\` for the scope.
- **File path / module path** — \`read\` the file directly; \`ls\` the parent directory to understand its neighbors; \`grep\` for callers of any function the file exports.
- **Branch name** — resolve the base branch first; do not assume it is \`main\`. Use the base the caller names, otherwise \`git rev-parse --abbrev-ref origin/HEAD\` (prints e.g. \`origin/main\`). Then \`git log <branch> ^<base> --oneline\` to enumerate commits and \`git diff <base>...<branch>\` for the cumulative change.

## How to build context

A finding without context is noise. Before forming findings:

1. **Read the change description.** PR body, commit messages, linked issues. The author told you what they intended — verify the code matches.
2. **Read adjacent code.** A change to one function means reading callers and callees. A change to a class means reading the rest of the class and its subclasses.
3. **Read the project's conventions.** \`AGENTS.md\`, \`CONTRIBUTING.md\`, \`CLAUDE.md\`, \`README.md\`, the test layout, the linter config. Deviation from established convention is a finding worth raising; following convention is not worth praising.
4. **Read the tests.** Existing tests show what the project considers important to verify. New tests show what the author considers important to lock in. The gap between them is often where the bugs hide.

## What to look for

Prioritize in this order:

1. **Correctness.** Does the change do what its description claims? Off-by-one errors, missing null/undefined handling, race conditions, incorrect error propagation, broken invariants.
2. **Security.** Injection vectors (SQL, shell, HTML), missing authz/authn checks, secret leakage in logs or error messages, unsafe deserialization, SSRF, path traversal, time-of-check-time-of-use. Cite OWASP / CWE / RFC by number when relevant; verify with \`websearch\` or \`webfetch\` before asserting.
3. **Architecture fit.** Does the change respect existing layering? Does it introduce a new dependency where the existing pattern would have worked? Does it duplicate logic that already exists elsewhere in the repo?
4. **Test coverage.** New behavior should have new tests. Edge cases the description names should be tested. If existing tests were deleted or skipped, that is a blocker absent a stated reason.
5. **Error handling.** Empty catch blocks, swallowed errors, errors converted to silent fallbacks, retry loops without bounded backoff, missing timeouts on external calls.
6. **Performance.** Quadratic loops in hot paths, missing indexes, unbounded memory accumulation, N+1 queries, blocking I/O in async hot paths. Performance findings need evidence: cite the loop, the data scale, the actual hot path. "Could be slow" without evidence is not a finding.
7. **API surface.** Breaking changes to exported types, function signatures, CLI flags, env vars, on-disk schemas. Are they documented? Versioned? Migration noted in CHANGELOG / release notes?
8. **Naming.** Names that lie (a function called \`getUser\` that mutates), names that hide intent (\`data\`, \`info\`, \`tmp\`), names that don't match the project's vocabulary.

## What NOT to find

- **Formatter / linter territory.** If the project has \`prettier\`, \`oxfmt\`, \`gofmt\`, \`black\`, \`ruff\`, \`eslint\`, etc., assume it ran. Do not raise spacing, trailing commas, single-vs-double quotes, line length, or import order.
- **Settled convention objections.** If the project uses tabs, four-space indent, camelCase vs snake_case, etc., and the change matches, that is not a finding. Only the deviation is.
- **Generic best-practice essays.** "Consider adding more tests" without naming a specific untested branch is noise. "Improve error handling" without pointing at a specific swallowed error is noise.
- **Restating the code.** "This function reads the file and returns its contents" is not a finding.

## Severity hints specific to code

- **blocker** — Correctness bug that will misbehave for users. Security vulnerability. Broken backward compatibility without migration. Crashing path on common input. Deleted tests without justification.
- **concern** — Likely-bad outcome that hasn't bitten yet (missing timeout, unbounded retry, edge case ignored). Test gap on the new behavior. Architectural deviation that compounds.
- **nit** — Naming, micro-readability, suboptimal-but-correct code. Optional. The author can decline and you should not push back.
- **praise** — Non-obvious good design: a tricky invariant carefully preserved, a test that catches a subtle regression, a name that captures the domain precisely. Rare on purpose.

## Verdict mapping

- **approve** — Zero blockers. Concerns are minor, isolated, or already discussed.
- **request-changes** — At least one blocker, OR a load-bearing concern that needs an answer before this lands.
- **comment** — Mixed signal: useful observations without a clear approve/reject. Common on large refactors where you reviewed part of the change, or on early-draft PRs where the author asked for direction more than approval.

## Final output

Return findings inside the reviewer's neutral \`<review>\` block. Do NOT invent your own output format. The parent agent parses the structured shape.
`
68
+
69
/**
 * The code-review skill as a `LoadableSkill`: bundles this module's name,
 * description and markdown content constants into one object.
 */
export const CODE_REVIEW_SKILL: LoadableSkill = {
  name: CODE_REVIEW_SKILL_NAME,
  description: CODE_REVIEW_SKILL_DESCRIPTION,
  content: CODE_REVIEW_SKILL_CONTENT,
}
@@ -0,0 +1,68 @@
1
+ import type { LoadableSkill } from '@/plugin'
2
+
3
/** Name the reviewer passes to `load_skill` to select this skill. */
export const GENERAL_REVIEW_SKILL_NAME = 'general'

/** One-line summary of the skill's scope. */
export const GENERAL_REVIEW_SKILL_DESCRIPTION =
  'Fallback for review targets that do not fit a specific domain skill: a written argument, a proposal, a draft, a mixed-format artifact. Apply the universal review philosophy without domain-specific shortcuts.'

/**
 * Markdown body of the fallback review skill.
 *
 * "How to acquire the target" asks for a lone `blocker` finding paired with a
 * `comment` verdict when the caller must supply the target. "Verdict mapping"
 * states that case as an explicit exception so the two sections agree.
 */
export const GENERAL_REVIEW_SKILL_CONTENT = `# general

You have been asked to review something that does not clearly fit a specific domain skill (not a code PR, not a plan, not a design doc, not docs — or it is a mix). Apply the universal review philosophy on top of the reviewer's neutral output contract.

## How to acquire the target

- **A URL** — \`webfetch\` it. If it is a private resource the fetch cannot reach, say so in \`<summary>\` and review what was provided in the payload.
- **A file path** — \`read\` it. \`ls\` the parent directory if siblings might be relevant.
- **Inline text in the payload** — read the payload carefully; quote from it when forming evidence.
- **A reference to something the caller has** — ask the caller to provide it. Return a single \`blocker\` finding describing what you need and a \`comment\` verdict.

## How to read carefully

A general review is the hardest because there are no domain shortcuts. Replace shortcuts with discipline:

1. **State the target's purpose in your own words.** What is the artifact trying to achieve? Who is it for? Put this in \`<summary>\`. If you cannot state it after reading, that itself is a finding — the artifact does not communicate its purpose.
2. **Identify the load-bearing claims.** What does the artifact assert that, if wrong, would invalidate the whole thing? List them mentally before looking for issues.
3. **Stress-test the load-bearing claims.** For each one: is the evidence sufficient? Are the assumptions stated? Are the counter-arguments addressed?
4. **Stress-test the boundaries.** Where does the artifact's argument or design stop applying? Does it acknowledge that boundary, or does it overgeneralize?
5. **Stress-test the audience fit.** Will the intended reader understand it? Is the prerequisite knowledge stated? Are the unstated assumptions reasonable for that audience?

## What to look for

- **Internal contradiction.** Two statements that cannot both be true. The artifact must reconcile them or pick one.
- **Unsupported claims.** Any assertion the artifact relies on but does not justify. The author may have a reason — say so and ask, do not assume incompetence.
- **Hidden assumptions.** Things the argument quietly requires to be true but does not state. These are the most common failure mode in general writing.
- **Missing alternatives.** If the artifact recommends X, did it explain why not Y? A serious proposal acknowledges the alternatives it rejected.
- **Scope drift.** The artifact promises to cover A but spends half its bytes on B. Either the scope is wrong or the title is wrong.
- **Verifiability.** If the artifact claims success criteria, are they measurable? "Better performance" with no metric is unverifiable.
- **Logical structure.** Premises → reasoning → conclusion. Where the chain breaks, point at the break.

## What NOT to find

- **Stylistic preferences.** Sentence rhythm, word choice variation, paragraph length. Skip unless they actively impede understanding.
- **Re-summarizing the artifact as a finding.** "This document discusses X" is not a review.
- **Generic feedback.** "Could be clearer" without pointing at a specific passage is noise.
- **Disagreements that are taste, not error.** If the author chose path A and you would have chosen B, that is not a finding unless A is actually worse for a stated reason.

## Severity hints

- **blocker** — A logical break, a fatal contradiction, a load-bearing claim that is verifiably false, an audience-fit problem so severe the intended reader cannot use the artifact.
- **concern** — An unsupported claim that needs justification, a missing alternative that weakens the recommendation, a scope ambiguity that will mislead readers.
- **nit** — A small clarity issue, a passage that could be tightened, a minor inconsistency.
- **praise** — A non-obvious insight, a tricky trade-off well-handled, a passage that earns the reader's trust. Rare.

## Verdict mapping

- **approve** — No blockers. The artifact stands on its own.
- **request-changes** — At least one blocker in the artifact itself. Exception: when the only finding is a \`blocker\` asking the caller to supply a target you could not obtain (see "How to acquire the target"), the verdict is \`comment\` — nothing has been reviewed yet.
- **comment** — Useful observations without a clean accept/reject. Common for early drafts, exploratory documents, or partial reviews.

## Final output

Return findings inside the reviewer's neutral \`<review>\` block. Do NOT invent your own output format.
`
63
+
64
/**
 * The fallback review skill as a `LoadableSkill`: bundles this module's name,
 * description and markdown content constants into one object.
 */
export const GENERAL_REVIEW_SKILL: LoadableSkill = {
  name: GENERAL_REVIEW_SKILL_NAME,
  description: GENERAL_REVIEW_SKILL_DESCRIPTION,
  content: GENERAL_REVIEW_SKILL_CONTENT,
}
@@ -6,7 +6,7 @@ import type {
6
6
  } from 'agent-messenger/discordbot'
7
7
 
8
8
  import type { ChannelAdapterConfig } from '@/channels/schema'
9
- import type { InboundMessage } from '@/channels/types'
9
+ import type { InboundAttachment, InboundMessage } from '@/channels/types'
10
10
 
11
11
  export type InboundDropReason =
12
12
  | 'self_author' // event.author.id === botUserId; we never route our own messages back to ourselves
@@ -35,7 +35,7 @@ export function classifyInbound(
35
35
  if (botUserId !== null && event.author.id === botUserId) {
36
36
  return { kind: 'drop', reason: 'self_author' }
37
37
  }
38
- const text = inboundText(event)
38
+ const { text, attachments } = splitInbound(event)
39
39
  if (text === '') return { kind: 'drop', reason: 'empty_content' }
40
40
 
41
41
  const isDm = event.guild_id === undefined
@@ -80,6 +80,7 @@ export function classifyInbound(
80
80
  chat: event.channel_id,
81
81
  thread: null,
82
82
  text,
83
+ ...(attachments.length > 0 ? { attachments } : {}),
83
84
  externalMessageId: event.id,
84
85
  authorId: event.author.id,
85
86
  // Discord's post-2023 username system allows pure-numeric handles (e.g.
@@ -107,38 +108,45 @@ function isReplyToBot(event: DiscordGatewayMessageCreateEvent, botUserId: string
107
108
  return (event.mentions ?? []).some((m) => m.id === botUserId)
108
109
  }
109
110
 
110
- function inboundText(event: DiscordGatewayMessageCreateEvent): string {
111
- const mediaSummary = summarizeDiscordMedia(event)
112
- if (mediaSummary.length === 0) return event.content
113
- const summary = `[Discord message with ${mediaSummary.join('; ')}]`
114
- return event.content === '' ? summary : `${event.content}\n${summary}`
111
+ type SplitInbound = { text: string; attachments: InboundAttachment[] }
112
+
113
+ function splitInbound(event: DiscordGatewayMessageCreateEvent): SplitInbound {
114
+ const attachments = describeDiscordMedia(event)
115
+ if (attachments.length === 0) return { text: event.content, attachments: [] }
116
+ const summary = attachments.map(renderPlaceholder).join('\n')
117
+ const text = event.content === '' ? summary : `${event.content}\n${summary}`
118
+ return { text, attachments }
115
119
  }
116
120
 
117
- function summarizeDiscordMedia(event: DiscordGatewayMessageCreateEvent): string[] {
121
+ function describeDiscordMedia(event: DiscordGatewayMessageCreateEvent): InboundAttachment[] {
118
122
  return [
119
- ...(event.attachments ?? []).map(summarizeAttachment),
120
- ...(event.embeds ?? []).map(summarizeEmbed),
121
- ...(event.sticker_items ?? []).map(summarizeSticker),
122
- ]
123
+ ...(event.attachments ?? []).map(describeAttachment),
124
+ ...(event.embeds ?? []).map(describeEmbed),
125
+ ...(event.sticker_items ?? []).map(describeSticker),
126
+ ].map((attachment, index) => ({ ...attachment, id: index + 1 }))
123
127
  }
124
128
 
125
- function summarizeAttachment(attachment: DiscordFile): string {
126
- return compactJoin(' ', [
127
- `attachment: ${attachment.filename}`,
128
- attachment.content_type === undefined ? undefined : `(${attachment.content_type})`,
129
- attachment.url,
130
- ])
129
+ function describeAttachment(attachment: DiscordFile): Omit<InboundAttachment, 'id'> {
130
+ return {
131
+ kind: 'file',
132
+ ref: attachment.url,
133
+ filename: attachment.filename,
134
+ ...(attachment.content_type !== undefined ? { mimetype: attachment.content_type } : {}),
135
+ }
131
136
  }
132
137
 
133
- function summarizeEmbed(embed: DiscordGatewayEmbed): string {
138
+ function describeEmbed(embed: DiscordGatewayEmbed): Omit<InboundAttachment, 'id'> {
134
139
  const label = embed.title ?? embed.description ?? embed.url ?? embed.type ?? 'embed'
135
- return compactJoin(' ', ['embed:', label, embed.url !== undefined && embed.url !== label ? embed.url : undefined])
140
+ return { kind: 'embed', ref: embed.url ?? '', filename: label }
136
141
  }
137
142
 
138
- function summarizeSticker(sticker: DiscordGatewayStickerItem): string {
139
- return `sticker: ${sticker.name}`
143
+ function describeSticker(sticker: DiscordGatewayStickerItem): Omit<InboundAttachment, 'id'> {
144
+ return { kind: 'sticker', ref: '', filename: sticker.name }
140
145
  }
141
146
 
142
- function compactJoin(separator: string, parts: Array<string | undefined>): string {
143
- return parts.filter((part) => part !== undefined && part !== '').join(separator)
147
+ function renderPlaceholder(attachment: InboundAttachment): string {
148
+ const parts: string[] = [`Discord attachment #${attachment.id}: ${attachment.kind}`]
149
+ if (attachment.mimetype !== undefined) parts.push(attachment.mimetype)
150
+ if (attachment.filename !== undefined) parts.push(`name=${attachment.filename}`)
151
+ return `[${parts.join(' ')}]`
144
152
  }
@@ -44,11 +44,17 @@ export function createGithubWebhookHandler(options: GithubWebhookHandlerOptions)
44
44
  if (!isGithubEventAllowed(options.allowlist(), event, action)) return ok()
45
45
 
46
46
  const selfId = options.selfId()
47
- const author = readAuthor(payload)
48
- if (selfId !== null && author !== null && String(author.id) === selfId) return ok()
47
+ const selfLogin = options.selfLogin()
48
+ const author = readAuthor(event, payload)
49
+ if (author !== null && isSelfAuthor(author, selfId, selfLogin)) {
50
+ options.logger.info(
51
+ `[github] dropped self-authored ${event}${action !== null ? `.${action}` : ''} from @${author.login}`,
52
+ )
53
+ return ok()
54
+ }
49
55
 
50
56
  const teamIsBotMember = await resolveTeamMembership(event, payload, options)
51
- const classified = classifyGithubInbound(event, payload, options.selfLogin(), {
57
+ const classified = classifyGithubInbound(event, payload, selfLogin, {
52
58
  teamIsBotMember,
53
59
  })
54
60
  if (classified === null) return ok()
@@ -357,13 +363,63 @@ function readRepository(payload: Record<string, unknown>): { owner: string; name
357
363
  return { owner: ownerLogin, name }
358
364
  }
359
365
 
360
- function readAuthor(payload: Record<string, unknown>): GithubUser | null {
361
- const candidates = [payload.comment, payload.issue, payload.pull_request, payload.discussion, payload.review]
362
- for (const candidate of candidates) {
366
+ function readAuthor(event: string, payload: Record<string, unknown>): GithubUser | null {
367
+ for (const candidate of eventAuthorCandidates(event, payload)) {
363
368
  const user = readUser(readRecord(candidate)?.user)
364
369
  if (user !== null) return user
365
370
  }
366
- return null
371
+ // Every GitHub webhook payload carries `sender` — the actor who triggered the
372
+ // delivery. It is the universal fallback so events not enumerated above (and
373
+ // any future ones the user adds to eventAllowlist) still drop self-authored
374
+ // deliveries instead of slipping past the guard.
375
+ return readUser(payload.sender)
376
+ }
377
+
378
+ // Maps each event to the entity whose `user` is the true author of THIS event,
379
+ // listed before broader containers. A pull_request_review payload ships both
380
+ // `pull_request` (the PR author) and `review` (the reviewer); the self-author
381
+ // drop must see the reviewer, so `review` must come first. PR #455's flat order
382
+ // (`pull_request` before `review`) made a self-review on someone else's PR
383
+ // resolve to the PR author, slip past the drop, and loop (see PR #460).
384
+ //
385
+ // `pull_request` and `pull_request_review_thread` carry only the `pull_request`
386
+ // container, whose `user` is the PR OPENER — not the actor of this delivery.
387
+ // For these events the self-author question is "who triggered the action?"
388
+ // (review_requested, edited, reopened, resolved, …), which is always
389
+ // `payload.sender`, never the opener. Mapping them to `[]` makes readAuthor
390
+ // skip the opener and fall through to the `sender` fallback. PR #462's
391
+ // `['pull_request']` resolved to the opener, so a human action on a
392
+ // bot-opened PR matched the bot and was wrongly dropped (the inbound landed
393
+ // as awareness-only "Recent context" and the agent never replied).
394
+ const PRIMARY_AUTHOR_KEYS: Record<string, readonly string[]> = {
395
+ issue_comment: ['comment'],
396
+ pull_request_review_comment: ['comment'],
397
+ discussion_comment: ['comment'],
398
+ commit_comment: ['comment'],
399
+ pull_request_review: ['review'],
400
+ pull_request_review_thread: [],
401
+ issues: ['issue'],
402
+ pull_request: [],
403
+ discussion: ['discussion'],
404
+ release: ['release'],
405
+ }
406
+
407
+ const FALLBACK_AUTHOR_KEYS = ['comment', 'review', 'issue', 'pull_request', 'discussion', 'release'] as const
408
+
409
+ function eventAuthorCandidates(event: string, payload: Record<string, unknown>): unknown[] {
410
+ const keys = PRIMARY_AUTHOR_KEYS[event] ?? FALLBACK_AUTHOR_KEYS
411
+ return keys.map((key) => payload[key])
412
+ }
413
+
414
+ // Matches by id OR login. Issue #452 captured a self-responding loop where
415
+ // the id-only guard didn't fire and the bot replied to its own comments ~8
416
+ // times in a row. Login is the second line of defense and aligns with the
417
+ // slack/discord/telegram/kakaotalk adapters, which all drop self-authored
418
+ // events at the classifier layer.
419
+ function isSelfAuthor(author: GithubUser, selfId: string | null, selfLogin: string | null): boolean {
420
+ if (selfId !== null && String(author.id) === selfId) return true
421
+ if (selfLogin !== null && author.login === selfLogin) return true
422
+ return false
367
423
  }
368
424
 
369
425
  type GithubUser = { login: string; id: number; type?: string }
@@ -53,6 +53,14 @@ export type GithubAdapterOptions = {
53
53
  // Test-only: replaces the wall-clock sleep used for the registration
54
54
  // delay above. Production leaves it undefined and we use `setTimeout`.
55
55
  sleep?: (ms: number) => Promise<void>
56
+ // How often to proactively refresh the token and update GH_TOKEN
57
+ // when the adapter is running but has not made an outbound API call
58
+ // recently. Zero or a negative value disables the background refresh entirely.
59
+ // Default: 30 minutes.
60
+ tokenRefreshIntervalMs?: number
61
+ // Test-only: replaces `setInterval` so tests can control when the
62
+ // background refresh fires without waiting on real wall-clock time.
63
+ setInterval?: (handler: () => void, ms: number) => { clear: () => void }
56
64
  }
57
65
 
58
66
  export type GithubAdapter = {
@@ -68,6 +76,7 @@ const consoleLogger: GithubAdapterLogger = {
68
76
  }
69
77
 
70
78
  const DEFAULT_WEBHOOK_REGISTRATION_DELAY_MS = 2_000
79
+ const DEFAULT_TOKEN_REFRESH_INTERVAL_MS = 30 * 60 * 1000
71
80
 
72
81
  export function createGithubAdapter(options: GithubAdapterOptions): GithubAdapter {
73
82
  const logger = options.logger ?? consoleLogger
@@ -83,6 +92,7 @@ export function createGithubAdapter(options: GithubAdapterOptions): GithubAdapte
83
92
  let selfLogin: string | null = null
84
93
  let started = false
85
94
  let managedHooks: ReadonlyArray<{ repo: string; hookId: number }> = []
95
+ let tokenRefreshTimer: { clear: () => void } | null = null
86
96
  const workspaceByChat = new Map<string, string>()
87
97
 
88
98
  const rememberWorkspace = (workspace: string, chat: string): void => {
@@ -168,6 +178,24 @@ export function createGithubAdapter(options: GithubAdapterOptions): GithubAdapte
168
178
  // automatically when within 5 minutes of expiry.
169
179
  process.env.GH_TOKEN = await auth.token()
170
180
  started = true
181
+ // Keep GH_TOKEN warm even when the adapter is only receiving inbound
182
+ // webhooks and not making outbound API calls. This prevents `gh` CLI
183
+ // calls from the agent from failing with 401 after the token expires.
184
+ const tokenRefreshIntervalMs = options.tokenRefreshIntervalMs ?? DEFAULT_TOKEN_REFRESH_INTERVAL_MS
185
+ if (tokenRefreshIntervalMs > 0) {
186
+ const refresh = () => {
187
+ tokenFn().catch((err) => {
188
+ logger.error(`[github] periodic token refresh failed: ${err instanceof Error ? err.message : String(err)}`)
189
+ })
190
+ }
191
+ const setIntervalFn =
192
+ options.setInterval ??
193
+ ((handler: () => void, ms: number) => {
194
+ const timer = setInterval(handler, ms)
195
+ return { clear: () => clearInterval(timer) }
196
+ })
197
+ tokenRefreshTimer = setIntervalFn(refresh, tokenRefreshIntervalMs)
198
+ }
171
199
  logger.info(`[github] webhook listening on port ${options.configRef().webhookPort} as @${self.login}`)
172
200
  // Best-effort: App-only preflight that compares the installation's granted
173
201
  // permissions against the configured eventAllowlist and warns about gaps.
@@ -241,6 +269,10 @@ export function createGithubAdapter(options: GithubAdapterOptions): GithubAdapte
241
269
  logDeregistrationOutcome(logger, deregistration)
242
270
  managedHooks = []
243
271
  }
272
+ if (tokenRefreshTimer !== null) {
273
+ tokenRefreshTimer.clear()
274
+ tokenRefreshTimer = null
275
+ }
244
276
  await auth.dispose()
245
277
  delete process.env.GH_TOKEN
246
278
  server = null