npm - pullfrog - Versions diffs - 0.1.5 → 0.1.7 - Mend

pullfrog 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/agents/postRun.d.ts +21 -0
package/dist/agents/sessionLabeler.d.ts +38 -18
package/dist/agents/subagentModels.d.ts +19 -0
package/dist/cli.mjs +678 -278
package/dist/index.js +662 -264
package/dist/internal.js +151 -59
package/dist/models.d.ts +63 -3
package/dist/utils/agent.d.ts +5 -2
package/dist/utils/apiKeys.d.ts +18 -0
package/dist/utils/instructions.d.ts +19 -0
package/dist/utils/learnings.d.ts +20 -9
package/dist/utils/normalizeEnv.d.ts +21 -1
package/dist/utils/runContext.d.ts +16 -0
package/dist/utils/subprocess.d.ts +40 -0
package/dist/utils/timer.d.ts +11 -0
package/package.json +1 -1

package/dist/internal.js CHANGED

@@ -13,7 +13,8 @@ var providers = {
         displayName: "Claude Opus",
         resolve: "anthropic/claude-opus-4-7",
         openRouterResolve: "openrouter/anthropic/claude-opus-4.7",
-        preferred: true
+        preferred: true,
+        subagentModel: "claude-sonnet"
       },
       "claude-sonnet": {
         displayName: "Claude Sonnet",
@@ -35,12 +36,23 @@ var providers = {
         displayName: "GPT",
         resolve: "openai/gpt-5.5",
         openRouterResolve: "openrouter/openai/gpt-5.5",
-        preferred: true
+        preferred: true,
+        subagentModel: "gpt-5.4"
       },
       "gpt-pro": {
         displayName: "GPT Pro",
         resolve: "openai/gpt-5.5-pro",
-        openRouterResolve: "openrouter/openai/gpt-5.5-pro"
+        openRouterResolve: "openrouter/openai/gpt-5.5-pro",
+        subagentModel: "gpt"
+      },
+      // hidden subagent target — `gpt` lenses run against this. surfacing
+      // it in the picker would just confuse users (it's the prior-flagship,
+      // and they already have `gpt` and `gpt-mini` to choose from).
+      "gpt-5.4": {
+        displayName: "GPT 5.4",
+        resolve: "openai/gpt-5.4",
+        openRouterResolve: "openrouter/openai/gpt-5.4",
+        hidden: true
       },
       "gpt-mini": {
         displayName: "GPT Mini",
@@ -78,7 +90,8 @@ var providers = {
         displayName: "Gemini Pro",
         resolve: "google/gemini-3.1-pro-preview",
         openRouterResolve: "openrouter/google/gemini-3.1-pro-preview",
-        preferred: true
+        preferred: true,
+        subagentModel: "gemini-flash"
       },
       "gemini-flash": {
         displayName: "Gemini Flash",
@@ -166,7 +179,8 @@ var providers = {
       "claude-opus": {
         displayName: "Claude Opus",
         resolve: "opencode/claude-opus-4-7",
-        openRouterResolve: "openrouter/anthropic/claude-opus-4.7"
+        openRouterResolve: "openrouter/anthropic/claude-opus-4.7",
+        subagentModel: "claude-sonnet"
       },
       "claude-sonnet": {
         displayName: "Claude Sonnet",
@@ -181,12 +195,21 @@ var providers = {
       gpt: {
         displayName: "GPT",
         resolve: "opencode/gpt-5.5",
-        openRouterResolve: "openrouter/openai/gpt-5.5"
+        openRouterResolve: "openrouter/openai/gpt-5.5",
+        subagentModel: "gpt-5.4"
       },
       "gpt-pro": {
         displayName: "GPT Pro",
         resolve: "opencode/gpt-5.5-pro",
-        openRouterResolve: "openrouter/openai/gpt-5.5-pro"
+        openRouterResolve: "openrouter/openai/gpt-5.5-pro",
+        subagentModel: "gpt"
+      },
+      // hidden subagent target — see openai provider above for context.
+      "gpt-5.4": {
+        displayName: "GPT 5.4",
+        resolve: "opencode/gpt-5.4",
+        openRouterResolve: "openrouter/openai/gpt-5.4",
+        hidden: true
       },
       "gpt-mini": {
         displayName: "GPT Mini",
@@ -209,7 +232,8 @@ var providers = {
       "gemini-pro": {
         displayName: "Gemini Pro",
         resolve: "opencode/gemini-3.1-pro",
-        openRouterResolve: "openrouter/google/gemini-3.1-pro-preview"
+        openRouterResolve: "openrouter/google/gemini-3.1-pro-preview",
+        subagentModel: "gemini-flash"
       },
       "gemini-flash": {
         displayName: "Gemini Flash",
@@ -241,6 +265,20 @@ var providers = {
       }
     }
   }),
+  bedrock: provider({
+    displayName: "Amazon Bedrock",
+    envVars: ["AWS_BEARER_TOKEN_BEDROCK", "AWS_REGION", "BEDROCK_MODEL_ID"],
+    models: {
+      // single routing entry — the actual Bedrock model ID is read from
+      // BEDROCK_MODEL_ID at run time. see ModelRouting docs for why we
+      // don't catalog individual Bedrock models.
+      byok: {
+        displayName: "Amazon Bedrock",
+        resolve: "bedrock",
+        routing: "bedrock"
+      }
+    }
+  }),
   openrouter: provider({
     displayName: "OpenRouter",
     envVars: ["OPENROUTER_API_KEY"],
@@ -249,7 +287,8 @@ var providers = {
         displayName: "Claude Opus",
         resolve: "openrouter/anthropic/claude-opus-4.7",
         openRouterResolve: "openrouter/anthropic/claude-opus-4.7",
-        preferred: true
+        preferred: true,
+        subagentModel: "claude-sonnet"
       },
       "claude-sonnet": {
         displayName: "Claude Sonnet",
@@ -264,12 +303,21 @@ var providers = {
       gpt: {
         displayName: "GPT",
         resolve: "openrouter/openai/gpt-5.5",
-        openRouterResolve: "openrouter/openai/gpt-5.5"
+        openRouterResolve: "openrouter/openai/gpt-5.5",
+        subagentModel: "gpt-5.4"
       },
       "gpt-pro": {
         displayName: "GPT Pro",
         resolve: "openrouter/openai/gpt-5.5-pro",
-        openRouterResolve: "openrouter/openai/gpt-5.5-pro"
+        openRouterResolve: "openrouter/openai/gpt-5.5-pro",
+        subagentModel: "gpt"
+      },
+      // hidden subagent target — see openai provider above for context.
+      "gpt-5.4": {
+        displayName: "GPT 5.4",
+        resolve: "openrouter/openai/gpt-5.4",
+        openRouterResolve: "openrouter/openai/gpt-5.4",
+        hidden: true
       },
       "gpt-mini": {
         displayName: "GPT Mini",
@@ -297,7 +345,8 @@ var providers = {
       "gemini-pro": {
         displayName: "Gemini Pro",
         resolve: "openrouter/google/gemini-3.1-pro-preview",
-        openRouterResolve: "openrouter/google/gemini-3.1-pro-preview"
+        openRouterResolve: "openrouter/google/gemini-3.1-pro-preview",
+        subagentModel: "gemini-flash"
       },
       "gemini-flash": {
         displayName: "Gemini Flash",
@@ -370,7 +419,13 @@ var modelAliases = Object.entries(providers).flatMap(
     openRouterResolve: def.openRouterResolve,
     preferred: def.preferred ?? false,
     isFree: def.isFree ?? false,
-    fallback: def.fallback
+    fallback: def.fallback,
+    routing: def.routing,
+    // subagentModel is stored as an alias key local to the provider; expand
+    // here to a fully-qualified slug so callers can look up the target alias
+    // directly without re-deriving the provider.
+    subagentModel: def.subagentModel ? `${providerKey}/${def.subagentModel}` : void 0,
+    hidden: def.hidden ?? false
   }))
 );
 function resolveModelSlug(slug) {
@@ -550,18 +605,24 @@ For simple, well-defined tasks, skip the plan phase and go straight to build.`
    - resolve addressed threads via \`${t("resolve_review_thread")}\`
    - call \`${t("report_progress")}\` with a brief summary (or the exact push error if push failed)`
     },
-    // Review and IncrementalReview use the multi-lens orchestrator pattern
-    // (canonical source: .claude/commands/anneal.md). The orchestrator does
-    // triage → parallel read-only subagent fan-out → aggregate → draft comments
-    // → submit. For someone else's PR, parallel lenses (correctness, security,
-    // research-validated claims, user-journey, etc.) provide breadth across
-    // angles that a single subagent can't carry coherently. Build mode keeps
-    // a single fresh-eyes subagent (different problem shape — orchestrator
-    // wrote the code and bias-mitigation comes from delegating to one
-    // subagent that doesn't share the implementation context).
-    // Deliberate omission vs canonical /anneal: severity categorization in the
-    // final message (the review body has its own CAUTION/IMPORTANT framing
-    // instead of a severity table).
+    // Review and IncrementalReview use a 0-or-2+ lens pattern. The default is
+    // 0 lenses (orchestrator handles the review solo). Multi-lens (2+
+    // reviewfrog subagents in parallel) only fires for substantive PRs or
+    // high-stakes-subsystem touches — and when it fires, ALL lenses must
+    // dispatch in a single assistant turn or the parallelism win disappears.
+    // We never dispatch exactly one lens: a single lens is just a worse,
+    // slower version of doing the work yourself.
+    //
+    // Build mode self-review is a different problem shape: the orchestrator
+    // wrote the code, so bias-mitigation comes from delegating to one
+    // fresh-eyes subagent that doesn't share the implementation context. A
+    // single subagent there is appropriate; the 0-or-2+ rule applies only to
+    // the Review/IncrementalReview lens fan-out where independence between
+    // perspectives is what's being purchased.
+    //
+    // Deliberate omission vs canonical /anneal: severity categorization in
+    // the final message (the review body has its own CAUTION/IMPORTANT
+    // framing instead of a severity table).
     {
       name: "Review",
       description: "Review code, PRs, or implementations; provide feedback or suggestions; identify issues; or check code quality, style, and correctness",
@@ -571,9 +632,9 @@ For simple, well-defined tasks, skip the plan phase and go straight to build.`
 2. **checkout**: call \`${t("checkout_pr")}\` \u2014 this returns PR metadata and a \`diffPath\`. read the diff TOC end-to-end and treat its file line ranges as your coverage checklist.
-3. **triage**: orient yourself on the PR \u2014 identify *what kind of thing this is* (domain it touches, seams it crosses, external contracts it depends on, user-facing surfaces it changes). orientation only \u2014 defer specific defect-hunting to the subagents; pre-reviewing biases the lenses you pick. use \`${t("get_pull_request")}\` and other read-only GitHub tools for additional context if needed.
+3. **triage**: orient yourself on the PR \u2014 identify *what kind of thing this is* (domain it touches, seams it crosses, external contracts it depends on, user-facing surfaces it changes). pull as much context as you need to render a confident, well-grounded review: read related files, grep for callers of changed symbols, check tests that exercise the touched paths, fetch related GitHub state. **you are the synthesizer** \u2014 never delegate understanding to subagents.
-   if the PR is **genuinely trivial**, skip steps 4\u20135 entirely and submit a \`No new issues found.\` review per step 6. there's no value in dispatching even one lens for a typo.
+   if the PR is **genuinely trivial**, skip the fan-out entirely and submit a \`No new issues found.\` review per step 7.
    "Genuinely trivial" (skip):
    - single-word doc typo, whitespace/format-only, comment-only across any number of files
@@ -592,23 +653,25 @@ For simple, well-defined tasks, skip the plan phase and go straight to build.`
    - any "typo fix" in user-facing copy that changes meaning ("approved" \u2192 "denied")
    - mixed diffs where a semantic 1-liner is buried in whitespace/formatting changes
-   When unsure, treat as non-trivial. The cost of one extra subagent is cents; the cost of a missed billing/auth/data bug is much more.
+4. **lens decision \u2014 0 or 2+, NEVER 1**.
+   The default is **0 lenses**: handle the review yourself end-to-end. Most PRs land here.
-   otherwise pick lenses by where the PR concentrates risk \u2014 **there's no fixed count**. lens count is judgment, not a formula. concrete shapes to anchor against:
+   Dispatch **2+ \`${REVIEWER_AGENT_NAME}\` lenses in parallel** ONLY when ALL of the following are true:
+   - the PR is substantive (>5 files changed AND >200 net lines), OR touches a high-stakes subsystem (auth, billing, payments, schema migration, webhooks, secrets, RBAC, multi-tenant isolation, cron/scheduling)
+   - you can name 2+ distinct concrete failure modes that warrant independent lenses (one lens per failure mode; orthogonal, not overlapping)
+   - parallel-orchestrated independent perspectives meaningfully outperform what you'd find solo
-   - **1 lens** \u2014 pure refactor / mechanical rename across many files (impact); new test file with no source change (test-integrity); small isolated bug fix (correctness); doc-only PR with non-trivial technical content (research-validated or holistic)
-   - **2\u20133 lenses (most PRs land here)** \u2014 new CRUD endpoint (correctness + security + test-integrity); new UI flow (user-journey + correctness); a single bug fix in a non-critical subsystem (correctness + test-integrity); design doc covering one domain (research-validated + correctness or holistic)
-   - **4\u20135 lenses (high-stakes subsystem touches)** \u2014 any billing/payments change (billing-subsystem + correctness + security + operational-readiness); new auth flow (auth-subsystem + correctness + security + test-integrity); schema migration (schema-migration-subsystem + correctness + operational-readiness + impact); cross-subsystem PR that touches billing AND auth AND schema (one subsystem lens per domain + correctness)
-   - **6+ lenses** \u2014 almost always a smell; you're either covering overlapping ground or this PR should have been split. push back via the review body rather than expanding lens count.
+   **NEVER dispatch exactly one lens.** A single lens is just a more expensive version of doing the work yourself with a worse model \u2014 it adds wall time and a context-handoff for no orthogonality benefit. Either you have at least two genuinely independent failure-mode hypotheses (dispatch all in one turn), or you don't (do the review yourself).
-   lenses come in two flavors, and you can mix them:
+   When you do go multi-lens, lens framings come in two flavors:
    - **themed lenses** \u2014 a perspective applied across the whole diff (correctness, security, user-journey, performance, etc.).
-   - **subsystem lenses** \u2014 a domain-scoped frame for high-stakes subsystems the PR touches (e.g. "the auth lens", "the billing lens", "the schema-migration lens"). a subsystem lens is "review the PR specifically for what could go wrong in this subsystem" and naturally combines theme + scope. **for high-stakes domains, lead with the subsystem lens rather than the generic themed equivalent** \u2014 "billing-subsystem" outperforms "correctness on billing code" because the framing primes the subagent to remember domain-specific failure modes (double-charges, refund races, currency rounding, dispute flows) the generic lens misses.
+   - **subsystem lenses** \u2014 a domain-scoped frame for high-stakes subsystems the PR touches (e.g. "the auth lens", "the billing lens", "the schema-migration lens"). **for high-stakes domains, lead with the subsystem lens rather than the generic themed equivalent** \u2014 "billing-subsystem" outperforms "correctness on billing code" because the framing primes the subagent to remember domain-specific failure modes (double-charges, refund races, currency rounding, dispute flows) the generic lens misses.
    starter menu (combine, omit, or invent your own):
    - **correctness & invariants** \u2014 bugs, races, error handling, edge cases, state-machine boundaries
-   - **impact** \u2014 when the PR removes features, deletes exports, renames identifiers, or changes architectural patterns: stale references in code, tests, docs (\`docs/\`, \`wiki/\`), comments, configs, UI
-   - **research-validated assumptions** \u2014 third-party API contracts, SDK semantics, framework directives, version-gated behavior. the subagent must verify load-bearing claims via web search and quote source URLs.
+   - **impact** \u2014 stale references in code/tests/docs/configs/UI after rename/remove
+   - **research-validated assumptions** \u2014 third-party API contracts, SDK semantics, framework directives, version-gated behavior. **only pick when the PR's correctness depends on the contract behaving a specific way** \u2014 not when the API is merely used. The bar is "if the third-party contract differs from what the diff assumes, the PR is incorrect." When dispatched, the subagent must verify load-bearing claims via web search and quote source URLs.
    - **security** \u2014 new endpoints, authZ, input validation, secrets handling, replay/CSRF/injection, cross-tenant isolation
    - **user-journey** \u2014 UX-touching flows: walk through happy path and failure modes as a user
    - **operational readiness** \u2014 observability, alerting, migrations (forward + rollback), feature flags, on-call burden
@@ -618,26 +681,36 @@ For simple, well-defined tasks, skip the plan phase and go straight to build.`
    - **holistic** \u2014 does the PR make sense as a whole? symmetric flows (delete for every create, rollback for every migration)?
    - **subsystem lenses** (invent as the PR demands) \u2014 auth, billing, payments, schema migration, webhooks, secrets, RBAC, multi-tenant isolation, cron/scheduling, etc.
-4. **fan out**: dispatch one \`${REVIEWER_AGENT_NAME}\` subagent per lens \u2014 its baked-in system prompt enforces the non-mutative + non-recursive contract (read-only file/search/web tools and read-only MCP queries; no writes, shell side effects, state-changing MCP calls, or nested subagent dispatch). when picking 2+ lenses, dispatch them in a **single assistant turn with multiple parallel subagent calls**; issuing one and awaiting reply before the next collapses the fan-out into a serial review. if a subagent errors out, times out, or returns nothing usable, retry once with the same lens; if it still fails, proceed with partial coverage and note the missing lens in the review body \u2014 do not skip step 4 entirely on a single subagent failure. each subagent gets:
+   The only subagent type is \`${REVIEWER_AGENT_NAME}\` \u2014 used for lens judgment work ("is this safe / correct / well-tested?"), runs on a mid-tier model.
+5. **fan out (only if step 4 said 2+ lenses)**: dispatch every \`${REVIEWER_AGENT_NAME}\` subagent for this run **IN A SINGLE ASSISTANT TURN, AS MULTIPLE PARALLEL TASK TOOL_USE BLOCKS IN ONE MESSAGE.**
+   \u26A0\uFE0F  CRITICAL \u2014 PARALLELISM IS THE ONLY REASON LENSES EXIST. \u26A0\uFE0F
+   The default tool-call behavior of Claude Code (and most agent runtimes) is **serial dispatch**: emit one Task call, await result, emit next, await, etc. This collapses your fan-out into a sequential review where each lens adds N \xD7 (orchestrator-think-time + lens-execution-time) to wall time. **YOU MUST OVERRIDE THIS DEFAULT.** Emit ALL of your Task tool_use blocks in the SAME assistant message, BEFORE you read ANY result from ANY of them. If you find yourself emitting one Task call, then thinking about the result, then emitting another \u2014 STOP and re-issue them all together. The whole point of going multi-lens is the wall-clock speedup from parallel execution; serial dispatch defeats it entirely.
+   \u2705 Right pattern: one assistant turn with N Task tool_use blocks \u2192 wait \u2192 N results arrive together \u2192 aggregate.
+   \u274C Wrong pattern: turn 1 = Task(lens A) \u2192 turn 2 (after A's result) = Task(lens B) \u2192 turn 3 (after B's result) = Task(lens C). This is the failure mode. Do not do this.
+   You can also include your own \`read\` / \`grep\` / \`webfetch\` calls in the SAME turn as the parallel \`${REVIEWER_AGENT_NAME}\` dispatches \u2014 concurrent context-pulling on the orchestrator side runs in parallel with the lens fan-out and costs zero extra wall time.
+   if a subagent errors out, times out, or returns nothing usable, retry once with the same lens; if it still fails, proceed with partial coverage and note the missing lens in the review body \u2014 do not skip the fan-out entirely on a single subagent failure. each subagent gets:
    - the diff path / target \u2014 reading the diff and the codebase is its job
    - **only one lens** \u2014 never a multi-section "review for X, Y, and Z" prompt
    - **a Task \`description\` set to the lens name** (e.g. \`"security"\`, \`"correctness"\`, \`"billing-subsystem"\`) \u2014 the harness reads this field to label the subagent's log lines so parallel runs can be told apart in CI output. without it, every subagent shows up as \`subagent#N\`.
-   - the read-only contract restated in your dispatch instructions so the rule is present twice (the subagent's system prompt also enforces it). The test: would this call still be a no-op if reverted? If not (PR comments, branch pushes, issue updates, set_output, label changes, dependency installs, etc.), don't make it.
    - if the lens touches external contracts, instruct the subagent to verify load-bearing claims via web search rather than trust training data, and to quote source URLs in its reasoning. action runs are non-interactive \u2014 there's no human in the loop to catch "I'm pretty sure Stripe does X."
    - ask the subagent to report findings with file paths and NEW line numbers from the diff so you can anchor inline comments without re-reading the entire diff.
    delegation discipline:
-   - do NOT lens-review the diff yourself in parallel with the subagents (your job is dispatch + comment-drafting; doing the lens work yourself reintroduces the bias the fan-out avoids)
    - do NOT summarize the PR for them (biases toward a validation frame)
    - do NOT hand them a curated reading list (let them discover scope)
    - do NOT pre-shape their output with a finding schema
    - do NOT mention the other lenses (independence is the point \u2014 overlapping findings are a strong signal)
-5. **aggregate & draft**: merge findings; de-dup overlaps (two lenses catching the same issue = higher-confidence signal); trace each finding yourself before accepting it. drop praise, style preferences, speculative/unverified claims, findings about pre-existing code unrelated to the PR (heuristic: if the finding's root cause lives in lines this PR added or modified, it's in scope; otherwise drop unless the PR plausibly introduced or amplified the regression), and anything not actionable. also drop **bloat-shaped findings** \u2014 proposed fixes that would add defensive checks for cases that can't happen, abstractions used once, comments restating obvious code, tests asserting tautologies, or "just-in-case" guards. subagents are fallible and bias toward recommending changes; the bar for an actionable inline comment is sound + correct + elegant. recommending a change that improves only one of the three (or worse, degrades elegance to nominally improve correctness) makes the codebase worse, not better.
+6. **aggregate & draft**: when the fan-out lands, merge findings; de-dup overlaps (two lenses catching the same issue = higher-confidence signal); trace each finding yourself before accepting it. drop praise, style preferences, speculative/unverified claims, findings about pre-existing code unrelated to the PR (heuristic: if the finding's root cause lives in lines this PR added or modified, it's in scope; otherwise drop unless the PR plausibly introduced or amplified the regression), and anything not actionable. also drop **bloat-shaped findings** \u2014 proposed fixes that would add defensive checks for cases that can't happen, abstractions used once, comments restating obvious code, tests asserting tautologies, or "just-in-case" guards. subagents are fallible and bias toward recommending changes; the bar for an actionable inline comment is sound + correct + elegant. recommending a change that improves only one of the three (or worse, degrades elegance to nominally improve correctness) makes the codebase worse, not better.
    for surviving findings, draft inline comments with NEW line numbers from the diff. every comment must be actionable, 2-3 sentences max. use GitHub permalink format for code references. for impact-analysis findings (stale references after rename/remove), report them in the review body ordered by severity (runtime breakage > incorrect docs > stale comments) rather than as inline comments unless they're anchored to a specific line.
-6. **submit**: ALWAYS submit exactly one review via \`${t("create_pull_request_review")}\`. Do NOT call \`report_progress\` \u2014 the review is the final record and the progress comment will be cleaned up automatically.
+7. **submit**: ALWAYS submit exactly one review via \`${t("create_pull_request_review")}\`. Do NOT call \`report_progress\` \u2014 the review is the final record and the progress comment will be cleaned up automatically.
    note: the first create_pull_request_review submission may error with a one-time diff-coverage nudge listing unread TOC regions. retry the same call to proceed \u2014 optionally after reading the listed ranges. the pre-flight will not block again this session.
@@ -665,10 +738,10 @@ For simple, well-defined tasks, skip the plan phase and go straight to build.`
 ${PR_SUMMARY_FORMAT}`
     },
-    // IncrementalReview shares Review's multi-lens orchestrator pattern but
-    // scopes the target to the incremental diff. The "issues must be NEW
-    // since the last Pullfrog review" filter lives at aggregation time
-    // (step 6), NOT in the subagent prompt — pushing the filter into
+    // IncrementalReview shares Review's 0-or-2+ lens pattern but scopes the
+    // target to the incremental diff. The "issues must be NEW since the last
+    // Pullfrog review" filter lives at aggregation time (step 8), NOT in the
+    // subagent prompt — pushing the filter into
     // subagents matches the canonical anneal anti-pattern of "list known
     // pre-existing failures — don't flag these" and suppresses signal on
     // regressions the new commits amplified. The review body is just
@@ -687,38 +760,57 @@ ${PR_SUMMARY_FORMAT}`
 3. **incremental scope**: if \`incrementalDiffPath\` is present, read it to see what changed since the last review. this is a range-diff that isolates the net changes, filtering out base branch noise. if not present, fall back to reviewing the full PR diff and determine what changed since Pullfrog's most recent review.
-4. **prior feedback**: fetch previous reviews via \`${t("list_pull_request_reviews")}\`. for the most recent Pullfrog review, call \`${t("get_review_comments")}\` with the review ID to retrieve specific prior line-level feedback. you'll use this to filter your aggregation in step 6 \u2014 anything already flagged in a prior review and not changed by the new commits should not be re-raised. you do NOT need to render this in the review body; the rolling PR summary snapshot is the durable record of what's been addressed.
+4. **prior feedback**: fetch previous reviews via \`${t("list_pull_request_reviews")}\`. for the most recent Pullfrog review, call \`${t("get_review_comments")}\` with the review ID to retrieve specific prior line-level feedback. you'll use this to filter your aggregation in step 8 \u2014 anything already flagged in a prior review and not changed by the new commits should not be re-raised. you do NOT need to render this in the review body; the rolling PR summary snapshot is the durable record of what's been addressed.
-5. **triage & fan out**: orient on the *incremental* changes \u2014 domain, seams, external contracts, user-facing surfaces.
+5. **triage**: orient on the *incremental* changes \u2014 domain, seams, external contracts, user-facing surfaces. pull as much context as you need to render a confident review: read related files, grep for callers of changed symbols, check tests that exercise the touched paths. **you are the synthesizer.**
-   if the incremental changes are **genuinely trivial**, skip the fan-out entirely and jump to step 8's non-substantive path (do NOT submit a review).
+   if the incremental changes are **genuinely trivial**, skip the fan-out entirely and jump to step 10's non-substantive path (do NOT submit a review).
    "Genuinely trivial" (skip): formatting/comment tweaks, import reordering, lockfile regen, mechanical rename of import paths, whitespace-only.
    "Looks trivial but isn't" (do NOT skip \u2014 same anti-patterns as Review mode): 1-line changes to SQL/regex/auth/billing/permissions/signature-verification code; flipping feature-flag defaults or retry/timeout constants; money/tax/HTTP-method/redirect changes; tightening or loosening a comparison operator; mixed diffs with a semantic line buried in formatting.
    When unsure, treat as non-trivial.
-   otherwise pick lenses by where the new commits concentrate risk \u2014 **there's no fixed count**, same calibration as Review mode (1 lens for pure refactor / isolated fix; 2\u20133 for typical features; 4\u20135 for high-stakes subsystem touches; 6+ is a smell). lens framing follows Review mode: themed lenses (correctness & invariants, impact when new commits remove/rename/deprecate things, research-validated assumptions, security, user-journey, operational readiness, integration & cross-cutting, test integrity, performance, holistic) and subsystem lenses (auth, billing, schema migration, etc.) \u2014 for high-stakes domains lead with the subsystem lens rather than the generic themed equivalent.
+6. **lens decision \u2014 0 or 2+, NEVER 1**.
+   The default is **0 lenses**: handle the re-review yourself end-to-end. Most incremental reviews land here \u2014 especially thread-reply re-reviews where the user is asking "did you address X?" rather than "review the diff again."
-   dispatch one \`${REVIEWER_AGENT_NAME}\` subagent per lens \u2014 its baked-in system prompt enforces the non-mutative + non-recursive contract (read-only file/search/web tools and read-only MCP queries; no writes, shell side effects, state-changing MCP calls, or nested subagent dispatch). dispatch them in a **single assistant turn with multiple parallel subagent calls** (serial dispatch collapses the fan-out). if a subagent errors out, times out, or returns nothing usable, retry once with the same lens; if it still fails, proceed with partial coverage and note the missing lens in the review body \u2014 do not skip step 5 entirely on a single subagent failure. each subagent gets:
-   - the diff scope (incremental diff path if available, full diff otherwise). do NOT tell them to skip pre-existing issues \u2014 that suppresses regressions the new commits amplified; the "issues must be NEW" filter lives at aggregation time (step 6), not in the subagent prompt
+   Dispatch **2+ \`${REVIEWER_AGENT_NAME}\` lenses in parallel** ONLY when ALL of the following are true:
+   - the incremental changes are substantive (>5 files changed AND >200 net new lines), OR touch a high-stakes subsystem (auth, billing, payments, schema migration, webhooks, secrets, RBAC, multi-tenant isolation, cron/scheduling)
+   - you can name 2+ distinct concrete failure modes the new commits plausibly introduce that warrant independent lenses
+   - parallel-orchestrated independent perspectives meaningfully outperform what you'd find solo
+   **NEVER dispatch exactly one lens.** Single-lens dispatch adds wall time and cost for no orthogonality benefit. Either go multi-lens (\u22652 in parallel) or do the re-review yourself.
+   Lens framing follows Review mode: themed lenses (correctness, security, etc.) and subsystem lenses (auth, billing, schema-migration, etc.) \u2014 for high-stakes domains lead with the subsystem lens.
+7. **fan out (only if step 6 said 2+ lenses)**: dispatch every \`${REVIEWER_AGENT_NAME}\` subagent for this run **IN A SINGLE ASSISTANT TURN, AS MULTIPLE PARALLEL TASK TOOL_USE BLOCKS IN ONE MESSAGE.**
+   \u26A0\uFE0F  CRITICAL \u2014 PARALLELISM IS THE ONLY REASON LENSES EXIST. \u26A0\uFE0F
+   Default tool-call behavior is **serial dispatch**: emit one Task call, await result, emit next, await, etc. This collapses your fan-out into a sequential review whose wall time is N \xD7 (orchestrator-think-time + lens-execution-time). **YOU MUST OVERRIDE THIS DEFAULT.** Emit ALL of your Task tool_use blocks in the SAME assistant message, BEFORE you read ANY result from ANY of them.
+   \u2705 Right pattern: one assistant turn with N Task tool_use blocks \u2192 wait \u2192 N results arrive together \u2192 aggregate.
+   \u274C Wrong pattern: turn 1 = Task(lens A) \u2192 turn 2 (after A's result) = Task(lens B). This is the failure mode.
+   You can also include your own \`read\` / \`grep\` / \`webfetch\` calls in the SAME turn as the parallel \`${REVIEWER_AGENT_NAME}\` dispatches.
+   if a subagent errors out, times out, or returns nothing usable, retry once with the same lens; if it still fails, proceed with partial coverage and note the missing lens in the review body. each subagent gets:
+   - the diff scope (incremental diff path if available, full diff otherwise). do NOT tell them to skip pre-existing issues \u2014 that suppresses regressions the new commits amplified; the "issues must be NEW" filter lives at aggregation time (step 8), not in the subagent prompt
    - **only one lens** \u2014 never a multi-section "review for X, Y, and Z" prompt
-   - **a Task \`description\` set to the lens name** (e.g. \`"security"\`, \`"correctness"\`, \`"billing-subsystem"\`) \u2014 the harness reads this field to label the subagent's log lines so parallel runs can be told apart in CI output. without it, every subagent shows up as \`subagent#N\`.
-   - the read-only contract restated in your dispatch instructions so the rule is present twice (the subagent's system prompt also enforces it). The test: would this call still be a no-op if reverted? If not (PR comments, branch pushes, issue updates, set_output, label changes, dependency installs, etc.), don't make it.
-   - if the lens touches external contracts, instruct the subagent to verify load-bearing claims via web search and quote source URLs. action runs are non-interactive \u2014 there's no human to catch "I'm pretty sure Stripe does X."
+   - **a Task \`description\` set to the lens name** \u2014 the harness reads this field to label log lines so parallel runs can be told apart.
+   - if the lens touches external contracts, instruct the subagent to verify load-bearing claims via web search and quote source URLs.
    - ask the subagent to report findings with file paths and NEW line numbers from the full PR diff so you can anchor inline comments.
    delegation discipline:
-   - do NOT lens-review the diff yourself in parallel with the subagents
    - do NOT summarize the changes for them (biases toward validation frame)
    - do NOT hand them a curated reading list (let them discover scope)
    - do NOT pre-shape their output with a finding schema
    - do NOT mention the other lenses (independence is the point)
-6. **aggregate, draft, self-critique**: merge findings; de-dup overlaps; trace each finding yourself. drop praise, style preferences, speculative/unverified claims, findings about pre-existing code unrelated to the new commits, anything not actionable, and anything that re-states prior review feedback (heuristic: if the finding's root cause lives in lines the *new commits* added or modified, it's in scope; otherwise drop). also drop **bloat-shaped findings** \u2014 proposed fixes that would add defensive checks for cases that can't happen, abstractions used once, comments restating obvious code, tests asserting tautologies, or "just-in-case" guards. subagents are fallible and bias toward recommending changes; the bar for an actionable inline comment is sound + correct + elegant. recommending a change that improves only one of the three (or degrades elegance to nominally improve correctness) makes the codebase worse, not better. To compute "lines the new commits added or modified": if \`incrementalDiffPath\` from step 2 is present, use it directly. Otherwise, take the prior Pullfrog review's \`commit_id\` (returned alongside each entry from \`${t("list_pull_request_reviews")}\` in step 4) and run \`git diff <prior-review-sha>..HEAD\` to isolate the lines added since that review. draft inline comments with NEW line numbers from the full PR diff \u2014 every comment must be actionable, 2-3 sentences max.
+8. **aggregate, draft, self-critique**: merge findings (yours + any subagent output if you went multi-lens); de-dup overlaps; trace each finding yourself. drop praise, style preferences, speculative/unverified claims, findings about pre-existing code unrelated to the new commits, anything not actionable, and anything that re-states prior review feedback (heuristic: if the finding's root cause lives in lines the *new commits* added or modified, it's in scope; otherwise drop). also drop **bloat-shaped findings** \u2014 proposed fixes that would add defensive checks for cases that can't happen, abstractions used once, comments restating obvious code, tests asserting tautologies, or "just-in-case" guards. subagents are fallible and bias toward recommending changes; the bar for an actionable inline comment is sound + correct + elegant. recommending a change that improves only one of the three (or degrades elegance to nominally improve correctness) makes the codebase worse, not better. To compute "lines the new commits added or modified": if \`incrementalDiffPath\` from step 2 is present, use it directly. Otherwise, take the prior Pullfrog review's \`commit_id\` (returned alongside each entry from \`${t("list_pull_request_reviews")}\` in step 4) and run \`git diff <prior-review-sha>..HEAD\` to isolate the lines added since that review. draft inline comments with NEW line numbers from the full PR diff \u2014 every comment must be actionable, 2-3 sentences max.
-7. **build the review body** \u2014 a single "Reviewed changes" section: summarize at the logical-change level, not per-file. each bullet starts with a past-tense verb (e.g. \`- Extracted shared CLI runtime into a single module\`, \`- Renamed package to pullfrog\`). avoid file paths unless they add clarity. if the changes can be described in one sentence, use one sentence \u2014 no bullets needed. do NOT include a separate "Prior review feedback" checklist; that's tracked in the rolling PR summary snapshot for the next agent run, and surfacing it in the user-facing body is noise (changes that addressed prior feedback are already covered by the Reviewed-changes bullets). in some cases you may receive a complete diff for the whole pull request instead of an incremental one \u2014 when this happens, you will need to determine what changes have happened since Pullfrog's most recent review.
+9. **build the review body** \u2014 a single "Reviewed changes" section: summarize at the logical-change level, not per-file. each bullet starts with a past-tense verb (e.g. \`- Extracted shared CLI runtime into a single module\`, \`- Renamed package to pullfrog\`). avoid file paths unless they add clarity. if the changes can be described in one sentence, use one sentence \u2014 no bullets needed. do NOT include a separate "Prior review feedback" checklist; that's tracked in the rolling PR summary snapshot for the next agent run, and surfacing it in the user-facing body is noise (changes that addressed prior feedback are already covered by the Reviewed-changes bullets). in some cases you may receive a complete diff for the whole pull request instead of an incremental one \u2014 when this happens, you will need to determine what changes have happened since Pullfrog's most recent review.
-8. Submit \u2014 every run must end with EXACTLY ONE of \`${t("create_pull_request_review")}\` (substantive review) or \`${t("report_progress")}\` (no-review acknowledgement). do NOT call \`create_issue_comment\` for review output.
+10. Submit \u2014 every run must end with EXACTLY ONE of \`${t("create_pull_request_review")}\` (substantive review) or \`${t("report_progress")}\` (no-review acknowledgement). do NOT call \`create_issue_comment\` for review output.
    Same callout-intensity ladder as Review mode \u2014 \`[!CAUTION]\` (large red, "will break") \u2192 \`[!IMPORTANT]\` (large purple, "must address before merging") \u2192 \`[!NOTE]\` (small blue, "FYI") \u2192 no callout (plain text). And the same Fix-button lever: the footer renders a Fix button on every non-approving review, so \`approved: true\` suppresses it. Wrapping mergeable feedback in \`[!IMPORTANT]\` trains users to click Fix on reviews that don't need fixing \u2014 pick the tier the author's actual next action justifies.

package/dist/models.d.ts CHANGED Viewed

@@ -4,6 +4,21 @@
  * slugs use the format `provider/model-id` (e.g. "anthropic/claude-opus").
  * bump `resolve` when a new model generation ships — the alias (slug) stays stable.
  */
+/**
+ * routing discriminant for entries whose `resolve` is dynamic — looked up
+ * from a separate env var at run time rather than fixed in the catalog.
+ *
+ * `"bedrock"` means the actual model ID comes from `BEDROCK_MODEL_ID`
+ * (an AWS-canonical Bedrock model ID like `us.anthropic.claude-opus-4-7`
+ * or `amazon.nova-pro-v1:0`). enterprise Bedrock customers self-select for
+ * version control — silent alias bumps would break compliance review,
+ * model-access enrollment, and provisioned-throughput contracts. so the
+ * single `bedrock/byok` entry is a routing slug, not a model alias: the
+ * harness reads `BEDROCK_MODEL_ID` and routes to claude-code (when the ID
+ * has "anthropic" as a discrete dot-segment — see `isBedrockAnthropicId`)
+ * or opencode (everything else, with an `amazon-bedrock/` prefix).
+ */
+export type ModelRouting = "bedrock";
 export interface ModelAlias {
     /** stable alias stored in DB, e.g. "anthropic/claude-opus" */
     slug: string;
@@ -11,9 +26,9 @@ export interface ModelAlias {
     provider: string;
     /** human-readable name shown in dropdowns */
     displayName: string;
-    /** concrete models.dev specifier, e.g. "anthropic/claude-opus-4-6" */
+    /** concrete models.dev specifier, e.g. "anthropic/claude-opus-4-6". for routing entries (see `routing`) this holds a sentinel value that is never passed to a CLI directly. */
     resolve: string;
-    /** full models.dev specifier for the OpenRouter equivalent (undefined for free models) */
+    /** full models.dev specifier for the OpenRouter equivalent (undefined for free models and routing entries) */
     openRouterResolve: string | undefined;
     /** top-tier pick for this provider — preferred during auto-select */
     preferred: boolean;
@@ -21,6 +36,15 @@ export interface ModelAlias {
     isFree: boolean;
     /** slug of a replacement model — presence implies this model is deprecated */
     fallback: string | undefined;
+    /** dynamic-resolution discriminant — see ModelRouting docs */
+    routing: ModelRouting | undefined;
+    /** alias key (within same provider) of the cheaper sibling reviewfrog should
+     * use as its lens-fanout subagent. e.g. claude-opus → "claude-sonnet". */
+    subagentModel: string | undefined;
+    /** hide from selectable lists (UI dropdowns, CLI pickers). does NOT affect
+     * resolution — for that use `fallback`. used for internal-only tier targets
+     * (e.g. gpt-5.4 as a subagent target without exposing it to users). */
+    hidden: boolean;
 }
 interface ModelDef {
     displayName: string;
@@ -33,6 +57,13 @@ interface ModelDef {
     isFree?: boolean;
     /** slug of a replacement model — presence implies this model is deprecated */
     fallback?: string;
+    /** dynamic-resolution discriminant — see ModelRouting docs */
+    routing?: ModelRouting;
+    /** alias key (within same provider) of the cheaper sibling reviewfrog should
+     * use as its lens-fanout subagent (e.g. claude-opus → "claude-sonnet"). */
+    subagentModel?: string;
+    /** hide from selectable lists. does NOT affect resolution; for that use `fallback`. */
+    hidden?: boolean;
 }
 export interface ProviderConfig {
     displayName: string;
@@ -47,6 +78,7 @@ export declare const providers: {
     deepseek: ProviderConfig;
     moonshotai: ProviderConfig;
     opencode: ProviderConfig;
+    bedrock: ProviderConfig;
     openrouter: ProviderConfig;
 };
 export type ModelProvider = keyof typeof providers;
@@ -67,7 +99,7 @@ export declare function resolveModelSlug(slug: string): string | undefined;
  * use this in UI display sites (dropdown trigger labels, PR-comment footers,
  * etc.) so a deprecated stored slug renders as the model the user actually
  * runs against — not the historical name. selectable lists should still hide
- * deprecated aliases by filtering on `!a.fallback`.
+ * deprecated and internal-only aliases by filtering on `!a.fallback && !a.hidden`.
  */
 export declare function resolveDisplayAlias(slug: string): ModelAlias | undefined;
 /**
@@ -83,4 +115,32 @@ export declare function resolveCliModel(slug: string): string | undefined;
  * (e.g. free opencode models).
  */
 export declare function resolveOpenRouterModel(slug: string): string | undefined;
+/** env var that supplies the Bedrock model ID for the `bedrock/byok` slug. */
+export declare const BEDROCK_MODEL_ID_ENV = "BEDROCK_MODEL_ID";
+/**
+ * the Bedrock model ID passed to claude-code or opencode is whatever the
+ * user set in `BEDROCK_MODEL_ID` — Pullfrog never resolves or upgrades it.
+ * we route by checking whether the ID names an Anthropic model: claude-code
+ * handles Anthropic-on-Bedrock natively (with `CLAUDE_CODE_USE_BEDROCK=1`),
+ * everything else goes through opencode's `amazon-bedrock` provider.
+ *
+ * AWS Bedrock IDs come in two shapes:
+ *   - dotted foundation IDs: `us.anthropic.claude-opus-4-7`,
+ *     `anthropic.claude-haiku-4-5-20251001-v1:0`, `amazon.nova-pro-v1:0`,
+ *     `meta.llama4-scout-17b-instruct-v1:0`. AWS-published, lowercase, the
+ *     foundation provider always appears as a discrete dot-segment.
+ *   - inference-profile ARNs: `arn:aws:bedrock:us-east-2:<acct>:application-inference-profile/<user-name>`.
+ *     `<user-name>` is operator-chosen, so a naive substring check is fragile
+ *     in both directions (Anthropic profile named without "anthropic" → routes
+ *     to opencode and misses CLAUDE_CODE_USE_BEDROCK; non-Anthropic profile
+ *     whose name happens to contain "anthropic" → routes to claude-code).
+ *
+ * we anchor on a discrete dot-segment match (case-insensitive). this catches
+ * every published foundation ID and is conservative for ARN-form IDs: ARN
+ * names that don't include "anthropic" as their own dot-segment route to
+ * opencode by default. operators using ARN-form IDs whose backing model is
+ * Anthropic should set `PULLFROG_AGENT=claude` to force the right route, or
+ * include the foundation segment in the profile name.
+ */
+export declare function isBedrockAnthropicId(bedrockModelId: string): boolean;
 export {};

package/dist/utils/agent.d.ts CHANGED Viewed

@@ -6,8 +6,11 @@ import type { Agent } from "../agents/index.ts";
  *   1. PULLFROG_MODEL env var — resolved through the alias registry first,
  *      so values like "anthropic/claude-opus" become "anthropic/claude-opus-4-7".
  *      raw specifiers (e.g. "anthropic/claude-opus-4-6") pass through unchanged.
- *   2. slug from repo config / payload → alias registry
- *   3. undefined — agent will auto-select
+ *      always wins — bypasses Bedrock routing entirely. to test a different
+ *      Bedrock model, change `BEDROCK_MODEL_ID`, not `PULLFROG_MODEL`.
+ *   2. slug from repo config / payload → alias registry. routing slugs
+ *      (e.g. `bedrock/byok`) defer to a separate env var (`BEDROCK_MODEL_ID`).
+ *   3. undefined — agent will auto-select.
  */
 export declare function resolveModel(ctx: {
     slug?: string | undefined;

package/dist/utils/apiKeys.d.ts CHANGED Viewed

@@ -8,3 +8,21 @@ export declare function validateAgentApiKey(params: {
     owner: string;
     name: string;
 }): void;
+/**
+ * Detect agent-runtime auth failures that should be reformatted as an actionable
+ * key-fix CTA before being shown to the user. Covers the two shapes we see:
+ *   - missing key (validateAgentApiKey throw): contains MISSING_KEY_MARKER
+ *   - revoked / invalid key (Claude CLI 401 surfaced via api_error_status):
+ *     "Invalid API key · Fix external API key" + similar provider variants
+ */
+export declare function isApiKeyAuthError(text: string): boolean;
+/**
+ * Friendly Markdown summary for both the missing-key and invalid-key cases.
+ * Used in the catch / result-failure paths in `main.ts` to overwrite the raw
+ * agent error before it's posted to the PR progress comment.
+ */
+export declare function formatApiKeyErrorSummary(params: {
+    owner: string;
+    name: string;
+    raw: string;
+}): string;

package/dist/utils/instructions.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 import { type AgentId } from "../external.ts";
 import type { Mode } from "../modes.ts";
 import type { ResolvedPayload } from "./payload.ts";
+import type { LearningsHeading } from "./runContext.ts";
 import type { RunContextData } from "./runContextData.ts";
 interface InstructionsContext {
     payload: ResolvedPayload;
@@ -12,6 +13,10 @@ interface InstructionsContext {
      * couldn't be seeded for some reason. main.ts always seeds, so in
      * practice this is always set; the null case keeps the type honest. */
     learningsFilePath: string | null;
+    /** server-parsed TOC for the body of the learnings tmpfile. rendered
+     * inline into the LEARNINGS prompt section so the agent can `read_file`
+     * targeted line ranges instead of pulling the whole file into context. */
+    learningsHeadings: LearningsHeading[];
 }
 export interface ResolvedInstructions {
     full: string;
@@ -21,5 +26,19 @@ export interface ResolvedInstructions {
     event: string;
     runtime: string;
 }
+/** render the heading list as an indented bullet TOC. ranges shown in
+ * parentheses (`(L3-L18)`); the start line is always the heading line
+ * itself, so reading the listed range gives the agent the heading +
+ * body together. shallowest heading depth in the body sits at the root
+ * column; deeper levels indent by `(depth - rootDepth) * 2` spaces. */
+export declare function renderLearningsToc(headings: LearningsHeading[]): string;
+/** assemble the LEARNINGS prompt section: file path + intro + either
+ * the rendered heading TOC (when the body has structure) or a no-headings
+ * affordance pointing the agent at the reflection turn for restructuring.
+ * empty string when the seed step failed and there's no path to surface. */
+export declare function buildLearningsSection(ctx: {
+    filePath: string | null;
+    headings: LearningsHeading[];
+}): string;
 export declare function resolveInstructions(ctx: InstructionsContext): ResolvedInstructions;
 export {};