clementine-agent 1.18.35 → 1.18.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,7 +33,7 @@ import { searchSkills as searchSkillsSync } from './skill-extractor.js';
33
33
  import { classifyIntent, getStrategyGuidance } from './intent-classifier.js';
34
34
  import { getEventLog } from './session-event-log.js';
35
35
  import { applyServiceDedup, routeToolSurface, TOOL_SURFACE_HARD_LIMIT, TOOL_SURFACE_WARN_THRESHOLD } from './tool-router.js';
36
- import { isRestrictedToolset, toolsetAllowsLocalWrites } from './toolsets.js';
36
+ import { isRestrictedToolset, toolsetAllowsLocalWrites, toolsetDisablesAllTools } from './toolsets.js';
37
37
  import { looksLikeApprovalPrompt } from './local-turn.js';
38
38
  import { decideTurn } from './turn-policy.js';
39
39
  import { loadClementineJson } from '../config/clementine-json.js';
@@ -2011,7 +2011,9 @@ You have a cost budget per message — not a hard turn limit. Work until the tas
2011
2011
  async buildOptions(opts = {}) {
2012
2012
  const { isHeartbeat = false, cronTier = null, maxTurns = null, model = null, enableTeams = true, retrievalContext = '', profile = null, sessionKey = null, streaming = false, isPlanStep = false, isUnleashed = false, sourceOverride, disableAllTools = false, verboseLevel, abortController, effort, maxBudgetUsd, toolScopeText, thinking, outputFormat, stallGuard, intentClassification, turnPolicy, contextRoutingText, toolset = 'auto', } = opts;
2013
2013
  const isCron = cronTier !== null;
2014
- const toolsDisabledForCall = disableAllTools || (isHeartbeat && !isCron);
2014
+ const toolsDisabledForCall = disableAllTools
2015
+ || (isHeartbeat && !isCron)
2016
+ || toolsetDisablesAllTools(toolset);
2015
2017
  const promptScopeText = toolScopeText ?? '';
2016
2018
  const profileScopeText = [profile?.description, profile?.systemPromptBody]
2017
2019
  .filter(Boolean)
@@ -85,6 +85,7 @@ export function buildAlwaysOnParallelizationHint() {
85
85
  return [
86
86
  '## Sub-agent fan-out',
87
87
  'When you process multiple items, spawn ONE Agent sub-agent per batch of 3–5 items. Sub-agents return ONE-LINE summaries (no raw tool output). Do not iterate sequentially in this conversation — that fills your context and aborts the run.',
88
+ 'Cost: pass `model: "haiku"` to Agent for routine extraction, summarization, or per-item lookups. Use Sonnet only when the sub-agent must reason across many sources or write something durable.',
88
89
  ].join('\n');
89
90
  }
90
91
  /**
@@ -109,6 +110,7 @@ export function buildFanoutDirective(signals) {
109
110
  'Use the `Agent` tool to spawn parallel sub-agents. Each sub-agent runs in its own isolated context, so big tool responses live and die there — your context only sees the summary.',
110
111
  '',
111
112
  '- **Batch size**: 3–5 items per sub-agent (or one slice of work per sub-agent for research tasks)',
113
+ '- **Sub-agent model**: pass `model: "haiku"` to the Agent tool by default — sub-agents that just extract fields, summarize a single email, or pull a single record do not need Sonnet. Reserve Sonnet for sub-agents that must reason across multiple sources or write something durable.',
112
114
  '- **Sub-agent prompt MUST include**: the narrow task, the exact return format (e.g. `Return ONE LINE: <id> | <status> | <next-action>`), and an explicit "do not include raw tool output" directive',
113
115
  '- **Parent context keeps**: only the sub-agent return strings, not their tool transcripts',
114
116
  '',
@@ -1,4 +1,4 @@
1
- export type ToolsetName = 'auto' | 'safe' | 'diagnostic' | 'communications' | 'memory' | 'full';
1
+ export type ToolsetName = 'auto' | 'safe' | 'diagnostic' | 'communications' | 'memory' | 'none' | 'full';
2
2
  export interface ToolsetPreset {
3
3
  name: ToolsetName;
4
4
  label: string;
@@ -11,4 +11,10 @@ export declare function getToolsetPreset(name: ToolsetName): ToolsetPreset;
11
11
  export declare function formatToolsetChoices(): string;
12
12
  export declare function isRestrictedToolset(name: ToolsetName): boolean;
13
13
  export declare function toolsetAllowsLocalWrites(name: ToolsetName): boolean;
14
+ /**
15
+ * "none" toolset: not just restricted, but actively disables ALL tools
16
+ * including the core Clementine MCP server. Used by builder/JSON-gen
17
+ * chats where tool schemas in the system prompt are pure cost overhead.
18
+ */
19
+ export declare function toolsetDisablesAllTools(name: ToolsetName): boolean;
14
20
  //# sourceMappingURL=toolsets.d.ts.map
@@ -29,6 +29,12 @@ export const TOOLSET_PRESETS = [
29
29
  description: 'Memory, transcript, and relationship tools only unless explicitly changed.',
30
30
  directive: 'Toolset memory: use memory_read, memory_search, memory_recall, transcript_search, working_memory, and user_model. Avoid external integrations and local shell/file writes.',
31
31
  },
32
+ {
33
+ name: 'none',
34
+ label: 'None',
35
+ description: 'No tools at all — pure-LLM conversation. Used by builders and JSON-generating chats where tool schemas are dead weight in the system prompt.',
36
+ directive: 'Toolset none: do not call any tools. Respond from the prompt context only. Generate JSON, summaries, or text directly.',
37
+ },
32
38
  {
33
39
  name: 'full',
34
40
  label: 'Full',
@@ -60,9 +66,17 @@ export function formatToolsetChoices() {
60
66
  .join('\n');
61
67
  }
62
68
  export function isRestrictedToolset(name) {
63
- return name === 'safe' || name === 'diagnostic' || name === 'memory';
69
+ return name === 'safe' || name === 'diagnostic' || name === 'memory' || name === 'none';
64
70
  }
65
71
  export function toolsetAllowsLocalWrites(name) {
66
72
  return name === 'auto' || name === 'full';
67
73
  }
74
/**
 * Identifies the "none" toolset, which goes further than the restricted
 * presets: it switches off ALL tools, including the core Clementine MCP
 * server. Builder / JSON-generation chats use it because tool schemas in
 * the system prompt are pure cost overhead for them.
 *
 * @param name Toolset preset name to classify.
 * @returns true only when `name` is exactly 'none'.
 */
export function toolsetDisablesAllTools(name) {
    const isNoneToolset = name === 'none';
    return isNoneToolset;
}
68
82
  //# sourceMappingURL=toolsets.js.map
@@ -8053,6 +8053,12 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
8053
8053
  }
8054
8054
  try {
8055
8055
  const gateway = await getGateway();
8056
+ // Builder generates JSON artifacts — no tool calls. Pin the session
8057
+ // toolset to 'none' so buildOptions strips all MCP servers and tool
8058
+ // schemas from the system prompt. Without this, every tiny builder
8059
+ // turn writes 60–280 KB of cache_creation for tool schemas the
8060
+ // model never uses.
8061
+ gateway.setSessionToolset(sessionKey, 'none');
8056
8062
  const response = await gateway.handleMessage(sessionKey, enrichedMessage);
8057
8063
  // Parse any json-artifact blocks from the response
8058
8064
  let artifact = null;
@@ -8180,6 +8186,9 @@ If the tool returns nothing or errors, return an empty array \`[]\`.`,
8180
8186
  try {
8181
8187
  writeEvent('progress', { status: 'thinking…' });
8182
8188
  const gateway = await getGateway();
8189
+ // Builder generates JSON artifacts — no tool calls. Pin to 'none'
8190
+ // toolset so the SDK system prompt drops the tool inventory.
8191
+ gateway.setSessionToolset(sessionKey, 'none');
8183
8192
  let lastText = '';
8184
8193
  const response = await gateway.handleMessage(sessionKey, enrichedMessage, async (text) => {
8185
8194
  lastText = text ?? '';
@@ -171,7 +171,15 @@ export class AgentHeartbeatScheduler {
171
171
  catch {
172
172
  signals.latestGoalUpdate = '';
173
173
  }
174
- // 3. Latest cron run for any of this agent's crons (file mtime is enough)
174
+ // 3. Latest cron run for any of this agent's crons (file mtime is enough).
175
+ //
176
+ // INFO ONLY — kept in `signals` for the LLM prompt context, but NOT
177
+ // included in the fingerprint hash below. A cron firing is expected
178
+ // background activity, not a "wake up Sonnet" signal. Including it
179
+ // caused every cron run to bump the fingerprint → fire a $1+ Sonnet
180
+ // pass that just confirmed "yep, the cron ran, nothing else to do."
181
+ // Actionable wake-ups are pendingTasks growing and goal-state changes;
182
+ // those still trip the fingerprint below.
175
183
  try {
176
184
  const runsDir = path.join(this.baseDir, 'cron', 'runs');
177
185
  let latestMs = 0;
@@ -195,8 +203,15 @@ export class AgentHeartbeatScheduler {
195
203
  catch {
196
204
  signals.latestCronRunMs = 0;
197
205
  }
206
+ // Fingerprint only includes ACTIONABLE signals. latestCronRunMs is
207
+ // info-only and explicitly excluded.
208
+ const fingerprintSource = {
209
+ slug: signals.slug,
210
+ pendingTasks: signals.pendingTasks,
211
+ latestGoalUpdate: signals.latestGoalUpdate,
212
+ };
198
213
  const fingerprint = createHash('sha1')
199
- .update(JSON.stringify(signals))
214
+ .update(JSON.stringify(fingerprintSource))
200
215
  .digest('hex')
201
216
  .slice(0, 16);
202
217
  return { fingerprint, signals };
@@ -23,7 +23,7 @@ import { listBackgroundTasks, markDone as markBgTaskDone, markFailed as markBgTa
23
23
  import { outcomeStatusFromGoalDisposition, recentDecisions, recordDecisionOutcome, } from '../agent/proactive-ledger.js';
24
24
  import { formatCreditBlock, getBackgroundCreditBlock, isCreditBalanceError, markBackgroundCreditBlocked, } from './credit-guard.js';
25
25
  import { isRunHealthFailure } from './job-health.js';
26
- import { analyzeLongTaskPreflight, compactLongTaskPreflight, formatLongTaskPromptPrefix, } from './long-task-preflight.js';
26
+ import { analyzeLongTaskPreflight, compactLongTaskPreflight, shouldDowngradeUnleashed, formatLongTaskPromptPrefix, } from './long-task-preflight.js';
27
27
  const logger = pino({ name: 'clementine.cron' });
28
28
  /** Default timeout for standard cron jobs (10 minutes). */
29
29
  const CRON_STANDARD_TIMEOUT_MS = 10 * 60 * 1000;
@@ -1061,6 +1061,34 @@ export class CronScheduler {
1061
1061
  // Sonnet runs every job by default. Opus 1M is opt-in: set
1062
1062
  // `model: claude-opus-4-7[1m]` in CRON.md per-job, or flip
1063
1063
  // CLEMENTINE_1M_CONTEXT_MODE=on for global enable.
1064
+ // ── Auto-downgrade unleashed → standard ────────────────────────
1065
+ // CRON.md `mode: unleashed` is a CEILING, not a floor. If the
1066
+ // job's history shows it's a quiet probe that completes in 1
1067
+ // phase with __NOTHING__ or short output, the multi-phase
1068
+ // wrapper is wasteful overhead — each phase is a fresh SDK
1069
+ // query with full system prompt + tool schemas in cache_creation.
1070
+ // For a "did anything new come in?" cron firing every 2 hours,
1071
+ // that's 12+ unleashed runs/day at ~$1/each instead of standard
1072
+ // mode at ~$0.05/each.
1073
+ if (job.mode === 'unleashed') {
1074
+ const downgrade = shouldDowngradeUnleashed(this.runLog.readRecent(job.name, 5));
1075
+ if (downgrade.downgrade) {
1076
+ job = { ...job, mode: 'standard' };
1077
+ logger.info({
1078
+ job: job.name,
1079
+ reason: downgrade.reason,
1080
+ quietRatio: downgrade.quietRatio,
1081
+ avgDurationMs: downgrade.avgDurationMs,
1082
+ }, 'Cron mode downgraded unleashed → standard based on run history');
1083
+ this.logAutonomy('mode_downgrade', job, {
1084
+ from: 'unleashed',
1085
+ to: 'standard',
1086
+ reason: downgrade.reason,
1087
+ quietRatio: downgrade.quietRatio,
1088
+ avgDurationMs: downgrade.avgDurationMs,
1089
+ });
1090
+ }
1091
+ }
1064
1092
  let longTaskPreflight;
1065
1093
  const preflight = analyzeLongTaskPreflight(job, jobPrompt, this.runLog.readRecent(job.name, 5));
1066
1094
  if (preflight.risk !== 'normal') {
@@ -15,6 +15,15 @@ export interface LongTaskPreflightOptions {
15
15
  opusModel?: string;
16
16
  sonnetModel?: string;
17
17
  }
18
+ export interface UnleashedDowngradeDecision {
19
+ downgrade: boolean;
20
+ reason: string;
21
+ /** Quiet ratio observed in the sample (for telemetry). */
22
+ quietRatio?: number;
23
+ /** Average duration of recent runs in ms (for telemetry). */
24
+ avgDurationMs?: number;
25
+ }
26
+ export declare function shouldDowngradeUnleashed(recentRuns: CronRunEntry[], now?: number): UnleashedDowngradeDecision;
18
27
  export declare function analyzeLongTaskPreflight(job: CronJobDefinition, prompt: string, recentRuns?: CronRunEntry[], opts?: LongTaskPreflightOptions): LongTaskPreflightDecision;
19
28
  export declare function formatLongTaskPromptPrefix(decision: LongTaskPreflightDecision): string;
20
29
  export declare function compactLongTaskPreflight(decision: LongTaskPreflightDecision): LongTaskPreflightSnapshot;
@@ -48,6 +48,88 @@ function recentContextFailures(recentRuns, now = Date.now()) {
48
48
  }
49
49
  return [...new Set(reasons)];
50
50
  }
51
// ── Auto-downgrade unleashed → standard for quiet/probe jobs ──────────
//
// `mode: unleashed` wraps a job in multi-phase machinery: each phase is a
// fresh SDK query with the full system prompt + tool schemas, and the
// orchestrator chains phases until TASK_COMPLETE or max-phases. That
// machinery is essential for genuinely-long tasks (sasha briefs, market
// outreach), but it's pure overhead on quiet probe jobs that finish in
// 1 phase with `__NOTHING__` or a short output.
//
// Detect that pattern from history and downgrade the next run to
// standard mode. Single SDK call, single cache write, fraction of the
// cost. The user's CRON.md `mode: unleashed` becomes a "ceiling" rather
// than a forced floor — actual mode chosen dynamically per-run.
//
// Conservative by design:
//   - requires 3+ settled (ok/error) runs among the first 5 history
//     entries (assumed newest-first — confirm against the run log reader);
//   - refuses to downgrade if ANY of those first 5 entries, whatever its
//     status, recently hit a context overflow (the unleashed wrapper
//     might be actively saving us);
//   - a run only counts as "quiet" when it produced short / __NOTHING__
//     output AND is not known to have run long, so a long job that merely
//     ends with a one-line summary keeps its unleashed wrapper.

/** How many leading history entries are inspected. */
const UNLEASHED_DOWNGRADE_SAMPLE_SIZE = 5;
/** Minimum number of settled (ok/error) runs needed before deciding. */
const UNLEASHED_DOWNGRADE_MIN_HISTORY = 3;
/** Share of settled runs that must be quiet to trigger the quiet pattern. */
const UNLEASHED_DOWNGRADE_QUIET_RATIO = 0.6;
/** Fast pattern: every run must be strictly shorter than this. */
const UNLEASHED_DOWNGRADE_MAX_DURATION_MS = 90_000;
/** Fast pattern: the mean run duration must be strictly below this. */
const UNLEASHED_DOWNGRADE_AVG_DURATION_MS = 60_000;
/** Output previews shorter than this many characters count as "short". */
const UNLEASHED_DOWNGRADE_QUIET_PREVIEW_CHARS = 200;
/**
 * Quiet pattern: a run known to have taken longer than this is never
 * counted as quiet, however short its output. Half of the 10-minute
 * standard cron timeout (CRON_STANDARD_TIMEOUT_MS in the cron scheduler),
 * so a downgraded job keeps headroom instead of being moved straight into
 * a timeout. The exact value is a judgement call.
 */
const UNLEASHED_DOWNGRADE_QUIET_MAX_DURATION_MS = 5 * 60_000;
/** Marker a job emits when it found nothing to do (matched case-insensitively). */
const UNLEASHED_DOWNGRADE_NOTHING_MARKER = /__nothing__/i;

/**
 * Decides from run history whether an `unleashed` cron job should be run
 * in `standard` mode this time.
 *
 * @param recentRuns Run-log entries for the job; only the first
 *   UNLEASHED_DOWNGRADE_SAMPLE_SIZE are inspected. A missing or non-array
 *   value is treated as empty history.
 * @param now Reference time in epoch milliseconds, used for the
 *   context-overflow recency window. Defaults to the current time.
 * @returns `{ downgrade, reason }`, plus `quietRatio` when the quiet
 *   pattern matched or `avgDurationMs` when the fast-completion pattern
 *   matched.
 */
export function shouldDowngradeUnleashed(recentRuns, now = Date.now()) {
    // Tolerate absent history the same way the sibling preflight analysis
    // does (its recentRuns argument is optional).
    const recentWindow = (Array.isArray(recentRuns) ? recentRuns : [])
        .slice(0, UNLEASHED_DOWNGRADE_SAMPLE_SIZE);
    const sample = recentWindow.filter(r => r.status === 'ok' || r.status === 'error');
    if (sample.length < UNLEASHED_DOWNGRADE_MIN_HISTORY) {
        return { downgrade: false, reason: 'insufficient_history' };
    }
    // Refuse to downgrade if any recent run hit a context-window failure —
    // the unleashed multi-phase wrapper might be the only thing keeping
    // this job from thrashing on a single huge SDK query. The guard looks
    // at the whole inspected window, not just the ok/error sample, so an
    // overflow recorded under any other status still protects the job.
    const cutoff = now - RECENT_CONTEXT_FAILURE_WINDOW_MS;
    const hadOverflow = recentWindow.some(r => {
        const startedMs = Date.parse(r.startedAt);
        // Unparseable or stale timestamps are not evidence of a recent overflow.
        if (!Number.isFinite(startedMs) || startedMs < cutoff)
            return false;
        return r.terminalReason === 'rapid_refill_breaker'
            || r.terminalReason === 'prompt_too_long';
    });
    if (hadOverflow) {
        return { downgrade: false, reason: 'recent_context_overflow_protect_unleashed' };
    }
    // Quiet pattern: most recent runs returned __NOTHING__ or a short
    // output, and none of those is known to have run long. These jobs
    // don't need multi-phase orchestration.
    const quietCount = sample.filter(r => {
        const preview = (r.outputPreview ?? '').trim();
        // An empty preview is no evidence either way — do not count it.
        if (!preview)
            return false;
        // A long run that merely ends with a short summary is not a probe.
        // A missing/zero duration is unknown and does not disqualify the run.
        const durationMs = r.durationMs || 0;
        if (durationMs > UNLEASHED_DOWNGRADE_QUIET_MAX_DURATION_MS)
            return false;
        if (UNLEASHED_DOWNGRADE_NOTHING_MARKER.test(preview))
            return true;
        return preview.length < UNLEASHED_DOWNGRADE_QUIET_PREVIEW_CHARS;
    }).length;
    const quietRatio = quietCount / sample.length;
    if (quietRatio >= UNLEASHED_DOWNGRADE_QUIET_RATIO) {
        return {
            downgrade: true,
            reason: `quiet_pattern_${Math.round(quietRatio * 100)}pct`,
            quietRatio,
        };
    }
    // Fast-completion pattern: every run finishes well under the standard
    // cron timeout, average is short. Multi-phase wrapper is overhead.
    // Only applies when every sampled run has a positive recorded duration.
    const durations = sample.map(r => r.durationMs || 0).filter(d => d > 0);
    if (durations.length === sample.length) {
        const avgDuration = durations.reduce((a, b) => a + b, 0) / durations.length;
        const allFast = durations.every(d => d < UNLEASHED_DOWNGRADE_MAX_DURATION_MS);
        if (allFast && avgDuration < UNLEASHED_DOWNGRADE_AVG_DURATION_MS) {
            return {
                downgrade: true,
                reason: `fast_completion_avg_${Math.round(avgDuration / 1000)}s`,
                avgDurationMs: Math.round(avgDuration),
            };
        }
    }
    return { downgrade: false, reason: 'workload_warrants_unleashed' };
}
51
133
  function classifyRisk(args) {
52
134
  const { inputTokens, projectedTokens, signalCount, recentContextIssue, job, oneMillionAvailable } = args;
53
135
  if (inputTokens >= 185_000)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clementine-agent",
3
- "version": "1.18.35",
3
+ "version": "1.18.36",
4
4
  "description": "Clementine — Personal AI Assistant (TypeScript)",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",