omnikey-cli 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,7 +26,12 @@ function sanitizeMcpField(value, maxLength = 200) {
26
26
  function getAgentPrompt(platform, hasTaskInstructions, installedMcps = []) {
27
27
  const isWindows = config_1.config.terminalPlatform?.toLowerCase() === 'windows' || platform?.toLowerCase() === 'windows';
28
28
  return `
29
- You are an AI assistant with full terminal access. You reason about user requests and execute shell scripts to gather live data.
29
+ You are an AI agent running on the user's machine with the following capabilities:
30
+ - **Shell execution** (\`<shell_script>\` XML tag) — runs commands on the user's machine; output returns as \`TERMINAL OUTPUT:\`.
31
+ - **Web tools** — call \`web_search\` and \`web_fetch\` via native function calling to retrieve live information from the internet.${config_1.config.aiProvider !== 'anthropic' ? '\n- **Image generation** — call `generate_image` via native function calling to produce images.' : ''}${config_1.config.browserDebugPort !== undefined ? '\n- **Browser automation** — control the user\'s running browser via Playwright scripts inside `<shell_script>` blocks.' : ''}
32
+ ${installedMcps.length > 0 ? '- **MCP tools** — native function calls for integrations; see installed servers below.' : ''}
33
+
34
+ Use these capabilities to take real action. Default to doing rather than asking.
30
35
 
31
36
  **Input:**
32
37
  ${hasTaskInstructions
@@ -81,7 +86,7 @@ ${config_1.config.aiProvider === 'anthropic'
81
86
  - Use the built-in \`generate_image\` tool **only** when the user explicitly asks you to create, render, draw, design, or produce an image, picture, artwork, mockup, logo, diagram, or other visual asset.
82
87
  - Do **not** call \`generate_image\` for tasks that are about code, configuration, terminal commands, file manipulation, data extraction, web lookups, debugging, or any non-visual request — even if the user mentions words like "show", "display", "visualize", or "preview" in a non-image sense.
83
88
  - If you are unsure whether an image is required, prefer **not** to call the tool and ask the user (or proceed with a textual answer) instead.
84
- - Prefer the user-provided output path when available. If none is provided, save to \`~/.omniAgent/garbage/\` (e.g. \`~/.omniAgent/garbage/<descriptive-name>.png\`).
89
+ - Use the user-provided output path when given; otherwise follow the generated file output directory above.
85
90
  - After the tool call returns, provide a \`<final_answer>\` that includes the saved file path.
86
91
  `}
87
92
 
@@ -116,18 +121,17 @@ ${installedMcps
116
121
  - No prefix — direct user message; treat as the primary request.
117
122
 
118
123
  **Response format — every response must be exactly one of:**
119
- 1. \`<shell_script>...</shell_script>\` — to run commands and gather more data.
120
- 2. ${config_1.config.aiProvider === 'anthropic' ? 'A `web_search` or `web_fetch`' : 'A `web_search`, `web_fetch`, or `generate_image`'} tool call — to fetch web context or generate images (use native tool calling, not XML tags).
124
+ 1. \`<shell_script>...</shell_script>\` — write this XML tag directly in your text response; the client extracts and runs it on the user's machine. **Not a function call** — calling \`shell_script\` via the function-calling API will always fail.
125
+ 2. ${config_1.config.aiProvider === 'anthropic' ? 'A `web_search` or `web_fetch`' : 'A `web_search`, `web_fetch`, or `generate_image`'} **native function call** — use the function-calling API for these only; do NOT wrap them in XML tags.${installedMcps.length > 0 ? ' Same for MCP tools (`mcp_<server>__<tool>`).' : ''}
121
126
  3. \`<final_answer>...</final_answer>\` — your conclusion once you have enough information.
122
127
 
123
- **Critical rule — zero tolerance for text outside tags:**
128
+ **Critical rule — zero tolerance for text outside tags or extra wrappers:**
129
+ - Do NOT wrap \`<shell_script>\` inside any other XML tag (e.g. \`<shell_function_calls>\`, \`<function_calls>\`, \`<invoke>\`). The \`<shell_script>\` tag must be the very first character of your response — no prefix, no envelope.
124
130
  - Your **entire response** — from the very first character to the very last — must be the tag and its contents. Nothing before the opening tag. Nothing after the closing tag.
125
131
  - Do NOT write reasoning, planning, or commentary before acting. Emit the tag immediately. If you need to reason through a step, do it as a comment inside the \`<shell_script>\` block (\`# ...\`), never as free text outside.
126
132
  - After receiving \`TERMINAL OUTPUT:\` or \`COMMAND ERROR:\`, your very next characters must be \`<shell_script>\` or \`<final_answer>\`. No exceptions.
127
133
  - If you feel you need to plan or think before writing the first script — suppress it. Emit \`<shell_script>\` for the first small step immediately. You will have the output to guide the next step.
128
134
 
129
- Never wrap in additional XML/JSON.
130
-
131
135
  **Shell script structure:**
132
136
  ${!isWindows
133
137
  ? `\`\`\`bash
@@ -126,6 +126,21 @@ async function runToolLoop(initialResult, session, sessionId, send, log, tools,
126
126
  });
127
127
  return { id: tc.id, name: tc.name, result: toolResult };
128
128
  }
129
+ // shell_script is not a callable tool — the model should embed commands
130
+ // in its text response using <shell_script>...</shell_script> XML tags.
131
+ // Intercept here so we don't fire a misleading "Fetching URL: undefined"
132
+ // web-call notification and return a clear correction instead.
133
+ if (tc.name === 'shell_script') {
134
+ log.warn('Agent attempted to call shell_script as a function; returning format-correction', {
135
+ sessionId,
136
+ toolIteration: toolIterations,
137
+ });
138
+ return {
139
+ id: tc.id,
140
+ name: tc.name,
141
+ result: 'Error: "shell_script" is not a callable tool. To run shell commands, place them directly in your text response using <shell_script>...</shell_script> XML tags — do not use tool/function calling for this.',
142
+ };
143
+ }
129
144
  // Notify the frontend that a web tool call is about to execute.
130
145
  const webCallContent = tc.name === 'web_search'
131
146
  ? `Searching the web for: "${String(args.query ?? '')}"`
@@ -195,6 +210,33 @@ async function runToolLoop(initialResult, session, sessionId, send, log, tools,
195
210
  const aiModel = (0, ai_client_1.getDefaultModel)(config_1.config.aiProvider, 'smart');
196
211
  const contextWindowSize = (0, ai_client_1.getContextWindowSize)(config_1.config.aiProvider);
197
212
  // ─── DB helpers ───────────────────────────────────────────────────────────────
213
/**
 * Sanitize LLM content before processing or forwarding to the client.
 *
 * Two known hallucination patterns are fixed here:
 *
 * 1. <shell_function_calls> wrapper — the model sometimes wraps <shell_script>
 *    in a <shell_function_calls> envelope. Stored verbatim it compounds on
 *    every turn (double/triple nesting), so every opening and closing
 *    occurrence is stripped (case-insensitively).
 *
 * 2. Mismatched closing tag — the model opens with <shell_script> but closes
 *    with a different tag (e.g. </shell_function>, </shell>, </script>). The
 *    macOS client's extractor looks for </shell_script> exactly; a wrong tag
 *    makes it treat the entire script as plain reasoning text and call
 *    receiveNext(), while the backend waits for terminal output — a deadlock.
 *    Only when the correct closing tag is absent:
 *      a. every </shell…> variant is rewritten to </shell_script>;
 *      b. if that still leaves no closing tag, a </script> that terminates
 *         the response is rewritten too. It is anchored to the end of the
 *         text because a </script> in the middle of a script may be genuine
 *         content (e.g. HTML written through a heredoc) and must survive.
 *
 * @param {string} content - Raw assistant text (callers pass an already
 *   trimmed string).
 * @returns {string} The sanitized text with surrounding whitespace trimmed.
 */
function sanitizeLLMContent(content) {
    // 1. Strip <shell_function_calls> wrapper tags.
    let result = content.replace(/<\/?shell_function_calls>/gi, '');
    // 2. Repair the closing tag only when a script was opened and never
    //    correctly closed; well-formed responses are left untouched.
    if (result.includes('<shell_script>') && !result.includes('</shell_script>')) {
        // 2a. Any stray </shell…> closing tag becomes the correct one.
        result = result.replace(/<\/shell\w*>/gi, '</shell_script>');
        // 2b. The documented </script> variant is not matched by the pattern
        //     above; fix it only when it is the final tag of the response.
        if (!result.includes('</shell_script>')) {
            result = result.replace(/<\/script>\s*$/i, '</shell_script>');
        }
    }
    return result.trim();
}
198
240
  async function persistSessionToDB(sessionId, state) {
199
241
  try {
200
242
  const historyJson = JSON.stringify(state.history);
@@ -514,7 +556,7 @@ async function runAgentTurnInternal(sessionId, subscription, clientMessage, send
514
556
  });
515
557
  await recordUsage(result);
516
558
  }
517
- let content = result.content.trim();
559
+ let content = sanitizeLLMContent(result.content.trim());
518
560
  if (!content && result.finish_reason !== 'tool_calls') {
519
561
  log.warn('Agent LLM returned empty content; sending generic error to client.');
520
562
  const errorMessage = 'The agent returned an empty response. Please try again.';
@@ -531,7 +573,7 @@ async function runAgentTurnInternal(sessionId, subscription, clientMessage, send
531
573
  turn: session.turns,
532
574
  });
533
575
  const toolLoopResult = await runToolLoop(result, session, sessionId, send, log, tools, mcpBundle.dispatch, recordUsage);
534
- const toolLoopContent = toolLoopResult.content.trim();
576
+ const toolLoopContent = sanitizeLLMContent(toolLoopResult.content.trim());
535
577
  const toolLoopHasShell = toolLoopContent.includes('<shell_script>');
536
578
  const toolLoopHasFinal = toolLoopContent.includes('<final_answer>');
537
579
  const webToolFailed = session.history.some((msg) => msg.role === 'tool' &&
package/package.json CHANGED
@@ -4,7 +4,7 @@
4
4
  "access": "public",
5
5
  "registry": "https://registry.npmjs.org/"
6
6
  },
7
- "version": "1.5.1",
7
+ "version": "1.5.2",
8
8
  "description": "CLI for onboarding users to Omnikey AI and configuring OPENAI_API_KEY. Use Yarn for install/build.",
9
9
  "engines": {
10
10
  "node": ">=14.0.0",