npm - @moxxy/cli - Versions diffs - 1.3.1 → 1.4.0 - Mend

@moxxy/cli 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/package.json +4 -1
package/src/api-client.js +71 -0
package/src/cli.js +3 -2
package/src/commands/init.js +174 -0
package/src/commands/settings.js +108 -6
package/src/tui/hooks/use-command-handler.js +63 -1
package/src/tui/slash-commands.js +1 -0
package/src/tui/voice-recorder.js +117 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@moxxy/cli",
-  "version": "1.3.1",
+  "version": "1.4.0",
   "description": "CLI for the Moxxy agentic framework — manage agents, skills, plugins, channels, and vaults from the terminal",
   "type": "module",
   "license": "MIT",
@@ -50,5 +50,8 @@
     "marked": "^15.0.0",
     "react": "^19.0.0",
     "tsx": "^4.21.0"
+  },
+  "devDependencies": {
+    "react-devtools-core": "file:./stubs/react-devtools-core"
   }
 }

package/src/api-client.js CHANGED Viewed

@@ -127,6 +127,46 @@ export class ApiClient {
     return this.request(`/v1/agents/${encodeURIComponent(agentId)}/runs`, 'POST', { task });
   }
+  /**
+   * Upload a recorded voice clip to the gateway. The server transcribes it
+   * via the configured STT provider and immediately starts a run with the
+   * transcript as the task.
+   *
+   * @param {string} agentId - Agent identifier; URL-encoded into the path.
+   * @param {object} clip
+   * @param {*} clip.data - Raw audio bytes (anything `Blob` accepts as a part).
+   * @param {string} [clip.mime='audio/wav'] - MIME type attached to the blob.
+   * @param {string} [clip.filename='voice.wav'] - File name of the multipart part.
+   * @returns {Promise<object>} Parsed JSON response
+   *   (`{ transcript, run_id, status, ... }`), or `{}` for an empty body.
+   * @throws {TypeError} When `clip.data` is missing.
+   * @throws {Error} The gateway-down error on connection failure, or an
+   *   error carrying `status` and `code` for a non-2xx response.
+   */
+  async startRunWithAudio(agentId, { data, mime = 'audio/wav', filename = 'voice.wav' }) {
+    // `new Blob([undefined])` stringifies its part, which would upload the
+    // literal text "undefined" as audio. Fail fast instead.
+    if (data == null) {
+      throw new TypeError('startRunWithAudio requires audio data');
+    }
+    const form = new FormData();
+    form.append('audio', new Blob([data], { type: mime }), filename);
+    const headers = {};
+    if (this.token) {
+      headers['authorization'] = `Bearer ${this.token}`;
+    }
+    // NOTE: do NOT set content-type — fetch will compute the multipart
+    // boundary for us.
+    const url = this.buildUrl(`/v1/agents/${encodeURIComponent(agentId)}/runs/audio`);
+    let resp;
+    try {
+      resp = await fetch(url, { method: 'POST', headers, body: form });
+    } catch (err) {
+      if (isConnectionError(err)) throw gatewayDownError();
+      throw err;
+    }
+    if (!resp.ok) {
+      const fallback = { error: 'unknown', message: resp.statusText };
+      const parsed = await resp.json().catch(() => fallback);
+      // A body of JSON `null` parses fine but has no properties to read.
+      const payload = parsed === null || parsed === undefined ? fallback : parsed;
+      const error = new Error(payload.message || `API error ${resp.status}`);
+      error.status = resp.status;
+      error.code = payload.error;
+      throw error;
+    }
+    const text = await resp.text();
+    if (!text) return {};
+    return JSON.parse(text);
+  }
   async stopAgent(agentId) {
     return this.request(`/v1/agents/${encodeURIComponent(agentId)}/stop`, 'POST');
   }
@@ -296,6 +336,37 @@ export class ApiClient {
   async setAgentTemplate(name, template) {
     return this.request(`/v1/agents/${encodeURIComponent(name)}/template`, 'PATCH', { template });
   }
+  // --- Settings: Speech-to-text ---------------------------------------
+  /**
+   * Read the gateway's active speech-to-text configuration.
+   *
+   * Resolves to `{ enabled: false }` when voice messages are off, or
+   * `{ enabled: true, provider, model, secret_ref, ... }` otherwise.
+   * The API never returns the raw API key.
+   */
+  async getSttSettings() {
+    const endpoint = '/v1/settings/stt';
+    return this.request(endpoint, 'GET');
+  }
+  /**
+   * Create or replace the speech-to-text configuration.
+   *
+   * Include `api_key` in `body` to provision a fresh vault secret; leave it
+   * out to reuse an existing `secret_ref`. The running gateway swaps
+   * providers in-place — no restart needed.
+   */
+  async updateSttSettings(body) {
+    const endpoint = '/v1/settings/stt';
+    return this.request(endpoint, 'PUT', body);
+  }
+  /**
+   * Turn voice messages off. Removes the `stt` block from settings.yaml
+   * and clears the in-memory provider. Does NOT delete the vault secret.
+   */
+  async deleteSttSettings() {
+    const endpoint = '/v1/settings/stt';
+    return this.request(endpoint, 'DELETE');
+  }
 }
 export function createApiClient(baseUrl, token, authMode = 'token') {

package/src/cli.js CHANGED Viewed

@@ -94,8 +94,9 @@ Usage:
   moxxy chat [--agent <id>]                           Alias for tui
   moxxy events tail [--agent <id>] [--run <id>] [--json]
   moxxy settings network-mode [safe|unsafe]           Get or set network mode
-  moxxy settings get [--key <k>] [--json]            View settings
-  moxxy settings set --key <k> --value <v>           Set a setting
+  moxxy settings stt [status|enable|disable]          Configure voice messages (speech-to-text)
+  moxxy settings get [--key <k>] [--json]             View settings
+  moxxy settings set --key <k> --value <v>            Set a setting
   moxxy doctor                                       Diagnose installation
   moxxy update [--check] [--force] [--json]          Check for and install updates
   moxxy update --rollback                            Restore previous gateway version

package/src/commands/init.js CHANGED Viewed

@@ -807,9 +807,183 @@ export async function runInit(client, args) {
     }
   }
+  // Step 8: Voice messages (optional)
+  p.note(
+    'Voice messages let users send audio to the agent on any channel\n' +
+    '(Telegram voice notes, the TUI /voice command, or direct audio upload\n' +
+    'to the gateway). The audio is transcribed to text before the agent\n' +
+    'sees it. The agent does not reply with voice.',
+    'Voice Messages (Speech-to-Text)'
+  );
+  const enableVoice = await p.confirm({
+    message: 'Enable voice messages?',
+    initialValue: false,
+  });
+  handleCancel(enableVoice);
+  if (enableVoice) {
+    const sttProvider = await p.select({
+      message: 'Speech-to-text provider',
+      options: [
+        {
+          value: 'whisper',
+          label: 'OpenAI Whisper',
+          hint: 'Cloud API, requires an OpenAI key',
+        },
+        { value: '__skip__', label: 'Skip', hint: 'configure later' },
+      ],
+    });
+    handleCancel(sttProvider);
+    if (sttProvider === 'whisper') {
+      const configured = await configureWhisperStt(client, moxxyHome);
+      if (configured) {
+        p.log.success('Voice messages enabled (OpenAI Whisper).');
+      } else {
+        p.log.warn('Voice setup skipped. Retry later with: moxxy init');
+      }
+    }
+  }
   p.outro('Setup complete. Run moxxy to see available commands.');
 }
+// ---------------------------------------------------------------------------
+// Speech-to-text (voice message) helpers
+// ---------------------------------------------------------------------------
+// `backend_key` of the vault secret created for a Whisper-specific API key;
+// also written to settings.yaml as the `secret_ref` when that secret is used.
+const STT_WHISPER_BACKEND_KEY = 'moxxy_stt_whisper';
+// `key_name` given to the vault secret that stores the Whisper API key.
+const STT_WHISPER_KEY_NAME = 'STT_WHISPER_API_KEY';
+// `backend_key` looked up in the vault listing to detect an OpenAI key that
+// can be reused for Whisper instead of prompting for a new one.
+const OPENAI_PROVIDER_BACKEND_KEY = 'moxxy_provider_openai';
+/**
+ * Set up OpenAI Whisper as the speech-to-text provider.
+ *
+ * First offers to reuse a vault secret that already backs the OpenAI
+ * provider; otherwise prompts for a key and stores it in the vault. The
+ * resulting secret reference is then written to the `stt` block of
+ * settings.yaml.
+ *
+ * @returns {Promise<boolean>} true once the block is written, false when
+ *   storing the key or writing the settings file fails.
+ */
+async function configureWhisperStt(client, moxxyHome) {
+  let secretRef = null;
+  // Offer the existing OpenAI secret first so the same key is not typed twice.
+  try {
+    const vaultEntries = (await client.listSecrets()) || [];
+    const hasOpenAiSecret = vaultEntries.some(
+      (entry) => entry.backend_key === OPENAI_PROVIDER_BACKEND_KEY,
+    );
+    if (hasOpenAiSecret) {
+      const wantsReuse = await p.confirm({
+        message: 'Reuse your existing OpenAI API key for Whisper?',
+        initialValue: true,
+      });
+      handleCancel(wantsReuse);
+      if (wantsReuse) secretRef = OPENAI_PROVIDER_BACKEND_KEY;
+    }
+  } catch (err) {
+    // A failed vault listing (for example gateway down) is not fatal: prompt instead.
+    p.log.warn(`Could not check existing vault secrets: ${err.message}`);
+  }
+  if (!secretRef) {
+    const enteredKey = await p.password({
+      message: 'Enter your OpenAI API key (used for Whisper transcription)',
+      validate: (val) => (val && val.trim() ? undefined : 'API key cannot be empty'),
+    });
+    handleCancel(enteredKey);
+    const storeKey = async () => {
+      await client.createSecret({
+        key_name: STT_WHISPER_KEY_NAME,
+        backend_key: STT_WHISPER_BACKEND_KEY,
+        policy_label: 'stt-provider',
+        value: enteredKey.trim(),
+      });
+    };
+    try {
+      await withSpinner('Storing API key in vault...', storeKey, 'Whisper API key stored.');
+      secretRef = STT_WHISPER_BACKEND_KEY;
+    } catch (err) {
+      p.log.error(`Failed to store API key: ${err.message}`);
+      return false;
+    }
+  }
+  try {
+    const sttBlock = { provider: 'whisper', model: 'whisper-1', secret_ref: secretRef };
+    saveSttSetting(moxxyHome, sttBlock);
+    return true;
+  } catch (err) {
+    p.log.error(`Failed to write settings.yaml: ${err.message}`);
+    return false;
+  }
+}
+/**
+ * Render a value as a YAML scalar for the `stt` block. Simple identifiers,
+ * model names and URLs are emitted as-is; anything that YAML could
+ * misread (spaces, `#`, quotes, newlines, a trailing `:`, boolean/null
+ * keywords) is emitted as a JSON string, which is a valid YAML
+ * double-quoted scalar.
+ */
+function formatSttScalar(value) {
+  const text = String(value);
+  const isPlainSafe = /^[A-Za-z_][A-Za-z0-9_.\/:@+-]*$/.test(text)
+    && !text.endsWith(':')
+    && !/^(true|false|null|yes|no|on|off)$/i.test(text);
+  return isPlainSafe ? text : JSON.stringify(text);
+}
+/**
+ * Write (or clear) the `stt` block in `{moxxy_home}/settings.yaml`.
+ *
+ * Pass `null` to remove the block. Pass an object with at least `provider`,
+ * `model`, and `secret_ref` (optionally `api_base`) to write a fresh block.
+ * Any prior `stt:` block is removed in full — including nested indented
+ * child lines and blank lines between them — before the new block is
+ * appended, so repeated runs don't accumulate stale entries.
+ *
+ * @param {string} moxxyHome - Directory holding settings.yaml; created if missing.
+ * @param {object|null} config - New block contents, or null to clear.
+ * @throws {Error} Propagates filesystem errors from mkdir/write.
+ */
+export function saveSttSetting(moxxyHome, config) {
+  const settingsFile = join(moxxyHome, 'settings.yaml');
+  let existing = '';
+  try {
+    existing = readFileSync(settingsFile, 'utf-8');
+  } catch { /* no existing settings */ }
+  // Strip any previous `stt:` block. A block is the `stt:` line plus all
+  // subsequent indented (leading whitespace) lines — block-style YAML.
+  const kept = [];
+  let inSttBlock = false;
+  // Blank lines seen inside the block are held back until we know what
+  // follows: another indented child means they belonged to the block (drop
+  // them); a top-level key means they were the separator (keep them).
+  let pendingBlanks = [];
+  for (const line of existing.split('\n')) {
+    if (inSttBlock) {
+      if (line.trim() === '') {
+        pendingBlanks.push(line);
+        continue;
+      }
+      if (/^\s+\S/.test(line)) {
+        pendingBlanks = [];
+        continue;
+      }
+      inSttBlock = false;
+      kept.push(...pendingBlanks);
+      pendingBlanks = [];
+    }
+    if (/^stt:(\s|$)/.test(line)) {
+      inSttBlock = true;
+      continue;
+    }
+    kept.push(line);
+  }
+  // Drop trailing empty lines so we can cleanly append.
+  while (kept.length > 0 && kept[kept.length - 1].trim() === '') kept.pop();
+  if (config) {
+    kept.push('stt:');
+    kept.push(`  provider: ${formatSttScalar(config.provider)}`);
+    kept.push(`  model: ${formatSttScalar(config.model)}`);
+    kept.push(`  secret_ref: ${formatSttScalar(config.secret_ref)}`);
+    if (config.api_base) kept.push(`  api_base: ${formatSttScalar(config.api_base)}`);
+  }
+  mkdirSync(moxxyHome, { recursive: true });
+  writeFileSync(settingsFile, kept.join('\n') + '\n');
+}
 // ---------------------------------------------------------------------------
 // Browser rendering helpers
 // ---------------------------------------------------------------------------

package/src/commands/settings.js CHANGED Viewed

@@ -175,7 +175,103 @@ async function settingsBrowserRendering(flags) {
   }
 }
-export async function runSettings(_client, args) {
+/**
+ * Speech-to-text (voice message) settings. Unlike network_mode and
+ * browser_rendering, STT is configured through the gateway's
+ * `/v1/settings/stt` API so the running bridge picks up the new provider
+ * without a restart AND the vault-stored API key is owned by the gateway.
+ *
+ * @param {object} client - API client exposing `getSttSettings`,
+ *   `updateSttSettings` and `deleteSttSettings`.
+ * @param {object} flags - Parsed CLI flags: `json`, `provider`, `model`,
+ *   and `api-key` / `api-base` / `secret-ref` (snake_case also accepted).
+ * @param {string} [positional] - Sub-action; defaults to `status`.
+ * @throws {Error} When no client is available, when credentials are
+ *   missing in a non-interactive session, when the API key is unusable,
+ *   or when the sub-action is unknown.
+ */
+async function settingsStt(client, flags, positional) {
+  if (!client) {
+    throw new Error('STT commands require a running gateway. Start it with: moxxy gateway start');
+  }
+  const sub = positional || 'status';
+  switch (sub) {
+    case 'status':
+    case 'get':
+    case 'show': {
+      const resp = await client.getSttSettings();
+      if (flags.json) {
+        console.log(JSON.stringify(resp, null, 2));
+        return;
+      }
+      if (!resp.enabled) {
+        p.log.info('Voice messages: disabled.');
+        p.log.info('Enable with: moxxy settings stt enable');
+        return;
+      }
+      p.log.info('Voice messages: enabled');
+      p.log.info(`  provider:   ${resp.provider}`);
+      p.log.info(`  model:      ${resp.model}`);
+      p.log.info(`  secret_ref: ${resp.secret_ref}`);
+      // Optional fields are only printed when the gateway sent them, so the
+      // output never shows a literal "undefined".
+      if (resp.api_base) p.log.info(`  api_base:   ${resp.api_base}`);
+      if (resp.max_bytes != null) p.log.info(`  max_bytes:  ${resp.max_bytes}`);
+      if (resp.max_seconds != null) p.log.info(`  max_seconds: ${resp.max_seconds}`);
+      return;
+    }
+    case 'enable':
+    case 'configure':
+    case 'set': {
+      // Non-interactive: `moxxy settings stt enable --api-key sk-... [--provider whisper] [--model whisper-1]`
+      const providerName = flags.provider || 'whisper';
+      const modelName = flags.model || 'whisper-1';
+      const apiBase = flags['api-base'] || flags.api_base || null;
+      let apiKey = flags['api-key'] || flags.api_key || null;
+      const secretRef = flags['secret-ref'] || flags.secret_ref || null;
+      // A bare `--api-key` with no value may be parsed as boolean `true`;
+      // report that clearly instead of crashing on `.trim()` below.
+      if (apiKey === true) {
+        throw new Error('--api-key requires a value.');
+      }
+      if (!apiKey && !secretRef) {
+        if (!isInteractive()) {
+          throw new Error(
+            'Provide --api-key <key>, or --secret-ref <backend_key> to reuse an existing vault entry.',
+          );
+        }
+        const keyInput = await p.password({
+          message: 'OpenAI API key for Whisper',
+          validate: (v) => {
+            if (!v || !v.trim()) return 'API key cannot be empty';
+          },
+        });
+        if (p.isCancel(keyInput)) return;
+        apiKey = keyInput;
+      }
+      const body = { provider: providerName, model: modelName };
+      if (apiKey) {
+        const trimmedKey = String(apiKey).trim();
+        // Same rule the interactive prompt enforces, applied to the flag path.
+        if (!trimmedKey) throw new Error('API key cannot be empty');
+        body.api_key = trimmedKey;
+      }
+      if (apiBase) body.api_base = apiBase;
+      if (secretRef) body.secret_ref = secretRef;
+      const resp = await client.updateSttSettings(body);
+      if (flags.json) {
+        console.log(JSON.stringify(resp, null, 2));
+      } else {
+        p.log.success(`Voice messages enabled (${resp.provider}, ${resp.model}).`);
+      }
+      return;
+    }
+    case 'disable':
+    case 'off':
+    case 'clear': {
+      const resp = await client.deleteSttSettings();
+      if (flags.json) {
+        console.log(JSON.stringify(resp, null, 2));
+      } else {
+        p.log.success('Voice messages disabled.');
+      }
+      return;
+    }
+    default:
+      throw new Error(
+        `Unknown stt action '${sub}'. Use: status | enable [--api-key <key>] | disable`,
+      );
+  }
+}
+export async function runSettings(client, args) {
   const { action, flags } = parseSettingsCommand(args);
   // Collect first positional arg after the action for convenience
@@ -198,6 +294,10 @@ export async function runSettings(_client, args) {
     case 'browser-rendering':
       await settingsBrowserRendering(flags);
       break;
+    case 'stt':
+    case 'voice':
+      await settingsStt(client, flags, flags._positional);
+      break;
     default:
       if (isInteractive() && !action) {
         // Interactive: show settings menu
@@ -206,18 +306,20 @@ export async function runSettings(_client, args) {
           options: [
             { value: 'network-mode', label: 'Network mode', hint: 'safe / unsafe domain access' },
             { value: 'browser-rendering', label: 'Browser rendering', hint: 'headless Chrome for JS-heavy sites' },
+            { value: 'stt', label: 'Voice (STT)', hint: 'speech-to-text provider' },
             { value: 'get', label: 'View all settings', hint: 'show current configuration' },
           ],
         });
         if (p.isCancel(selected)) return;
-        await runSettings(_client, [selected]);
+        await runSettings(client, [selected]);
       } else {
         throw new Error(
           'Usage: moxxy settings <action>\n' +
-          '  network-mode [safe|unsafe]         Get or set network mode\n' +
-          '  browser-rendering [true|false]     Enable/disable headless Chrome rendering\n' +
-          '  get [--key <k>]                    View settings\n' +
-          '  set --key <k> --value <v>          Set a setting'
+          '  network-mode [safe|unsafe]                     Get or set network mode\n' +
+          '  browser-rendering [true|false]                 Enable/disable headless Chrome rendering\n' +
+          '  stt [status|enable|disable] [--api-key <key>]  Configure voice messages (speech-to-text)\n' +
+          '  get [--key <k>]                                View settings\n' +
+          '  set --key <k> --value <v>                      Set a setting'
         );
       }
   }

package/src/tui/hooks/use-command-handler.js CHANGED Viewed

@@ -1,5 +1,6 @@
-import { useReducer, useCallback } from 'react';
+import { useReducer, useCallback, useRef } from 'react';
 import { SLASH_COMMANDS } from '../slash-commands.js';
+import { startRecording } from '../voice-recorder.js';
 const INITIAL_STATE = { type: 'idle' };
@@ -23,6 +24,8 @@ function reducer(state, action) {
       return { type: 'mcp_test_id' };
     case 'template_assign_slug':
       return { type: 'template_assign_slug' };
+    case 'voice_recording':
+      return { type: 'voice_recording' };
     case 'reset':
       return INITIAL_STATE;
     default:
@@ -54,9 +57,49 @@ export function useCommandHandler({
   onOpenTemplateAssignWizard,
 }) {
   const [twoStep, dispatch] = useReducer(reducer, INITIAL_STATE);
+  const voiceHandleRef = useRef(null);
   const handleSubmit = useCallback(async (text) => {
     const task = text.trim().replace(/^\/{2,}/, '/');
+    // While a recording is active, ANY submit (including bare Enter) stops
+    // it and ships the clip. This must run before the empty-text early return
+    // below so hitting Enter with no text still ends the capture.
+    if (twoStep.type === 'voice_recording') {
+      const handle = voiceHandleRef.current;
+      dispatch({ type: 'reset' });
+      voiceHandleRef.current = null;
+      if (!handle) {
+        eventsHandler.addSystemMessage('No active recording.');
+        return;
+      }
+      try {
+        const clip = await handle.stop();
+        eventsHandler.addSystemMessage('Transcribing voice message…');
+        if (!agent) {
+          eventsHandler.addSystemMessage('No agent connected. Cannot run task.');
+          return;
+        }
+        try {
+          const result = await client.startRunWithAudio(agent.name, clip);
+          const transcript = (result && result.transcript) || '[voice]';
+          eventsHandler.addUserMessage(transcript);
+          if (onAgentUpdate) onAgentUpdate({ status: 'running' });
+        } catch (err) {
+          if (err.isGatewayDown) {
+            eventsHandler.addSystemMessage(err.message);
+          } else {
+            eventsHandler.addSystemMessage(`Voice error: ${err.message}`);
+          }
+        }
+      } catch (err) {
+        eventsHandler.addSystemMessage(`Recording failed: ${err.message}`);
+      } finally {
+        handle.cleanup();
+      }
+      return;
+    }
     if (!task) return;
     // Pending ask: agent asked for user input
@@ -419,6 +462,25 @@ export function useCommandHandler({
       }
       return;
     }
+    if (task === '/voice') {
+      if (voiceHandleRef.current) {
+        // Defensive: treat a second /voice as a stop even if state drifted.
+        dispatch({ type: 'voice_recording' });
+        return;
+      }
+      try {
+        const handle = await startRecording();
+        voiceHandleRef.current = handle;
+        dispatch({ type: 'voice_recording' });
+        eventsHandler.addSystemMessage(
+          `Recording (${handle.tool})… press Enter or /voice again to stop.`,
+        );
+      } catch (err) {
+        eventsHandler.addSystemMessage(`Cannot record voice: ${err.message}`);
+      }
+      return;
+    }
     if (task === '/template clear') {
       try {
         await client.setAgentTemplate(agentId, null);

package/src/tui/slash-commands.js CHANGED Viewed

@@ -9,6 +9,7 @@ export const SLASH_COMMANDS = [
   { name: '/vault',        description: 'Open vault actions',          aliases: ['/vault delete'] },
   { name: '/mcp',          description: 'Open MCP actions',            aliases: [] },
   { name: '/template',     description: 'Open template actions',       aliases: [] },
+  { name: '/voice',        description: 'Record a voice message (needs sox or ffmpeg)', aliases: [] },
 ];
 export function matchCommands(input) {

package/src/tui/voice-recorder.js ADDED Viewed

@@ -0,0 +1,117 @@
+import { spawn } from 'node:child_process';
+import fs from 'node:fs';
+import os from 'node:os';
+import path from 'node:path';
+import { promisify } from 'node:util';
+import { execFile } from 'node:child_process';
+// Promise-returning wrapper around `execFile`, so callers can `await` it.
+const execFileP = promisify(execFile);
+/**
+ * Check whether a binary is on PATH. Returns the absolute path, or null.
+ *
+ * Uses `where` on Windows and `which` everywhere else. When the locator
+ * prints several matches, the first one is returned.
+ *
+ * @param {string} name - Executable name to look up.
+ * @returns {Promise<string|null>} Resolved path, or null when the lookup
+ *   fails or prints nothing.
+ */
+async function which(name) {
+  // `which` does not exist on stock Windows; `where` is the equivalent.
+  const locator = process.platform === 'win32' ? 'where' : 'which';
+  try {
+    const { stdout } = await execFileP(locator, [name]);
+    // One path per line; keep only the first non-empty entry so the result
+    // is always a single usable path.
+    const firstMatch = stdout.split(/\r?\n/).find((entry) => entry.trim() !== '');
+    return firstMatch ? firstMatch.trim() : null;
+  } catch {
+    // A non-zero exit (not found) or a missing locator both mean "not available".
+    return null;
+  }
+}
+/**
+ * Find a usable recording tool on PATH.
+ *
+ * Candidates are probed in order of preference: `rec` (sox's recording
+ * front end), then `sox` itself, then `ffmpeg`. The sox family is tried
+ * first because it speaks WAV out of the box and handles Ctrl-C gracefully.
+ *
+ * @returns {Promise<{tool: string, bin: string}|null>} The first tool
+ *   found and its resolved path, or null when none is installed.
+ */
+export async function detectRecorder() {
+  const candidates = ['rec', 'sox', 'ffmpeg'];
+  for (const tool of candidates) {
+    const bin = await which(tool);
+    if (bin) return { tool, bin };
+  }
+  return null;
+}
+/**
+ * Build the ffmpeg argument list for a mono, 16 kHz recording written to
+ * `outPath` (overwriting any existing file). Only the capture input
+ * differs per platform.
+ */
+function ffmpegArgs(outPath) {
+  const isMac = process.platform === 'darwin';
+  // macOS: `avfoundation` default audio input is `:0`.
+  // Otherwise: assume ALSA `default` — user can symlink their own if needed.
+  const captureInput = isMac
+    ? ['-f', 'avfoundation', '-i', ':0']
+    : ['-f', 'alsa', '-i', 'default'];
+  return ['-loglevel', 'error', ...captureInput, '-ac', '1', '-ar', '16000', '-y', outPath];
+}
+/**
+ * Start a recording. Returns a handle with:
+ *   - `tool`: name of the recorder in use (`rec`, `sox` or `ffmpeg`)
+ *   - `outPath`: path of the WAV file being written
+ *   - `stop()`: ends the capture and resolves to
+ *     `{ path, data, mime, filename }`
+ *   - `cleanup()`: removes the temp directory; the caller must invoke it.
+ *
+ * The audio is written to a platform temp file so that even if the recorder
+ * dies mid-stream we never lose the buffer.
+ *
+ * @throws {Error} When no recorder is installed, or when the recorder
+ *   process cannot be spawned.
+ */
+export async function startRecording() {
+  const recorder = await detectRecorder();
+  if (!recorder) {
+    throw new Error('No recorder found. Install `sox` (recommended) or `ffmpeg`.');
+  }
+  const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'moxxy-voice-'));
+  const outPath = path.join(tmpDir, 'voice.wav');
+  const removeTmpDir = () => {
+    try {
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    } catch {}
+  };
+  let args;
+  if (recorder.tool === 'rec' || recorder.tool === 'sox') {
+    // `rec` is sox with sensible defaults; `sox` requires `-d` for default input.
+    args = recorder.tool === 'rec'
+      ? ['-q', '-c', '1', '-r', '16000', outPath]
+      : ['-q', '-d', '-c', '1', '-r', '16000', outPath];
+  } else {
+    args = ffmpegArgs(outPath);
+  }
+  let child;
+  try {
+    child = spawn(recorder.bin, args, { stdio: ['ignore', 'ignore', 'pipe'] });
+  } catch (err) {
+    // No handle is returned on this path, so nobody else could clean up.
+    removeTmpDir();
+    throw err;
+  }
+  let stderr = '';
+  if (child.stderr) {
+    child.stderr.on('data', chunk => {
+      stderr += chunk.toString();
+    });
+  }
+  let exited = false;
+  let spawnError = null;
+  const exitPromise = new Promise((resolve) => {
+    child.on('exit', (code, signal) => {
+      exited = true;
+      resolve({ code, signal });
+    });
+    // Without an 'error' listener a failed spawn would raise an unhandled
+    // 'error' event, and 'exit' might never fire, leaving stop() waiting
+    // forever. Only a process that never started (no pid) counts as done;
+    // other errors (for example a failed kill) still wait for 'exit'.
+    child.on('error', (err) => {
+      if (child.pid === undefined) {
+        spawnError = err;
+        exited = true;
+        resolve({ code: null, signal: null });
+      }
+    });
+  });
+  return {
+    tool: recorder.tool,
+    outPath,
+    async stop() {
+      if (!exited) {
+        // SIGINT is important: ffmpeg and sox both flush the output file
+        // cleanly on SIGINT. SIGTERM/KILL can leave a truncated WAV header.
+        try { child.kill('SIGINT'); } catch {}
+      }
+      await exitPromise;
+      if (spawnError) {
+        throw new Error(`Recorder failed to start: ${spawnError.message}`);
+      }
+      if (!fs.existsSync(outPath)) {
+        throw new Error(`Recorder produced no output file. stderr: ${stderr.trim() || '<empty>'}`);
+      }
+      const data = fs.readFileSync(outPath);
+      if (data.length < 44) {
+        // 44 bytes is the minimum WAV header.
+        throw new Error('Recording too short or empty.');
+      }
+      return { path: outPath, data, mime: 'audio/wav', filename: 'voice.wav' };
+    },
+    cleanup() {
+      removeTmpDir();
+    },
+  };
+}