codeharbor 0.1.20 → 0.1.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -68,6 +68,15 @@ CLI_COMPAT_TRANSCRIBE_AUDIO=false
68
68
  CLI_COMPAT_AUDIO_TRANSCRIBE_MODEL=gpt-4o-mini-transcribe
69
69
  CLI_COMPAT_AUDIO_TRANSCRIBE_TIMEOUT_MS=120000
70
70
  CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS=6000
71
+ CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES=1
72
+ CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS=800
73
+ # Skip transcription when audio file is larger than this limit (bytes). Default: 25MB.
74
+ CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES=26214400
75
+ # Optional local whisper command. Use {input} placeholder for the audio file path.
76
+ # Example:
77
+ # CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND=codeharbor-whisper-transcribe --input {input} --model small
78
+ CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND=
79
+ CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS=180000
71
80
  # Optional JSONL output path for executed prompt recording (for replay benchmarking).
72
81
  CLI_COMPAT_RECORD_PATH=
73
82
 
package/README.md CHANGED
@@ -468,6 +468,17 @@ To make IM behavior closer to local `codex` CLI interaction, enable:
468
468
  - timeout for each audio transcription request
469
469
  - `CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS`
470
470
  - max transcript length appended to prompt for one attachment
471
+ - `CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES`
472
+ - retry count for local/OpenAI transcription failures (default `1`)
473
+ - `CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS`
474
+ - base retry delay between attempts
475
+ - `CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES`
476
+ - skip transcription when attachment is larger than this size
477
+ - `CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND`
478
+ - optional local whisper command template (use `{input}` placeholder for audio file path)
479
+ - helper command shipped by package: `codeharbor-whisper-transcribe --input {input} --model small`
480
+ - `CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS`
481
+ - timeout for local whisper command execution
471
482
  - `CLI_COMPAT_RECORD_PATH=/abs/path/records.jsonl`
472
483
  - append executed prompts as JSONL for replay benchmarking
473
484
 
@@ -514,11 +525,16 @@ When image attachments are present and `CLI_COMPAT_FETCH_MEDIA=true`, CodeHarbor
514
525
  When audio attachments are present and both `CLI_COMPAT_FETCH_MEDIA=true` and `CLI_COMPAT_TRANSCRIBE_AUDIO=true`, CodeHarbor will:
515
526
 
516
527
  1. download `m.audio` media to a temp file
517
- 2. call OpenAI audio transcription API and append transcript to `[audio_transcripts]` prompt block
518
- 3. continue request even if transcription fails (warn log + no transcript)
519
- 4. best-effort cleanup temp files after the request
520
-
521
- `OPENAI_API_KEY` is required only when audio transcription is enabled.
528
+ 2. skip oversized audio files based on `CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES`
529
+ 3. if `CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND` is configured, execute local whisper first
530
+ 4. if local whisper fails and `OPENAI_API_KEY` is available, fall back to the OpenAI transcription API
531
+ 5. retry transient failures using `CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES`
532
+ 6. append transcript to `[audio_transcripts]` prompt block
533
+ 7. continue request even if transcription fails (warn log + no transcript)
534
+ 8. best-effort cleanup temp files after the request
535
+
536
+ `OPENAI_API_KEY` is optional when a local whisper command is configured, and required only for the OpenAI fallback.
537
+ For `codeharbor-whisper-transcribe`, install runtime first: `python3 -m pip install faster-whisper`.
522
538
 
523
539
  ## Replay Benchmark
524
540
 
package/dist/cli.js CHANGED
@@ -24,14 +24,14 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
24
24
  ));
25
25
 
26
26
  // src/cli.ts
27
- var import_node_child_process7 = require("child_process");
27
+ var import_node_child_process8 = require("child_process");
28
28
  var import_node_fs11 = __toESM(require("fs"));
29
29
  var import_node_path15 = __toESM(require("path"));
30
30
  var import_commander = require("commander");
31
31
 
32
32
  // src/app.ts
33
- var import_node_child_process5 = require("child_process");
34
- var import_node_util3 = require("util");
33
+ var import_node_child_process6 = require("child_process");
34
+ var import_node_util4 = require("util");
35
35
 
36
36
  // src/admin-server.ts
37
37
  var import_node_child_process3 = require("child_process");
@@ -366,6 +366,26 @@ var ADMIN_CONSOLE_HTML = `<!doctype html>
366
366
  <span class="field-label">Audio transcript max chars</span>
367
367
  <input id="global-cli-audio-max-chars" type="number" min="1" />
368
368
  </label>
369
+ <label class="field">
370
+ <span class="field-label">Audio transcribe max retries</span>
371
+ <input id="global-cli-audio-max-retries" type="number" min="0" max="10" />
372
+ </label>
373
+ <label class="field">
374
+ <span class="field-label">Audio transcribe retry delay (ms)</span>
375
+ <input id="global-cli-audio-retry-delay" type="number" min="0" />
376
+ </label>
377
+ <label class="field">
378
+ <span class="field-label">Audio max bytes</span>
379
+ <input id="global-cli-audio-max-bytes" type="number" min="1" />
380
+ </label>
381
+ <label class="field">
382
+ <span class="field-label">Local whisper command</span>
383
+ <input id="global-cli-audio-local-command" type="text" placeholder='python3 /opt/whisper/transcribe.py --input {input}' />
384
+ </label>
385
+ <label class="field">
386
+ <span class="field-label">Local whisper timeout (ms)</span>
387
+ <input id="global-cli-audio-local-timeout" type="number" min="1" />
388
+ </label>
369
389
  <label class="checkbox"><input id="global-agent-enabled" type="checkbox" /><span>Enable multi-agent workflow</span></label>
370
390
  <label class="field">
371
391
  <span class="field-label">Workflow auto-repair rounds</span>
@@ -704,6 +724,13 @@ var ADMIN_CONSOLE_HTML = `<!doctype html>
704
724
  document.getElementById("global-cli-audio-model").value = cliCompat.audioTranscribeModel || "gpt-4o-mini-transcribe";
705
725
  document.getElementById("global-cli-audio-timeout").value = String(cliCompat.audioTranscribeTimeoutMs || 120000);
706
726
  document.getElementById("global-cli-audio-max-chars").value = String(cliCompat.audioTranscribeMaxChars || 6000);
727
+ document.getElementById("global-cli-audio-max-retries").value = String(
728
+ typeof cliCompat.audioTranscribeMaxRetries === "number" ? cliCompat.audioTranscribeMaxRetries : 1
729
+ );
730
+ document.getElementById("global-cli-audio-retry-delay").value = String(cliCompat.audioTranscribeRetryDelayMs || 800);
731
+ document.getElementById("global-cli-audio-max-bytes").value = String(cliCompat.audioTranscribeMaxBytes || 26214400);
732
+ document.getElementById("global-cli-audio-local-command").value = cliCompat.audioLocalWhisperCommand || "";
733
+ document.getElementById("global-cli-audio-local-timeout").value = String(cliCompat.audioLocalWhisperTimeoutMs || 180000);
707
734
  document.getElementById("global-agent-enabled").checked = Boolean(agentWorkflow.enabled);
708
735
  document.getElementById("global-agent-repair-rounds").value = String(
709
736
  typeof agentWorkflow.autoRepairMaxRounds === "number" ? agentWorkflow.autoRepairMaxRounds : 1
@@ -749,7 +776,12 @@ var ADMIN_CONSOLE_HTML = `<!doctype html>
749
776
  transcribeAudio: asBool("global-cli-transcribe-audio"),
750
777
  audioTranscribeModel: asText("global-cli-audio-model") || "gpt-4o-mini-transcribe",
751
778
  audioTranscribeTimeoutMs: asNumber("global-cli-audio-timeout", 120000),
752
- audioTranscribeMaxChars: asNumber("global-cli-audio-max-chars", 6000)
779
+ audioTranscribeMaxChars: asNumber("global-cli-audio-max-chars", 6000),
780
+ audioTranscribeMaxRetries: asNumber("global-cli-audio-max-retries", 1),
781
+ audioTranscribeRetryDelayMs: asNumber("global-cli-audio-retry-delay", 800),
782
+ audioTranscribeMaxBytes: asNumber("global-cli-audio-max-bytes", 26214400),
783
+ audioLocalWhisperCommand: asText("global-cli-audio-local-command"),
784
+ audioLocalWhisperTimeoutMs: asNumber("global-cli-audio-local-timeout", 180000)
753
785
  },
754
786
  agentWorkflow: {
755
787
  enabled: asBool("global-agent-enabled"),
@@ -2025,6 +2057,58 @@ var AdminServer = class {
2025
2057
  envUpdates.CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS = String(value);
2026
2058
  updatedKeys.push("cliCompat.audioTranscribeMaxChars");
2027
2059
  }
2060
+ if ("audioTranscribeMaxRetries" in compat) {
2061
+ const value = normalizePositiveInt(
2062
+ compat.audioTranscribeMaxRetries,
2063
+ this.config.cliCompat.audioTranscribeMaxRetries,
2064
+ 0,
2065
+ 10
2066
+ );
2067
+ this.config.cliCompat.audioTranscribeMaxRetries = value;
2068
+ envUpdates.CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES = String(value);
2069
+ updatedKeys.push("cliCompat.audioTranscribeMaxRetries");
2070
+ }
2071
+ if ("audioTranscribeRetryDelayMs" in compat) {
2072
+ const value = normalizeNonNegativeInt(
2073
+ compat.audioTranscribeRetryDelayMs,
2074
+ this.config.cliCompat.audioTranscribeRetryDelayMs
2075
+ );
2076
+ this.config.cliCompat.audioTranscribeRetryDelayMs = value;
2077
+ envUpdates.CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS = String(value);
2078
+ updatedKeys.push("cliCompat.audioTranscribeRetryDelayMs");
2079
+ }
2080
+ if ("audioTranscribeMaxBytes" in compat) {
2081
+ const value = normalizePositiveInt(
2082
+ compat.audioTranscribeMaxBytes,
2083
+ this.config.cliCompat.audioTranscribeMaxBytes,
2084
+ 1,
2085
+ Number.MAX_SAFE_INTEGER
2086
+ );
2087
+ this.config.cliCompat.audioTranscribeMaxBytes = value;
2088
+ envUpdates.CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES = String(value);
2089
+ updatedKeys.push("cliCompat.audioTranscribeMaxBytes");
2090
+ }
2091
+ if ("audioLocalWhisperCommand" in compat) {
2092
+ const value = normalizeString(
2093
+ compat.audioLocalWhisperCommand,
2094
+ this.config.cliCompat.audioLocalWhisperCommand ?? "",
2095
+ "cliCompat.audioLocalWhisperCommand"
2096
+ );
2097
+ this.config.cliCompat.audioLocalWhisperCommand = value || null;
2098
+ envUpdates.CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND = this.config.cliCompat.audioLocalWhisperCommand ?? "";
2099
+ updatedKeys.push("cliCompat.audioLocalWhisperCommand");
2100
+ }
2101
+ if ("audioLocalWhisperTimeoutMs" in compat) {
2102
+ const value = normalizePositiveInt(
2103
+ compat.audioLocalWhisperTimeoutMs,
2104
+ this.config.cliCompat.audioLocalWhisperTimeoutMs,
2105
+ 1,
2106
+ Number.MAX_SAFE_INTEGER
2107
+ );
2108
+ this.config.cliCompat.audioLocalWhisperTimeoutMs = value;
2109
+ envUpdates.CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS = String(value);
2110
+ updatedKeys.push("cliCompat.audioLocalWhisperTimeoutMs");
2111
+ }
2028
2112
  }
2029
2113
  if ("agentWorkflow" in body) {
2030
2114
  const workflow = asObject(body.agentWorkflow, "agentWorkflow");
@@ -3715,20 +3799,32 @@ var import_async_mutex = require("async-mutex");
3715
3799
  var import_promises5 = __toESM(require("fs/promises"));
3716
3800
 
3717
3801
  // src/audio-transcriber.ts
3802
+ var import_node_child_process5 = require("child_process");
3718
3803
  var import_promises3 = __toESM(require("fs/promises"));
3719
3804
  var import_node_path8 = __toESM(require("path"));
3805
+ var import_node_util3 = require("util");
3806
+ var execAsync = (0, import_node_util3.promisify)(import_node_child_process5.exec);
3807
+ var RETRYABLE_OPENAI_STATUS = /* @__PURE__ */ new Set([408, 425, 429, 500, 502, 503, 504]);
3720
3808
  var AudioTranscriber = class {
3721
3809
  enabled;
3722
3810
  apiKey;
3723
3811
  model;
3724
3812
  timeoutMs;
3725
3813
  maxChars;
3814
+ maxRetries;
3815
+ retryDelayMs;
3816
+ localWhisperCommand;
3817
+ localWhisperTimeoutMs;
3726
3818
  constructor(options) {
3727
3819
  this.enabled = options.enabled;
3728
3820
  this.apiKey = options.apiKey;
3729
3821
  this.model = options.model;
3730
3822
  this.timeoutMs = options.timeoutMs;
3731
3823
  this.maxChars = options.maxChars;
3824
+ this.maxRetries = options.maxRetries;
3825
+ this.retryDelayMs = options.retryDelayMs;
3826
+ this.localWhisperCommand = options.localWhisperCommand;
3827
+ this.localWhisperTimeoutMs = options.localWhisperTimeoutMs;
3732
3828
  }
3733
3829
  isEnabled() {
3734
3830
  return this.enabled;
@@ -3737,14 +3833,20 @@ var AudioTranscriber = class {
3737
3833
  if (!this.enabled || attachments.length === 0) {
3738
3834
  return [];
3739
3835
  }
3740
- if (!this.apiKey) {
3836
+ const hasLocalWhisper = Boolean(this.localWhisperCommand);
3837
+ const hasOpenAi = Boolean(this.apiKey);
3838
+ if (!hasLocalWhisper && !hasOpenAi) {
3741
3839
  throw new Error(
3742
- "Audio transcription is enabled but OPENAI_API_KEY is missing. Set OPENAI_API_KEY or disable CLI_COMPAT_TRANSCRIBE_AUDIO."
3840
+ "Audio transcription is enabled but no backend is configured. Set CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND or OPENAI_API_KEY."
3743
3841
  );
3744
3842
  }
3745
3843
  const transcripts = [];
3844
+ const failures = [];
3746
3845
  for (const attachment of attachments) {
3747
- const text = await this.transcribeOne(attachment);
3846
+ const text = await this.transcribeWithFallback(attachment, hasLocalWhisper, hasOpenAi).catch((error) => {
3847
+ failures.push(formatError3(error));
3848
+ return "";
3849
+ });
3748
3850
  if (!text) {
3749
3851
  continue;
3750
3852
  }
@@ -3753,9 +3855,73 @@ var AudioTranscriber = class {
3753
3855
  text
3754
3856
  });
3755
3857
  }
3858
+ if (transcripts.length === 0 && failures.length > 0) {
3859
+ throw new Error(`Audio transcription failed: ${failures.join(" | ")}`);
3860
+ }
3756
3861
  return transcripts;
3757
3862
  }
3758
- async transcribeOne(attachment) {
3863
+ async transcribeWithFallback(attachment, hasLocalWhisper, hasOpenAi) {
3864
+ let localError = null;
3865
+ if (hasLocalWhisper) {
3866
+ try {
3867
+ const localText = await this.transcribeOneWithLocalWhisperWithRetry(attachment);
3868
+ if (localText) {
3869
+ return localText;
3870
+ }
3871
+ } catch (error) {
3872
+ localError = error;
3873
+ }
3874
+ }
3875
+ if (hasOpenAi) {
3876
+ try {
3877
+ return await this.transcribeOneWithOpenAiWithRetry(attachment);
3878
+ } catch (error) {
3879
+ if (!localError) {
3880
+ throw error;
3881
+ }
3882
+ throw new Error(
3883
+ `local whisper failed (${formatError3(localError)}), and OpenAI fallback also failed (${formatError3(error)}).`,
3884
+ { cause: error }
3885
+ );
3886
+ }
3887
+ }
3888
+ if (localError) {
3889
+ throw localError;
3890
+ }
3891
+ return "";
3892
+ }
3893
+ async transcribeOneWithOpenAiWithRetry(attachment) {
3894
+ let attempt = 0;
3895
+ while (true) {
3896
+ try {
3897
+ return await this.transcribeOneWithOpenAi(attachment);
3898
+ } catch (error) {
3899
+ if (!isRetryableOpenAiError(error) || attempt >= this.maxRetries) {
3900
+ throw error;
3901
+ }
3902
+ attempt += 1;
3903
+ await sleep2(this.retryDelayMs * attempt);
3904
+ }
3905
+ }
3906
+ }
3907
+ async transcribeOneWithLocalWhisperWithRetry(attachment) {
3908
+ let attempt = 0;
3909
+ while (true) {
3910
+ try {
3911
+ return await this.transcribeOneWithLocalWhisper(attachment);
3912
+ } catch (error) {
3913
+ if (attempt >= this.maxRetries) {
3914
+ throw error;
3915
+ }
3916
+ attempt += 1;
3917
+ await sleep2(this.retryDelayMs * attempt);
3918
+ }
3919
+ }
3920
+ }
3921
+ async transcribeOneWithOpenAi(attachment) {
3922
+ if (!this.apiKey) {
3923
+ return "";
3924
+ }
3759
3925
  const buffer = await import_promises3.default.readFile(attachment.localPath);
3760
3926
  const formData = new FormData();
3761
3927
  formData.append("model", this.model);
@@ -3784,9 +3950,32 @@ var AudioTranscriber = class {
3784
3950
  const payload = await response.json().catch(() => ({}));
3785
3951
  if (!response.ok) {
3786
3952
  const message = typeof payload?.error?.message === "string" ? payload.error.message : `HTTP ${response.status} ${response.statusText}`;
3787
- throw new Error(`Audio transcription failed for ${attachment.name}: ${message}`);
3953
+ throw new OpenAiTranscriptionHttpError(response.status, `Audio transcription failed for ${attachment.name}: ${message}`);
3788
3954
  }
3789
3955
  const text = typeof payload.text === "string" ? payload.text.trim() : "";
3956
+ return this.normalizeTranscriptText(text);
3957
+ }
3958
+ async transcribeOneWithLocalWhisper(attachment) {
3959
+ if (!this.localWhisperCommand) {
3960
+ return "";
3961
+ }
3962
+ const command = buildLocalWhisperCommand(this.localWhisperCommand, attachment.localPath);
3963
+ const result = await execAsync(command, {
3964
+ timeout: this.localWhisperTimeoutMs,
3965
+ maxBuffer: 4 * 1024 * 1024,
3966
+ shell: "/bin/bash"
3967
+ });
3968
+ const text = result.stdout.trim();
3969
+ if (!text) {
3970
+ const stderr = result.stderr.trim();
3971
+ throw new Error(
3972
+ stderr ? `Local whisper command produced empty output for ${attachment.name}: ${stderr}` : `Local whisper command produced empty output for ${attachment.name}.`
3973
+ );
3974
+ }
3975
+ return this.normalizeTranscriptText(text);
3976
+ }
3977
+ normalizeTranscriptText(rawText) {
3978
+ const text = rawText.trim();
3790
3979
  if (!text) {
3791
3980
  return "";
3792
3981
  }
@@ -3796,6 +3985,48 @@ var AudioTranscriber = class {
3796
3985
  return text;
3797
3986
  }
3798
3987
  };
3988
// Expand a user-configured whisper command template with the audio file
// path. Every "{input}" placeholder is substituted with the shell-escaped
// path; when no placeholder is present, the path is appended as the last
// argument.
function buildLocalWhisperCommand(template, inputPath) {
  const escapedInput = shellEscape(inputPath);
  return template.includes("{input}")
    ? template.replaceAll("{input}", escapedInput)
    : `${template} ${escapedInput}`;
}
// Single-quote a value for POSIX shells; each embedded single quote is
// rewritten as '"'"' (close quote, quoted quote, reopen quote).
function shellEscape(value) {
  return `'${value.replace(/'/g, `'"'"'`)}'`;
}
3998
// Decide whether a failed OpenAI transcription attempt should be retried.
// HTTP errors are retried only for known transient status codes; every
// other failure (aborts/timeouts, network errors, unexpected exceptions)
// is treated as transient and retried.
// Fix: the original had a dead `AbortError` branch returning true right
// before an unconditional `return true` — removed, behavior unchanged.
function isRetryableOpenAiError(error) {
  if (error instanceof OpenAiTranscriptionHttpError) {
    return RETRYABLE_OPENAI_STATUS.has(error.status);
  }
  return true;
}
4007
// Resolve after `delayMs` milliseconds; resolves immediately for zero or
// negative delays. The timer is unref'd so a pending sleep never keeps
// the Node process alive on its own.
async function sleep2(delayMs) {
  if (delayMs > 0) {
    await new Promise((resolve) => {
      setTimeout(resolve, delayMs).unref?.();
    });
  }
}
4016
// Render any thrown value as a log-friendly string: Error instances
// contribute their message, everything else is stringified.
function formatError3(error) {
  return error instanceof Error ? error.message : String(error);
}
4022
// Error carrying the HTTP status code of a failed OpenAI transcription
// call, so retry logic can distinguish transient from permanent failures.
var OpenAiTranscriptionHttpError = class extends Error {
  // HTTP status code returned by the transcription endpoint.
  status;
  constructor(status, message) {
    super(message);
    this.status = status;
    this.name = "OpenAiTranscriptionHttpError";
  }
};
3799
4030
 
3800
4031
  // src/compat/cli-compat-recorder.ts
3801
4032
  var import_node_fs6 = __toESM(require("fs"));
@@ -4693,6 +4924,11 @@ var Orchestrator = class {
4693
4924
  audioTranscribeModel: "gpt-4o-mini-transcribe",
4694
4925
  audioTranscribeTimeoutMs: 12e4,
4695
4926
  audioTranscribeMaxChars: 6e3,
4927
+ audioTranscribeMaxRetries: 1,
4928
+ audioTranscribeRetryDelayMs: 800,
4929
+ audioTranscribeMaxBytes: 26214400,
4930
+ audioLocalWhisperCommand: null,
4931
+ audioLocalWhisperTimeoutMs: 18e4,
4696
4932
  recordPath: null
4697
4933
  };
4698
4934
  this.cliCompatRecorder = this.cliCompat.recordPath ? new CliCompatRecorder(this.cliCompat.recordPath) : null;
@@ -4701,7 +4937,11 @@ var Orchestrator = class {
4701
4937
  apiKey: process.env.OPENAI_API_KEY?.trim() || null,
4702
4938
  model: this.cliCompat.audioTranscribeModel,
4703
4939
  timeoutMs: this.cliCompat.audioTranscribeTimeoutMs,
4704
- maxChars: this.cliCompat.audioTranscribeMaxChars
4940
+ maxChars: this.cliCompat.audioTranscribeMaxChars,
4941
+ maxRetries: this.cliCompat.audioTranscribeMaxRetries,
4942
+ retryDelayMs: this.cliCompat.audioTranscribeRetryDelayMs,
4943
+ localWhisperCommand: this.cliCompat.audioLocalWhisperCommand,
4944
+ localWhisperTimeoutMs: this.cliCompat.audioLocalWhisperTimeoutMs
4705
4945
  });
4706
4946
  const defaultProgressInterval = options?.progressMinIntervalMs ?? 2500;
4707
4947
  this.progressMinIntervalMs = this.cliCompat.enabled ? this.cliCompat.progressThrottleMs : defaultProgressInterval;
@@ -4832,7 +5072,7 @@ var Orchestrator = class {
4832
5072
  this.logger.error("Workflow request failed", {
4833
5073
  requestId,
4834
5074
  sessionKey,
4835
- error: formatError3(error)
5075
+ error: formatError4(error)
4836
5076
  });
4837
5077
  } finally {
4838
5078
  rateDecision.release?.();
@@ -4863,7 +5103,7 @@ var Orchestrator = class {
4863
5103
  this.logger.error("AutoDev request failed", {
4864
5104
  requestId,
4865
5105
  sessionKey,
4866
- error: formatError3(error)
5106
+ error: formatError4(error)
4867
5107
  });
4868
5108
  } finally {
4869
5109
  rateDecision.release?.();
@@ -5006,7 +5246,7 @@ var Orchestrator = class {
5006
5246
  try {
5007
5247
  await this.channel.sendMessage(
5008
5248
  message.conversationId,
5009
- `[CodeHarbor] Failed to process request: ${formatError3(error)}`
5249
+ `[CodeHarbor] Failed to process request: ${formatError4(error)}`
5010
5250
  );
5011
5251
  } catch (sendError) {
5012
5252
  this.logger.error("Failed to send error reply to Matrix", sendError);
@@ -5021,7 +5261,7 @@ var Orchestrator = class {
5021
5261
  queueWaitMs,
5022
5262
  executionDurationMs,
5023
5263
  totalDurationMs: Date.now() - receivedAt,
5024
- error: formatError3(error)
5264
+ error: formatError4(error)
5025
5265
  });
5026
5266
  } finally {
5027
5267
  const running = this.runningExecutions.get(sessionKey);
@@ -5148,7 +5388,7 @@ var Orchestrator = class {
5148
5388
  - runError: ${snapshot.error ?? "N/A"}`
5149
5389
  );
5150
5390
  } catch (error) {
5151
- await this.channel.sendNotice(message.conversationId, `[CodeHarbor] AutoDev \u72B6\u6001\u8BFB\u53D6\u5931\u8D25: ${formatError3(error)}`);
5391
+ await this.channel.sendNotice(message.conversationId, `[CodeHarbor] AutoDev \u72B6\u6001\u8BFB\u53D6\u5931\u8D25: ${formatError4(error)}`);
5152
5392
  }
5153
5393
  }
5154
5394
  async handleAutoDevRunCommand(taskId, sessionKey, message, requestId, workdir) {
@@ -5256,7 +5496,7 @@ var Orchestrator = class {
5256
5496
  } catch (restoreError) {
5257
5497
  this.logger.warn("Failed to restore AutoDev task status after failure", {
5258
5498
  taskId: activeTask.id,
5259
- error: formatError3(restoreError)
5499
+ error: formatError4(restoreError)
5260
5500
  });
5261
5501
  }
5262
5502
  }
@@ -5270,7 +5510,7 @@ var Orchestrator = class {
5270
5510
  taskDescription: activeTask.description,
5271
5511
  approved: null,
5272
5512
  repairRounds: 0,
5273
- error: formatError3(error)
5513
+ error: formatError4(error)
5274
5514
  });
5275
5515
  throw error;
5276
5516
  }
@@ -5352,7 +5592,7 @@ var Orchestrator = class {
5352
5592
  objective: normalizedObjective,
5353
5593
  approved: null,
5354
5594
  repairRounds: 0,
5355
- error: formatError3(error)
5595
+ error: formatError4(error)
5356
5596
  });
5357
5597
  await this.finishProgress(progressCtx, buildFailureProgressSummary(status, requestStartedAt, error));
5358
5598
  throw error;
@@ -5371,7 +5611,7 @@ var Orchestrator = class {
5371
5611
  await this.channel.sendNotice(conversationId, "[CodeHarbor] Multi-Agent workflow \u5DF2\u53D6\u6D88\u3002");
5372
5612
  return Date.now() - startedAt;
5373
5613
  }
5374
- await this.channel.sendMessage(conversationId, `[CodeHarbor] Multi-Agent workflow \u5931\u8D25: ${formatError3(error)}`);
5614
+ await this.channel.sendMessage(conversationId, `[CodeHarbor] Multi-Agent workflow \u5931\u8D25: ${formatError4(error)}`);
5375
5615
  return Date.now() - startedAt;
5376
5616
  }
5377
5617
  async sendAutoDevFailure(conversationId, error) {
@@ -5381,7 +5621,7 @@ var Orchestrator = class {
5381
5621
  await this.channel.sendNotice(conversationId, "[CodeHarbor] AutoDev \u5DF2\u53D6\u6D88\u3002");
5382
5622
  return Date.now() - startedAt;
5383
5623
  }
5384
- await this.channel.sendMessage(conversationId, `[CodeHarbor] AutoDev \u5931\u8D25: ${formatError3(error)}`);
5624
+ await this.channel.sendMessage(conversationId, `[CodeHarbor] AutoDev \u5931\u8D25: ${formatError4(error)}`);
5385
5625
  return Date.now() - startedAt;
5386
5626
  }
5387
5627
  async handleStopCommand(sessionKey, message, requestId) {
@@ -5536,35 +5776,73 @@ var Orchestrator = class {
5536
5776
  if (!this.audioTranscriber.isEnabled()) {
5537
5777
  return [];
5538
5778
  }
5539
- const audioAttachments = message.attachments.filter((attachment) => attachment.kind === "audio" && Boolean(attachment.localPath)).map((attachment) => ({
5540
- name: attachment.name,
5541
- mimeType: attachment.mimeType,
5542
- localPath: attachment.localPath
5543
- }));
5544
- if (audioAttachments.length === 0) {
5779
+ const rawAudioAttachments = message.attachments.filter(
5780
+ (attachment) => attachment.kind === "audio" && Boolean(attachment.localPath)
5781
+ );
5782
+ if (rawAudioAttachments.length === 0) {
5545
5783
  return [];
5546
5784
  }
5547
- try {
5548
- const transcripts = await this.audioTranscriber.transcribeMany(audioAttachments);
5549
- if (transcripts.length > 0) {
5550
- this.logger.info("Audio transcription completed", {
5785
+ const maxBytes = this.cliCompat.audioTranscribeMaxBytes;
5786
+ const audioAttachments = [];
5787
+ let skippedTooLarge = 0;
5788
+ for (const attachment of rawAudioAttachments) {
5789
+ const localPath = attachment.localPath;
5790
+ const sizeBytes = await this.resolveAudioAttachmentSizeBytes(attachment.sizeBytes, localPath);
5791
+ if (sizeBytes !== null && sizeBytes > maxBytes) {
5792
+ skippedTooLarge += 1;
5793
+ this.logger.warn("Skip audio transcription for oversized attachment", {
5551
5794
  requestId,
5552
5795
  sessionKey,
5553
- attachmentCount: audioAttachments.length,
5554
- transcriptCount: transcripts.length
5796
+ name: attachment.name,
5797
+ sizeBytes,
5798
+ maxBytes
5555
5799
  });
5800
+ continue;
5556
5801
  }
5802
+ audioAttachments.push({
5803
+ name: attachment.name,
5804
+ mimeType: attachment.mimeType,
5805
+ localPath
5806
+ });
5807
+ }
5808
+ if (audioAttachments.length === 0) {
5809
+ return [];
5810
+ }
5811
+ const startedAt = Date.now();
5812
+ try {
5813
+ const transcripts = await this.audioTranscriber.transcribeMany(audioAttachments);
5814
+ this.logger.info("Audio transcription completed", {
5815
+ requestId,
5816
+ sessionKey,
5817
+ attachmentCount: audioAttachments.length,
5818
+ transcriptCount: transcripts.length,
5819
+ skippedTooLarge,
5820
+ durationMs: Date.now() - startedAt
5821
+ });
5557
5822
  return transcripts;
5558
5823
  } catch (error) {
5559
5824
  this.logger.warn("Audio transcription failed, continuing without transcripts", {
5560
5825
  requestId,
5561
5826
  sessionKey,
5562
5827
  attachmentCount: audioAttachments.length,
5563
- error: formatError3(error)
5828
+ skippedTooLarge,
5829
+ durationMs: Date.now() - startedAt,
5830
+ error: formatError4(error)
5564
5831
  });
5565
5832
  return [];
5566
5833
  }
5567
5834
  }
5835
+ async resolveAudioAttachmentSizeBytes(sizeBytes, localPath) {
5836
+ if (sizeBytes !== null) {
5837
+ return sizeBytes;
5838
+ }
5839
+ try {
5840
+ const stats = await import_promises5.default.stat(localPath);
5841
+ return stats.size;
5842
+ } catch {
5843
+ return null;
5844
+ }
5845
+ }
5568
5846
  buildExecutionPrompt(prompt, message, audioTranscripts) {
5569
5847
  if (message.attachments.length === 0 && audioTranscripts.length === 0) {
5570
5848
  return prompt;
@@ -5718,7 +5996,7 @@ function createIdleAutoDevSnapshot() {
5718
5996
  function buildSessionKey(message) {
5719
5997
  return `${message.channel}:${message.conversationId}:${message.senderId}`;
5720
5998
  }
5721
- function formatError3(error) {
5999
+ function formatError4(error) {
5722
6000
  if (error instanceof Error) {
5723
6001
  return error.message;
5724
6002
  }
@@ -5827,7 +6105,7 @@ function classifyExecutionOutcome(error) {
5827
6105
  if (error instanceof CodexExecutionCancelledError) {
5828
6106
  return "cancelled";
5829
6107
  }
5830
- const message = formatError3(error).toLowerCase();
6108
+ const message = formatError4(error).toLowerCase();
5831
6109
  if (message.includes("timed out")) {
5832
6110
  return "timeout";
5833
6111
  }
@@ -5839,9 +6117,9 @@ function buildFailureProgressSummary(status, startedAt, error) {
5839
6117
  return `\u5904\u7406\u5DF2\u53D6\u6D88\uFF08\u8017\u65F6 ${elapsed}\uFF09`;
5840
6118
  }
5841
6119
  if (status === "timeout") {
5842
- return `\u5904\u7406\u8D85\u65F6\uFF08\u8017\u65F6 ${elapsed}\uFF09: ${formatError3(error)}`;
6120
+ return `\u5904\u7406\u8D85\u65F6\uFF08\u8017\u65F6 ${elapsed}\uFF09: ${formatError4(error)}`;
5843
6121
  }
5844
- return `\u5904\u7406\u5931\u8D25\uFF08\u8017\u65F6 ${elapsed}\uFF09: ${formatError3(error)}`;
6122
+ return `\u5904\u7406\u5931\u8D25\uFF08\u8017\u65F6 ${elapsed}\uFF09: ${formatError4(error)}`;
5845
6123
  }
5846
6124
  function buildWorkflowResultReply(result) {
5847
6125
  return `[CodeHarbor] Multi-Agent workflow \u5B8C\u6210
@@ -6262,7 +6540,7 @@ function boolToInt(value) {
6262
6540
  }
6263
6541
 
6264
6542
  // src/app.ts
6265
- var execFileAsync3 = (0, import_node_util3.promisify)(import_node_child_process5.execFile);
6543
+ var execFileAsync3 = (0, import_node_util4.promisify)(import_node_child_process6.execFile);
6266
6544
  var CodeHarborApp = class {
6267
6545
  config;
6268
6546
  logger;
@@ -6465,6 +6743,11 @@ var configSchema = import_zod.z.object({
6465
6743
  CLI_COMPAT_AUDIO_TRANSCRIBE_MODEL: import_zod.z.string().default("gpt-4o-mini-transcribe"),
6466
6744
  CLI_COMPAT_AUDIO_TRANSCRIBE_TIMEOUT_MS: import_zod.z.string().default("120000").transform((v) => Number.parseInt(v, 10)).pipe(import_zod.z.number().int().positive()),
6467
6745
  CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS: import_zod.z.string().default("6000").transform((v) => Number.parseInt(v, 10)).pipe(import_zod.z.number().int().positive()),
6746
+ CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES: import_zod.z.string().default("1").transform((v) => Number.parseInt(v, 10)).pipe(import_zod.z.number().int().min(0).max(10)),
6747
+ CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS: import_zod.z.string().default("800").transform((v) => Number.parseInt(v, 10)).pipe(import_zod.z.number().int().nonnegative()),
6748
+ CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES: import_zod.z.string().default("26214400").transform((v) => Number.parseInt(v, 10)).pipe(import_zod.z.number().int().positive()),
6749
+ CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND: import_zod.z.string().default(""),
6750
+ CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS: import_zod.z.string().default("180000").transform((v) => Number.parseInt(v, 10)).pipe(import_zod.z.number().int().positive()),
6468
6751
  CLI_COMPAT_RECORD_PATH: import_zod.z.string().default(""),
6469
6752
  DOCTOR_HTTP_TIMEOUT_MS: import_zod.z.string().default("10000").transform((v) => Number.parseInt(v, 10)).pipe(import_zod.z.number().int().positive()),
6470
6753
  ADMIN_BIND_HOST: import_zod.z.string().default("127.0.0.1"),
@@ -6529,6 +6812,11 @@ var configSchema = import_zod.z.object({
6529
6812
  audioTranscribeModel: v.CLI_COMPAT_AUDIO_TRANSCRIBE_MODEL.trim() || "gpt-4o-mini-transcribe",
6530
6813
  audioTranscribeTimeoutMs: v.CLI_COMPAT_AUDIO_TRANSCRIBE_TIMEOUT_MS,
6531
6814
  audioTranscribeMaxChars: v.CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS,
6815
+ audioTranscribeMaxRetries: v.CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES,
6816
+ audioTranscribeRetryDelayMs: v.CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS,
6817
+ audioTranscribeMaxBytes: v.CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES,
6818
+ audioLocalWhisperCommand: v.CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND.trim() ? v.CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND.trim() : null,
6819
+ audioLocalWhisperTimeoutMs: v.CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS,
6532
6820
  recordPath: v.CLI_COMPAT_RECORD_PATH.trim() ? import_node_path12.default.resolve(v.CLI_COMPAT_RECORD_PATH) : null
6533
6821
  },
6534
6822
  doctorHttpTimeoutMs: v.DOCTOR_HTTP_TIMEOUT_MS,
@@ -6774,6 +7062,11 @@ var CONFIG_SNAPSHOT_ENV_KEYS = [
6774
7062
  "CLI_COMPAT_AUDIO_TRANSCRIBE_MODEL",
6775
7063
  "CLI_COMPAT_AUDIO_TRANSCRIBE_TIMEOUT_MS",
6776
7064
  "CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS",
7065
+ "CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES",
7066
+ "CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS",
7067
+ "CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES",
7068
+ "CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND",
7069
+ "CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS",
6777
7070
  "CLI_COMPAT_RECORD_PATH",
6778
7071
  "DOCTOR_HTTP_TIMEOUT_MS",
6779
7072
  "ADMIN_BIND_HOST",
@@ -6848,6 +7141,21 @@ var envSnapshotSchema = import_zod2.z.object({
6848
7141
  CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS: integerStringSchema("CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS", 1).default(
6849
7142
  "6000"
6850
7143
  ),
7144
+ CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES: integerStringSchema("CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES", 0, 10).default(
7145
+ "1"
7146
+ ),
7147
+ CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS: integerStringSchema(
7148
+ "CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS",
7149
+ 0
7150
+ ).default("800"),
7151
+ CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES: integerStringSchema("CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES", 1).default(
7152
+ "26214400"
7153
+ ),
7154
+ CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND: import_zod2.z.string().default(""),
7155
+ CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS: integerStringSchema(
7156
+ "CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS",
7157
+ 1
7158
+ ).default("180000"),
6851
7159
  CLI_COMPAT_RECORD_PATH: import_zod2.z.string(),
6852
7160
  DOCTOR_HTTP_TIMEOUT_MS: integerStringSchema("DOCTOR_HTTP_TIMEOUT_MS", 1),
6853
7161
  ADMIN_BIND_HOST: import_zod2.z.string(),
@@ -7042,6 +7350,11 @@ function buildSnapshotEnv(config) {
7042
7350
  CLI_COMPAT_AUDIO_TRANSCRIBE_MODEL: config.cliCompat.audioTranscribeModel,
7043
7351
  CLI_COMPAT_AUDIO_TRANSCRIBE_TIMEOUT_MS: String(config.cliCompat.audioTranscribeTimeoutMs),
7044
7352
  CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_CHARS: String(config.cliCompat.audioTranscribeMaxChars),
7353
+ CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_RETRIES: String(config.cliCompat.audioTranscribeMaxRetries),
7354
+ CLI_COMPAT_AUDIO_TRANSCRIBE_RETRY_DELAY_MS: String(config.cliCompat.audioTranscribeRetryDelayMs),
7355
+ CLI_COMPAT_AUDIO_TRANSCRIBE_MAX_BYTES: String(config.cliCompat.audioTranscribeMaxBytes),
7356
+ CLI_COMPAT_AUDIO_LOCAL_WHISPER_COMMAND: config.cliCompat.audioLocalWhisperCommand ?? "",
7357
+ CLI_COMPAT_AUDIO_LOCAL_WHISPER_TIMEOUT_MS: String(config.cliCompat.audioLocalWhisperTimeoutMs),
7045
7358
  CLI_COMPAT_RECORD_PATH: config.cliCompat.recordPath ?? "",
7046
7359
  DOCTOR_HTTP_TIMEOUT_MS: String(config.doctorHttpTimeoutMs),
7047
7360
  ADMIN_BIND_HOST: config.adminBindHost,
@@ -7196,11 +7509,11 @@ function jsonArrayStringSchema(key, allowEmpty) {
7196
7509
  }
7197
7510
 
7198
7511
  // src/preflight.ts
7199
- var import_node_child_process6 = require("child_process");
7512
+ var import_node_child_process7 = require("child_process");
7200
7513
  var import_node_fs10 = __toESM(require("fs"));
7201
7514
  var import_node_path14 = __toESM(require("path"));
7202
- var import_node_util4 = require("util");
7203
- var execFileAsync4 = (0, import_node_util4.promisify)(import_node_child_process6.execFile);
7515
+ var import_node_util5 = require("util");
7516
+ var execFileAsync4 = (0, import_node_util5.promisify)(import_node_child_process7.execFile);
7204
7517
  var REQUIRED_ENV_KEYS = ["MATRIX_HOMESERVER", "MATRIX_USER_ID", "MATRIX_ACCESS_TOKEN"];
7205
7518
  async function runStartupPreflight(options = {}) {
7206
7519
  const env = options.env ?? process.env;
@@ -7416,7 +7729,7 @@ configCommand.command("export").description("Export config snapshot as JSON").op
7416
7729
  const home = ensureRuntimeHomeOrExit();
7417
7730
  await runConfigExportCommand({ outputPath: options.output, cwd: home });
7418
7731
  } catch (error) {
7419
- process.stderr.write(`Config export failed: ${formatError4(error)}
7732
+ process.stderr.write(`Config export failed: ${formatError5(error)}
7420
7733
  `);
7421
7734
  process.exitCode = 1;
7422
7735
  }
@@ -7430,7 +7743,7 @@ configCommand.command("import").description("Import config snapshot from JSON").
7430
7743
  cwd: home
7431
7744
  });
7432
7745
  } catch (error) {
7433
- process.stderr.write(`Config import failed: ${formatError4(error)}
7746
+ process.stderr.write(`Config import failed: ${formatError5(error)}
7434
7747
  `);
7435
7748
  process.exitCode = 1;
7436
7749
  }
@@ -7449,7 +7762,7 @@ serviceCommand.command("install").description("Install and enable codeharbor sys
7449
7762
  startNow: options.start ?? true
7450
7763
  });
7451
7764
  } catch (error) {
7452
- process.stderr.write(`Service install failed: ${formatError4(error)}
7765
+ process.stderr.write(`Service install failed: ${formatError5(error)}
7453
7766
  `);
7454
7767
  process.stderr.write(
7455
7768
  [
@@ -7470,7 +7783,7 @@ serviceCommand.command("uninstall").description("Remove codeharbor systemd servi
7470
7783
  removeAdmin: options.withAdmin ?? false
7471
7784
  });
7472
7785
  } catch (error) {
7473
- process.stderr.write(`Service uninstall failed: ${formatError4(error)}
7786
+ process.stderr.write(`Service uninstall failed: ${formatError5(error)}
7474
7787
  `);
7475
7788
  process.stderr.write(
7476
7789
  [
@@ -7491,7 +7804,7 @@ serviceCommand.command("restart").description("Restart installed codeharbor syst
7491
7804
  restartAdmin: options.withAdmin ?? false
7492
7805
  });
7493
7806
  } catch (error) {
7494
- process.stderr.write(`Service restart failed: ${formatError4(error)}
7807
+ process.stderr.write(`Service restart failed: ${formatError5(error)}
7495
7808
  `);
7496
7809
  process.stderr.write(
7497
7810
  [
@@ -7594,7 +7907,7 @@ function maybeReexecServiceCommandWithSudo() {
7594
7907
  return;
7595
7908
  }
7596
7909
  const cliScriptPath = resolveCliScriptPath();
7597
- const child = (0, import_node_child_process7.spawnSync)("sudo", [process.execPath, cliScriptPath, ...serviceArgs], {
7910
+ const child = (0, import_node_child_process8.spawnSync)("sudo", [process.execPath, cliScriptPath, ...serviceArgs], {
7598
7911
  stdio: "inherit"
7599
7912
  });
7600
7913
  if (child.error) {
@@ -7611,7 +7924,7 @@ function shellQuote(value) {
7611
7924
  function buildExplicitSudoCommand(subcommand) {
7612
7925
  return `sudo ${shellQuote(process.execPath)} ${shellQuote(resolveCliScriptPath())} ${subcommand}`;
7613
7926
  }
7614
- function formatError4(error) {
7927
+ function formatError5(error) {
7615
7928
  if (error instanceof Error) {
7616
7929
  return error.message;
7617
7930
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codeharbor",
3
- "version": "0.1.20",
3
+ "version": "0.1.22",
4
4
  "description": "Instant-messaging bridge for Codex CLI sessions",
5
5
  "license": "MIT",
6
6
  "main": "dist/cli.js",
@@ -13,11 +13,13 @@
13
13
  "./package.json": "./package.json"
14
14
  },
15
15
  "bin": {
16
- "codeharbor": "dist/cli.js"
16
+ "codeharbor": "dist/cli.js",
17
+ "codeharbor-whisper-transcribe": "scripts/local-whisper-transcribe.py"
17
18
  },
18
19
  "files": [
19
20
  "dist",
20
21
  "scripts/postinstall-restart.cjs",
22
+ "scripts/local-whisper-transcribe.py",
21
23
  ".env.example",
22
24
  "README.md",
23
25
  "LICENSE"
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import argparse
4
+ import sys
5
+
6
+
7
+ def build_parser() -> argparse.ArgumentParser:
8
+ parser = argparse.ArgumentParser(description="Local audio transcription for CodeHarbor.")
9
+ parser.add_argument("--input", required=True, help="Path to input audio file.")
10
+ parser.add_argument("--model", default="small", help="Whisper model size/name.")
11
+ parser.add_argument("--device", default="auto", help="Execution device (auto/cpu/cuda).")
12
+ parser.add_argument(
13
+ "--compute-type",
14
+ default="int8",
15
+ help="faster-whisper compute type (int8/float16/float32).",
16
+ )
17
+ parser.add_argument("--language", default=None, help="Optional language hint (for example: zh).")
18
+ parser.add_argument("--beam-size", type=int, default=5, help="Beam size for decoding.")
19
+ return parser
20
+
21
+
22
+ def main() -> int:
23
+ parser = build_parser()
24
+ args = parser.parse_args()
25
+
26
+ try:
27
+ from faster_whisper import WhisperModel
28
+ except Exception as error: # pragma: no cover - env dependent
29
+ print(
30
+ "faster_whisper is required for local transcription. Install with: python3 -m pip install faster-whisper",
31
+ file=sys.stderr,
32
+ )
33
+ print(str(error), file=sys.stderr)
34
+ return 2
35
+
36
+ model = WhisperModel(args.model, device=args.device, compute_type=args.compute_type)
37
+ segments, _ = model.transcribe(
38
+ args.input,
39
+ language=args.language,
40
+ vad_filter=True,
41
+ beam_size=args.beam_size,
42
+ )
43
+ text = " ".join(segment.text.strip() for segment in segments if segment.text and segment.text.strip()).strip()
44
+ if not text:
45
+ return 3
46
+
47
+ print(text)
48
+ return 0
49
+
50
+
51
+ if __name__ == "__main__":
52
+ raise SystemExit(main())