arisa 2.3.55 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/AGENTS.md +102 -0
  2. package/README.md +120 -165
  3. package/cli/openai-transcribe/index.js +51 -0
  4. package/cli/openai-transcribe/package.json +6 -0
  5. package/cli/openai-transcribe/tool.manifest.json +15 -0
  6. package/cli/openai-tts/index.js +58 -0
  7. package/cli/openai-tts/package.json +6 -0
  8. package/cli/openai-tts/tool.manifest.json +20 -0
  9. package/cli/web-browser/index.js +146 -0
  10. package/cli/web-browser/package.json +6 -0
  11. package/cli/web-browser/tool.manifest.json +8 -0
  12. package/package.json +26 -44
  13. package/src/core/agent/agent-manager.js +218 -0
  14. package/src/core/artifacts/artifact-store.js +102 -0
  15. package/src/core/config/config-store.js +20 -0
  16. package/src/core/tools/tool-registry.js +117 -0
  17. package/src/index.js +27 -0
  18. package/src/runtime/bootstrap.js +213 -0
  19. package/src/runtime/create-app.js +22 -0
  20. package/src/transport/telegram/auth.js +13 -0
  21. package/src/transport/telegram/bot.js +214 -0
  22. package/src/transport/telegram/media.js +75 -0
  23. package/CLAUDE.md +0 -191
  24. package/SOUL.md +0 -36
  25. package/bin/arisa.js +0 -644
  26. package/scripts/dump-commands.ts +0 -26
  27. package/scripts/test-secrets.ts +0 -22
  28. package/src/core/attachments.ts +0 -104
  29. package/src/core/auth.ts +0 -58
  30. package/src/core/context.ts +0 -30
  31. package/src/core/file-detector.ts +0 -39
  32. package/src/core/format.ts +0 -159
  33. package/src/core/index.ts +0 -456
  34. package/src/core/intent.ts +0 -119
  35. package/src/core/media.ts +0 -144
  36. package/src/core/onboarding.ts +0 -102
  37. package/src/core/processor.ts +0 -305
  38. package/src/core/router.ts +0 -64
  39. package/src/core/scheduler.ts +0 -193
  40. package/src/daemon/agent-cli.ts +0 -130
  41. package/src/daemon/auto-install.ts +0 -158
  42. package/src/daemon/autofix.ts +0 -116
  43. package/src/daemon/bridge.ts +0 -166
  44. package/src/daemon/channels/base.ts +0 -10
  45. package/src/daemon/channels/telegram.ts +0 -306
  46. package/src/daemon/claude-login.ts +0 -218
  47. package/src/daemon/codex-login.ts +0 -172
  48. package/src/daemon/fallback.ts +0 -73
  49. package/src/daemon/index.ts +0 -272
  50. package/src/daemon/lifecycle.ts +0 -313
  51. package/src/daemon/setup.ts +0 -329
  52. package/src/shared/ai-cli.ts +0 -165
  53. package/src/shared/config.ts +0 -137
  54. package/src/shared/db.ts +0 -304
  55. package/src/shared/deepbase-secure.ts +0 -39
  56. package/src/shared/ink-shim.js +0 -14
  57. package/src/shared/logger.ts +0 -42
  58. package/src/shared/paths.ts +0 -90
  59. package/src/shared/ports.ts +0 -120
  60. package/src/shared/secrets.ts +0 -136
  61. package/src/shared/types.ts +0 -103
  62. package/tsconfig.json +0 -19
package/AGENTS.md ADDED
@@ -0,0 +1,102 @@
1
+ # Arisa AGENTS
2
+
3
+ ## Architecture
4
+ - `src/transport/telegram/*`: Telegram inbound and outbound transport.
5
+ - `src/core/agent/*`: Pi Agent sessions, one per authorized chat.
6
+ - `src/core/artifacts/*`: every incoming or generated message/file becomes an artifact.
7
+ - `src/core/tools/*`: CLI tool registry, help lookup, config writes, execution.
8
+ - `cli/*`: isolated tools. Each tool has `package.json`, `config.js`, `tool.manifest.json`, and `index.js`.
9
+
10
+ ## Main rule: everything is piped through artifacts
11
+ A pipe transforms one input artifact into one output artifact.
12
+ Examples:
13
+ - voice OGG -> transcript TXT
14
+ - text -> MP3 audio
15
+ - URL -> downloaded file -> derived file -> transcript
16
+
17
+ Each tool declares in `tool.manifest.json`:
18
+ - `input`: supported input types
19
+ - `output`: produced output types
20
+ - `configSchema`: required config fields
21
+
22
+ ## Conceptual pipe model
23
+ There are two different moments where pipes can happen:
24
+
25
+ 1. **Pre-reasoning normalization pipes**
26
+ - These happen before Pi Agent reasons.
27
+ - Their job is to convert raw inbound media into a form Pi Agent can reason about well.
28
+ - Example: incoming Telegram audio must be transcribed first.
29
+ - In that case, the transcript becomes the effective user message content for Pi Agent.
30
+ - Pi Agent should reason over the transcript, not treat the raw audio as the primary message.
31
+
32
+ 2. **Reasoned action pipes**
33
+ - These happen after Pi Agent starts reasoning.
34
+ - Pi Agent may decide to chain tools to achieve a user goal.
35
+ - Example: text -> TTS audio, or future multi-step workflows.
36
+
37
+ This distinction is critical. Not every pipe should be decided by Pi Agent at runtime. Some pipes are part of the transport/input normalization layer and must happen before reasoning.
38
+
39
+ ## Telegram inbound pipeline
40
+ Current conceptual behavior:
41
+ - text -> send directly to Pi Agent
42
+ - audio/voice -> transcribe first -> send transcript to Pi Agent
43
+ - image/document/other media -> keep as artifacts, and add normalization pipes when needed
44
+
45
+ If inbound media was normalized before reasoning, Pi Agent should use the normalized result as the actual message content.
46
+ For example, if a voice note was transcribed, Pi Agent should answer the meaning of the transcript, not simply return the raw transcript unless the user explicitly asked for transcription.
47
+
48
+ ## How to inspect CLI tools
49
+ Before using a tool, inspect its help:
50
+ - via the custom tool: `tool_help`
51
+ - or by running the CLI with `--help`
52
+
53
+ Every CLI must support:
54
+ - `node index.js --help`
55
+ - `node index.js run --request-file <json>`
56
+
57
+ ## Pipe behavior in V1
58
+ V1 does not have a full automatic planner yet. The agent should:
59
+ 1. understand whether the needed pipe belongs to pre-reasoning normalization or post-reasoning tool chaining
60
+ 2. use `list_tools`
61
+ 3. use `tool_help` when it needs operational details
62
+ 4. execute a tool with `run_tool`
63
+ 5. if another step is needed, use the returned `artifactId` as input for the next tool
64
+
65
+ Example manual pipe:
66
+ 1. `run_tool(openai-transcribe, artifact audio)`
67
+ 2. take the returned text `artifactId`
68
+ 3. `run_tool(openai-tts, artifact text)` or `send_audio_reply(text)`
69
+
70
+ ## Missing config flow
71
+ If `run_tool` returns `missingConfig`, the agent should:
72
+ 1. ask the user naturally in Telegram for the missing value
73
+ 2. write the value into `cli/<tool>/config.js` with `set_tool_config`
74
+ 3. retry the tool
75
+
76
+ Do not assume a rigid question/answer protocol. Continue the conversation naturally and infer the config value from the user reply when possible.
77
+
78
+ ## Telegram security
79
+ - The first chat that messages the bot is authorized if `telegram.maxChatIds` allows it.
80
+ - Do not authorize more chats than configured.
81
+ - Access control is based on chat ids, not usernames.
82
+
83
+ ## Tool creation
84
+ Do not assume specific future tools such as YouTube support exist.
85
+ If the user asks for a capability that is not currently available, first check whether an existing registered tool can satisfy the task.
86
+ If no existing tool can do it, the default attitude should be to propose creating a new CLI tool under `cli/<tool-name>` following the project conventions.
87
+ All newly created tools must document their help text, usage instructions, manifests, and user-facing operational strings in English.
88
+ Do not stop at "I cannot do that" when the task is realistically implementable through a new tool.
89
+ Prefer responses like:
90
+ - identify that no current tool satisfies the request
91
+ - state that the missing capability can be added
92
+ - propose or start creating the tool needed to fulfill the request
93
+
94
+ For example, if the user asks for live weather and no weather tool exists, the correct attitude is to propose building a weather tool for the bot rather than only saying real-time access is unavailable.
95
+
96
+ Consult the local skill for that workflow when building new tools.
97
+
98
+ ## Safety
99
+ - Do not install or run arbitrary tools outside registered `cli/*` manifests in V1.
100
+ - Prefer tool manifests and CLI help over assumptions.
101
+ - Keep tool configs inside `cli/<tool>/config.js`.
102
+ - Be proactive about extending capabilities, but do it through the project's tool architecture, not through ad hoc one-off behavior.
package/README.md CHANGED
@@ -1,215 +1,170 @@
1
- # What is Arisa
1
+ # Arisa
2
2
 
3
- Arisa is a Bun + TypeScript agent runtime with a two-process architecture: **Daemon** (stable channel I/O) and **Core** (message processing, media, scheduling, CLI routing). Telegram is one access channel, not the identity of the system.
3
+ Arisa is a modular Telegram assistant powered by Pi Agent.
4
4
 
5
- Inspired by the architecture of [`jlia0/tinyclaw`](https://github.com/jlia0/tinyclaw).
5
+ It is designed around a simple idea:
6
6
 
7
- Arisa is intentionally dynamic: the project grows as the user builds a relationship with it. Many capabilities are added live during real conversations (for example, Whisper support), so the system evolves through use instead of staying static.
7
+ - **Telegram is the human interface**
8
+ - **Pi Agent is the reasoning engine**
9
+ - **everything is an artifact**
10
+ - **capabilities live in isolated CLI tools**
11
+ - **tools can be chained through pipes**
8
12
 
9
- ## Security Notice
13
+ Arisa is meant to grow like Lego blocks. If a capability does not exist yet, the system should prefer adding a new tool instead of stopping at "I can't do that".
10
14
 
11
- Arisa can execute actions with operational control over the system where it runs. Before deploying it, make sure you understand and accept the associated security risks. It is strongly recommended to run Arisa in an isolated environment (for example, a Docker container or a dedicated VPS) that does not store sensitive information or critical assets.
15
+ ## Core concept
12
16
 
13
- ## Requirements and Installation
17
+ Arisa separates two different kinds of pipes:
14
18
 
15
- ```bash
16
- curl -fsSL https://bun.sh/install | bash # Install Bun https://bun.sh
17
- bun add -g arisa # Install Arisa CLI
19
+ 1. **Pre-reasoning normalization pipes**
20
+ - These happen before Pi Agent reasons.
21
+ - Example: a Telegram voice message is transcribed first.
22
+ - Pi Agent then reasons over the transcript, not over the raw audio.
18
23
 
19
- # @anthropic-ai/claude-code and @openai/codex (auto-installed if missing)
20
- ```
24
+ 2. **Reasoned action pipes**
25
+ - These happen after Pi Agent starts reasoning.
26
+ - Example: text -> TTS audio.
27
+ - Future tools can form larger chains.
21
28
 
22
- ## Commands
29
+ This distinction is important. Some transformations belong to the transport/input layer, not to the agent's runtime decision making.
23
30
 
24
- ```bash
25
- arisa # Foreground daemon mode (Ctrl+C to stop)
26
- arisa start # Start as service (enables autostart with systemd --user)
27
- arisa stop # Stop service
28
- arisa status # Service status
29
- arisa restart # Restart service
30
- arisa daemon # Foreground daemon mode (manual/dev)
31
- arisa core # Foreground core-only mode
32
- arisa dev # Foreground core watch mode
33
- ```
31
+ ## Current behavior
34
32
 
33
+ ### Telegram input
34
+ - text messages go directly to Pi Agent
35
+ - audio/voice messages are transcribed first, then passed to Pi Agent as text
36
+ - media is stored as artifacts
35
37
 
36
- On Linux with `systemd --user`, `arisa start` enables auto-start on reboot. To keep it running even without an active login session:
38
+ ### Tool model
39
+ Each tool lives in its own folder under `cli/<tool-name>` and contains:
37
40
 
38
- ```bash
39
- sudo loginctl enable-linger "$USER"
40
- ```
41
+ - `package.json`
42
+ - `config.js`
43
+ - `tool.manifest.json`
44
+ - `index.js`
41
45
 
42
- ## Architecture: Daemon + Core
46
+ Each tool is isolated from the root project and from other tools.
47
+ That isolation is part of the architecture:
43
48
 
44
- ```
45
- Daemon (:51778) Core (:51777)
46
- ├── Telegram adapter (grammy) ├── HTTP server /message, /health
47
- ├── HTTP server /send (for scheduler) ├── Claude CLI with model routing
48
- ├── Bridge: HTTP client to Core ├── Media: voice (Whisper), vision, speech (ElevenLabs)
49
- ├── Lifecycle: spawn Core --watch ├── Scheduler (croner)
50
- └── In-memory queue if Core is down ├── Format: HTML + chunking
51
- └── File detection in responses
52
- ```
53
-
54
- **Message flow:**
55
- 1. Telegram → Daemon receives message (text/voice/photo)
56
- 2. Daemon → POST Core:51777/message (media as base64)
57
- 3. Core processes media → routes model → calls `claude CLI` → formats response
58
- 4. Core returns response → Daemon sends to Telegram
49
+ - each tool has its own folder
50
+ - each tool keeps its own `config.js`
51
+ - each tool can have its own dependencies
52
+ - one tool can be changed or replaced without tightly coupling the rest of the system
59
53
 
60
- **Scheduler flow:**
61
- Scheduled task fires → Core POSTs to Daemon:51778/send → Telegram
54
+ Each tool must support:
62
55
 
63
- ### Principle of separation
56
+ ```bash
57
+ node index.js --help
58
+ node index.js run --request-file <json>
59
+ ```
64
60
 
65
- - **Daemon** = Channel I/O only. Receives/sends messages. Never processes content. Stable process that never needs restarting.
66
- - **Core** = Everything else. Media processing, Claude CLI, formatting, scheduling. Runs with `bun --watch` for hot-reload when code changes.
61
+ ### Configuration model
62
+ - Telegram runtime config is stored in `data/state/config.json`
63
+ - tool-specific secrets/config live in `cli/<tool>/config.js`
64
+ - Pi authentication can use either:
65
+ - an API key entered during bootstrap
66
+ - or Pi's existing OAuth login when supported, such as `openai-codex`
67
67
 
68
- ## File Structure
68
+ ## Install globally
69
69
 
70
- ```
71
- src/
72
- ├── daemon/
73
- │ ├── index.ts # Entry: channel + HTTP server + spawn Core
74
- │ ├── channels/
75
- │ │ ├── base.ts # Re-exports Channel interface
76
- │ │ └── telegram.ts # Telegram adapter (grammy)
77
- │ ├── bridge.ts # HTTP client to Core with retry + in-memory queue
78
- │ └── lifecycle.ts # Spawn Core with --watch, auto-restart
79
-
80
- ├── core/
81
- │ ├── index.ts # HTTP server with /message and /health endpoints
82
- │ ├── processor.ts # Executes claude CLI with model routing
83
- │ ├── router.ts # Selects model (haiku/sonnet/opus) by message pattern
84
- │ ├── media.ts # Voice transcription (Whisper), image analysis (Vision), speech synthesis (ElevenLabs)
85
- │ ├── scheduler.ts # Cron + one-time tasks with croner, persists via deepbase
86
- │ ├── format.ts # Telegram chunking (4096 char limit)
87
- │ ├── file-detector.ts # Detect file paths in responses for auto-sending
88
- │ └── context.ts # Manage -c flag and reset_flag
89
-
90
- └── shared/
91
- ├── types.ts # All shared interfaces
92
- ├── config.ts # Env vars, ports, paths
93
- ├── logger.ts # Logger → .arisa/logs/
94
- └── db.ts # Unified persistence layer (deepbase)
70
+ ```bash
71
+ npm install -g arisa
95
72
  ```
96
73
 
97
- ## Model Routing
74
+ Then run:
98
75
 
99
- The router (`src/core/router.ts`) selects Claude models based on message patterns:
100
- - **Haiku**: Reminders, acknowledgments, simple yes/no
101
- - **Sonnet** (default): General conversation, queries
102
- - **Opus**: Code changes, debugging, complex multi-step tasks
76
+ ```bash
77
+ arisa
78
+ ```
103
79
 
104
- ## Bot Commands
80
+ ## Bootstrap flow
105
81
 
106
- Available Telegram bot commands:
107
- - `/reset` — Clear conversation history and start fresh
108
- - `/cancel` — Cancel all scheduled tasks for this chat
109
- - `/claude` — Switch to Claude backend (default)
110
- - `/codex` — Switch to Codex backend
111
- - `/speak <text>` — Generate speech from text using ElevenLabs (requires ELEVENLABS_API_KEY)
82
+ On first run, Arisa will:
112
83
 
113
- ## Adding a New Channel
84
+ 1. ask for a Telegram bot token
85
+ 2. ask for the maximum number of authorized chat ids
86
+ 3. show a list of Pi models
87
+ 4. resolve authentication for the selected Pi provider
88
+ 5. validate that Pi Agent works
89
+ 6. only then start listening to Telegram
114
90
 
115
- Implement the `Channel` interface from `src/shared/types.ts` and register it in `src/daemon/index.ts`. The interface requires: `connect()`, `onMessage()`, `send()`, `sendFile()`.
91
+ Telegram bot tokens can be created with:
116
92
 
117
- ## Hooks
93
+ - https://t.me/BotFather
118
94
 
119
- Configured in `.claude/settings.json`:
120
- - **SessionStart**: Runs `session-start.sh` — outputs Arisa context reminder
121
- - **PostToolUse** (async): Runs `log-activity.sh` — logs tool usage to `.arisa/logs/activity.log`
95
+ ## Using Pi authentication
122
96
 
123
- ## Runtime Data
97
+ For providers with internal Pi login support, such as Codex, leaving the API key empty during bootstrap will start the internal login flow automatically if no existing auth is found.
124
98
 
125
- All runtime data lives under `~/.arisa/` (with automatic migration from legacy project-local `.tinyclaw/` or `.arisa/`):
126
- - `logs/` — per-component log files (core, daemon, telegram, scheduler)
127
- - `db/arisa.json` — unified persistence with deepbase
128
- - `attachments/` — saved media files organized by `{chatId}/`
129
- - `.env` — TELEGRAM_BOT_TOKEN, OPENAI_API_KEY, ELEVENLABS_API_KEY
130
- - `voice_temp/` — temporary directory for voice transcription
131
- - `reset_flag` — conversation reset marker
99
+ For example, selecting:
132
100
 
133
- ### Persistence with DeepBase
101
+ - `openai-codex/gpt-5.4`
134
102
 
135
- All persistent data is managed by **deepbase** (`src/shared/db.ts`). Location: `~/.arisa/db/arisa.json`.
103
+ allows Arisa to authenticate through Pi's Codex OAuth flow instead of requiring a normal OpenAI API key.
136
104
 
137
- | Collection | Key | Value type | Description |
138
- |-----------------|---------------|--------------------|------------------------------------------|
139
- | `tasks` | `task.id` | `ScheduledTask` | Cron and one-time scheduled tasks |
140
- | `authorized` | `chatId` | `{ userId }` | Authorized Telegram chats |
141
- | `onboarded` | `chatId` | `{ userId }` | Chats that completed onboarding |
142
- | `queue` | `message.id` | queue message | In-memory queue overflow (Daemon→Core) |
143
- | `attachments` | `chatId_file` | `AttachmentRecord` | Metadata for saved media (files on disk) |
144
- | `messages` | `chatId_msgId`| `MessageRecord` | Message ledger for reply context |
145
- | `settings` | key name | `{ value }` | App settings (auth_token, etc.) |
105
+ ## Running model
146
106
 
147
- - **API**: `db.get(collection, key)`, `db.set(collection, key, data)`, `db.del(collection, key)`
148
- - **Helper functions**: `src/shared/db.ts` provides type-safe wrappers per collection
107
+ Arisa keeps one Pi session per authorized Telegram chat.
149
108
 
150
- ## Response Formatting
109
+ If a message arrives while Pi Agent is still processing another one:
151
110
 
152
- Telegram responses are sent with `parse_mode: 'HTML'`. When composing responses that will be sent through Telegram, use HTML formatting instead of Markdown. For example, use `<b>bold</b>` instead of `**bold**`, `<code>inline code</code>` instead of backticks, and `<pre>code block</pre>` instead of triple backticks.
111
+ - the current message keeps running
112
+ - the new message is appended to a queued buffer
113
+ - additional incoming messages are concatenated to that same buffer
114
+ - once the current processing finishes, the buffered messages are sent together as the next prompt
153
115
 
154
- ## Workflow Orchestration
116
+ Conceptually:
155
117
 
156
- ### 1. Plan Mode (On Request Only)
157
- - Do NOT enter plan mode automatically — only when the user explicitly asks for it
158
- - If something goes sideways, STOP and re-assess, but don't force plan mode
159
- - When user requests planning: write detailed specs upfront to reduce ambiguity
118
+ ```txt
119
+ message 1 is processing
120
+ message 2 arrives -> queued
121
+ message 3 arrives -> appended to queued
122
+ message 1 finishes
123
+ queued batch is processed next
124
+ ```
160
125
 
161
- ### 2. Subagent Strategy to keep main context window clean
162
- - Offload research, exploration, and parallel analysis to subagents
163
- - For complex problems, throw more compute at it via subagents
164
- - One task per subagent for focused execution
126
+ ## Project structure
165
127
 
166
- ### 3. Self-Improvement Loop
167
- - After ANY correction from the user: update 'tasks/lessons.md' with the pattern
168
- - Write rules for yourself that prevent the same mistake
169
- - Ruthlessly iterate on these lessons until mistake rate drops
170
- - Review lessons at session start for relevant project
128
+ ```txt
129
+ src/
130
+ runtime/ bootstrap + app startup
131
+ transport/ Telegram integration
132
+ core/ agent, tools, artifacts, config
133
+ cli/
134
+ openai-transcribe/
135
+ openai-tts/
136
+ data/
137
+ state/
138
+ artifacts/
139
+ chats/
140
+ ```
171
141
 
172
- ### 4. Verification Before Done
173
- - Never mark a task complete without proving it works
174
- - Diff behavior between main and your changes when relevant
175
- - Ask yourself: "Would a staff engineer approve this?"
176
- - Run tests, check logs, demonstrate correctness
142
+ ## Philosophy
177
143
 
178
- ### 5. Demand Elegance (Balanced)
179
- - For non-trivial changes: pause and ask "is there a more elegant way?"
180
- - If a fix feels hacky: "Knowing everything I know now, implement the elegant solution"
181
- - Skip this for simple, obvious fixes - don't over-engineer
182
- - Challenge your own work before presenting it
144
+ Arisa should not default to passive answers like "I can't do that" when a missing capability can realistically be implemented as a new tool.
183
145
 
184
- ### 6. Autonomous Bug Fixing
185
- - When given a bug report: just fix it. Don't ask for hand-holding
186
- - Point at logs, errors, failing tests -> then resolve them
187
- - Zero context switching required from the user
188
- - Go fix failing CI tests without being told how
146
+ The preferred behavior is:
189
147
 
190
- ## Task Management
191
- 1. **Plan First**: Write plan to 'tasks/todo.md' with checkable items
192
- 2. **Verify Plan**: Check in before starting implementation
193
- 3. **Track Progress**: Mark items complete as you go
194
- 4. **Explain Changes**: High-level summary at each step
195
- 5. **Document Results**: Add review to 'tasks/todo.md'
196
- 6. **Capture Lessons**: Update 'tasks/lessons.md' after corrections
148
+ 1. check whether an existing tool can solve the task
149
+ 2. if not, propose creating the missing tool
150
+ 3. keep the solution inside the tool architecture
197
151
 
198
- ## Voice Messages (ElevenLabs)
152
+ ## Notes
199
153
 
200
- When you want to send a voice message to the user, wrap the spoken text in `[VOICE]...[/VOICE]` tags:
154
+ - `AGENTS.md` defines the project-level behavioral rules for Pi Agent
155
+ - `src/transport/telegram/bot.js` builds the per-message runtime prompt
156
+ - tool help is part of the architecture and should be consulted before use when details are unclear
201
157
 
202
- ```
203
- [VOICE]Hello, this will be converted to audio[/VOICE]
204
- ```
158
+ ## Status
159
+
160
+ This is currently a functional V1 focused on:
205
161
 
206
- - The text inside `[VOICE]` gets synthesized via ElevenLabs and sent as a Telegram voice message
207
- - The `[VOICE]` tags are stripped from the text response — only the audio is sent
208
- - Use it when the user asks you to "hablame", "mandame un audio", "decime con voz", etc.
209
- - Keep voice texts concise — long texts cost more and take longer to generate
210
- - You can combine voice with text: write a text response AND include a `[VOICE]` block
162
+ - Telegram transport
163
+ - Pi Agent integration
164
+ - artifact-based message handling
165
+ - isolated CLI tools
166
+ - audio transcription before reasoning
167
+ - text-to-speech replies
168
+ - queued follow-up message batching
211
169
 
212
- ## Core Principles
213
- - **Simplicity First**: Make every change as simple as possible. Impact minimal code.
214
- - **No Laziness**: Find root causes. No temporary fixes. Senior developer standards.
215
- - **Minimal Impact**: Changes should only touch what's necessary. Avoid introducing bugs.
170
+ Future capabilities should be added as new tools and pipes, not as tightly coupled one-off code paths.
@@ -0,0 +1,51 @@
1
+ import { readFile, stat } from "node:fs/promises";
2
+ import path from "node:path";
3
+ import config from "./config.js";
4
+
5
+ function printHelp() {
6
+ console.log(`openai-transcribe\n\nUso:\n node index.js --help\n node index.js run --request-file <json>\n\nInput esperado:\n {\n \"artifact\": { \"path\": \"/abs/audio.ogg\", \"mimeType\": \"audio/ogg\" },\n \"args\": {}\n }\n\nConfig en cli/openai-transcribe/config.js:\n OPENAI_API_KEY\n MODEL\n`);
7
+ }
8
+
9
+ async function run(requestFile) {
10
+ if (!config.OPENAI_API_KEY) {
11
+ console.log(JSON.stringify({ ok: false, missingConfig: ["OPENAI_API_KEY"], configPath: path.resolve("config.js") }));
12
+ return;
13
+ }
14
+
15
+ const request = JSON.parse(await readFile(requestFile, "utf8"));
16
+ const artifact = request.artifact;
17
+ if (!artifact?.path) {
18
+ console.log(JSON.stringify({ ok: false, error: "artifact.path is required" }));
19
+ return;
20
+ }
21
+
22
+ await stat(artifact.path);
23
+ const form = new FormData();
24
+ const data = await readFile(artifact.path);
25
+ form.append("file", new Blob([data]), path.basename(artifact.path));
26
+ form.append("model", config.MODEL);
27
+
28
+ const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
29
+ method: "POST",
30
+ headers: { Authorization: `Bearer ${config.OPENAI_API_KEY}` },
31
+ body: form
32
+ });
33
+
34
+ const payload = await response.json();
35
+ if (!response.ok) {
36
+ console.log(JSON.stringify({ ok: false, error: payload.error?.message || "OpenAI transcription failed" }));
37
+ return;
38
+ }
39
+
40
+ console.log(JSON.stringify({ ok: true, output: { text: payload.text || "" } }));
41
+ }
42
+
43
+ const args = process.argv.slice(2);
44
+ if (!args.length || args.includes("--help") || args[0] === "help") {
45
+ printHelp();
46
+ } else if (args[0] === "run") {
47
+ const fileIndex = args.indexOf("--request-file");
48
+ await run(args[fileIndex + 1]);
49
+ } else {
50
+ printHelp();
51
+ }
@@ -0,0 +1,6 @@
1
+ {
2
+ "name": "openai-transcribe-cli",
3
+ "private": true,
4
+ "type": "module",
5
+ "version": "1.0.0"
6
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "openai-transcribe",
3
+ "description": "Transcribe audio files with OpenAI audio transcription API.",
4
+ "entry": "index.js",
5
+ "input": ["audio/ogg", "audio/mpeg", "audio/wav", "audio/mp4"],
6
+ "output": ["text/plain"],
7
+ "configSchema": {
8
+ "OPENAI_API_KEY": {
9
+ "type": "string",
10
+ "required": true,
11
+ "secret": true,
12
+ "prompt": "Necesito tu OPENAI_API_KEY para transcribir audio."
13
+ }
14
+ }
15
+ }
@@ -0,0 +1,58 @@
1
import { mkdir, readFile, writeFile } from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import config from "./config.js";

// Resolve paths relative to this tool's directory, not the process cwd,
// so config reporting and the out/ directory are stable regardless of launch dir.
const TOOL_DIR = path.dirname(fileURLToPath(import.meta.url));

// Print CLI usage. Per AGENTS.md, all user-facing tool strings are in English.
function printHelp() {
  console.log(`openai-tts

Usage:
  node index.js --help
  node index.js run --request-file <json>

Expected input:
  {
    "text": "hello",
    "artifact": { "text": "hello" },
    "args": { "voice": "alloy" }
  }

Config in cli/openai-tts/config.js:
  OPENAI_API_KEY
  MODEL
  VOICE
`);
}

// Synthesize MP3 speech from the text in the request file.
// Always emits a single JSON line on stdout: { ok: true, output } or { ok: false, ... }.
async function run(requestFile) {
  try {
    if (!config.OPENAI_API_KEY) {
      console.log(JSON.stringify({
        ok: false,
        missingConfig: ["OPENAI_API_KEY"],
        configPath: path.join(TOOL_DIR, "config.js")
      }));
      return;
    }

    const request = JSON.parse(await readFile(requestFile, "utf8"));
    const inputText = request.text || request.artifact?.text;
    if (!inputText) {
      console.log(JSON.stringify({ ok: false, error: "text or artifact.text is required" }));
      return;
    }

    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${config.OPENAI_API_KEY}`,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        model: config.MODEL,
        voice: request.args?.voice || config.VOICE,
        input: inputText,
        // The OpenAI speech API parameter is `response_format`; `format` is ignored.
        response_format: "mp3"
      })
    });

    if (!response.ok) {
      const payload = await response.text();
      console.log(JSON.stringify({ ok: false, error: payload }));
      return;
    }

    const outDir = path.join(TOOL_DIR, "out");
    await mkdir(outDir, { recursive: true });
    const filePath = path.join(outDir, `speech-${Date.now()}.mp3`);
    const buffer = Buffer.from(await response.arrayBuffer());
    await writeFile(filePath, buffer);
    console.log(JSON.stringify({
      ok: true,
      output: { filePath, fileName: path.basename(filePath), mimeType: "audio/mpeg", kind: "audio" }
    }));
  } catch (err) {
    // Keep the machine-readable JSON contract even on unexpected failures
    // (bad request file, network errors, disk errors) instead of crashing.
    console.log(JSON.stringify({ ok: false, error: err?.message || String(err) }));
  }
}

const args = process.argv.slice(2);
if (!args.length || args.includes("--help") || args[0] === "help") {
  printHelp();
} else if (args[0] === "run") {
  const fileIndex = args.indexOf("--request-file");
  // Guard against a missing flag: indexOf -1 would otherwise make args[0] the "file".
  if (fileIndex === -1 || !args[fileIndex + 1]) {
    console.log(JSON.stringify({ ok: false, error: "--request-file <json> is required" }));
  } else {
    await run(args[fileIndex + 1]);
  }
} else {
  printHelp();
}
@@ -0,0 +1,6 @@
1
+ {
2
+ "name": "openai-tts-cli",
3
+ "private": true,
4
+ "type": "module",
5
+ "version": "1.0.0"
6
+ }
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "openai-tts",
3
+ "description": "Convert text into MP3 audio using OpenAI speech API.",
4
+ "entry": "index.js",
5
+ "input": ["text/plain"],
6
+ "output": ["audio/mpeg"],
7
+ "configSchema": {
8
+ "OPENAI_API_KEY": {
9
+ "type": "string",
10
+ "required": true,
11
+ "secret": true,
12
+ "prompt": "Necesito tu OPENAI_API_KEY para generar audio."
13
+ },
14
+ "VOICE": {
15
+ "type": "string",
16
+ "required": false,
17
+ "prompt": "Voz a usar, por ejemplo alloy."
18
+ }
19
+ }
20
+ }