arisa 2.3.55 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/AGENTS.md +102 -0
  2. package/README.md +120 -165
  3. package/cli/openai-transcribe/index.js +51 -0
  4. package/cli/openai-transcribe/package.json +6 -0
  5. package/cli/openai-transcribe/tool.manifest.json +15 -0
  6. package/cli/openai-tts/index.js +58 -0
  7. package/cli/openai-tts/package.json +6 -0
  8. package/cli/openai-tts/tool.manifest.json +20 -0
  9. package/cli/web-browser/index.js +146 -0
  10. package/cli/web-browser/package.json +6 -0
  11. package/cli/web-browser/tool.manifest.json +8 -0
  12. package/package.json +26 -44
  13. package/src/core/agent/agent-manager.js +218 -0
  14. package/src/core/artifacts/artifact-store.js +102 -0
  15. package/src/core/config/config-store.js +20 -0
  16. package/src/core/tools/tool-registry.js +117 -0
  17. package/src/index.js +27 -0
  18. package/src/runtime/bootstrap.js +213 -0
  19. package/src/runtime/create-app.js +22 -0
  20. package/src/transport/telegram/auth.js +13 -0
  21. package/src/transport/telegram/bot.js +214 -0
  22. package/src/transport/telegram/media.js +75 -0
  23. package/CLAUDE.md +0 -191
  24. package/SOUL.md +0 -36
  25. package/bin/arisa.js +0 -644
  26. package/scripts/dump-commands.ts +0 -26
  27. package/scripts/test-secrets.ts +0 -22
  28. package/src/core/attachments.ts +0 -104
  29. package/src/core/auth.ts +0 -58
  30. package/src/core/context.ts +0 -30
  31. package/src/core/file-detector.ts +0 -39
  32. package/src/core/format.ts +0 -159
  33. package/src/core/index.ts +0 -456
  34. package/src/core/intent.ts +0 -119
  35. package/src/core/media.ts +0 -144
  36. package/src/core/onboarding.ts +0 -102
  37. package/src/core/processor.ts +0 -305
  38. package/src/core/router.ts +0 -64
  39. package/src/core/scheduler.ts +0 -193
  40. package/src/daemon/agent-cli.ts +0 -130
  41. package/src/daemon/auto-install.ts +0 -158
  42. package/src/daemon/autofix.ts +0 -116
  43. package/src/daemon/bridge.ts +0 -166
  44. package/src/daemon/channels/base.ts +0 -10
  45. package/src/daemon/channels/telegram.ts +0 -306
  46. package/src/daemon/claude-login.ts +0 -218
  47. package/src/daemon/codex-login.ts +0 -172
  48. package/src/daemon/fallback.ts +0 -73
  49. package/src/daemon/index.ts +0 -272
  50. package/src/daemon/lifecycle.ts +0 -313
  51. package/src/daemon/setup.ts +0 -329
  52. package/src/shared/ai-cli.ts +0 -165
  53. package/src/shared/config.ts +0 -137
  54. package/src/shared/db.ts +0 -304
  55. package/src/shared/deepbase-secure.ts +0 -39
  56. package/src/shared/ink-shim.js +0 -14
  57. package/src/shared/logger.ts +0 -42
  58. package/src/shared/paths.ts +0 -90
  59. package/src/shared/ports.ts +0 -120
  60. package/src/shared/secrets.ts +0 -136
  61. package/src/shared/types.ts +0 -103
  62. package/tsconfig.json +0 -19
package/AGENTS.md ADDED
@@ -0,0 +1,102 @@
1
+ # Arisa AGENTS
2
+
3
+ ## Architecture
4
+ - `src/transport/telegram/*`: Telegram inbound and outbound transport.
5
+ - `src/core/agent/*`: Pi Agent sessions, one per authorized chat.
6
+ - `src/core/artifacts/*`: every incoming or generated message/file becomes an artifact.
7
+ - `src/core/tools/*`: CLI tool registry, help lookup, config writes, execution.
8
+ - `cli/*`: isolated tools. Each tool has `package.json`, `config.js`, `tool.manifest.json`, and `index.js`.
9
+
10
+ ## Main rule: everything is piped through artifacts
11
+ A pipe transforms one input artifact into one output artifact.
12
+ Examples:
13
+ - voice OGG -> transcript TXT
14
+ - text -> MP3 audio
15
+ - URL -> downloaded file -> derived file -> transcript
16
+
17
+ Each tool declares in `tool.manifest.json`:
18
+ - `input`: supported input types
19
+ - `output`: produced output types
20
+ - `configSchema`: required config fields
21
+
22
+ ## Conceptual pipe model
23
+ There are two different moments where pipes can happen:
24
+
25
+ 1. **Pre-reasoning normalization pipes**
26
+ - These happen before Pi Agent reasons.
27
+ - Their job is to convert raw inbound media into a form Pi Agent can reason about well.
28
+ - Example: incoming Telegram audio must be transcribed first.
29
+ - In that case, the transcript becomes the effective user message content for Pi Agent.
30
+ - Pi Agent should reason over the transcript, not treat the raw audio as the primary message.
31
+
32
+ 2. **Reasoned action pipes**
33
+ - These happen after Pi Agent starts reasoning.
34
+ - Pi Agent may decide to chain tools to achieve a user goal.
35
+ - Example: text -> TTS audio, or future multi-step workflows.
36
+
37
+ This distinction is critical. Not every pipe should be decided by Pi Agent at runtime. Some pipes are part of the transport/input normalization layer and must happen before reasoning.
38
+
39
+ ## Telegram inbound pipeline
40
+ Current conceptual behavior:
41
+ - text -> send directly to Pi Agent
42
+ - audio/voice -> transcribe first -> send transcript to Pi Agent
43
+ - image/document/other media -> keep as artifacts, and add normalization pipes when needed
44
+
45
+ If inbound media was normalized before reasoning, Pi Agent should use the normalized result as the actual message content.
46
+ For example, if a voice note was transcribed, Pi Agent should answer the meaning of the transcript, not simply return the raw transcript unless the user explicitly asked for transcription.
47
+
48
+ ## How to inspect CLI tools
49
+ Before using a tool, inspect its help:
50
+ - via the custom tool: `tool_help`
51
+ - or by running the CLI with `--help`
52
+
53
+ Every CLI must support:
54
+ - `node index.js --help`
55
+ - `node index.js run --request-file <json>`
56
+
57
+ ## Pipe behavior in V1
58
+ V1 does not have a full automatic planner yet. The agent should:
59
+ 1. understand whether the needed pipe belongs to pre-reasoning normalization or post-reasoning tool chaining
60
+ 2. use `list_tools`
61
+ 3. use `tool_help` when it needs operational details
62
+ 4. execute a tool with `run_tool`
63
+ 5. if another step is needed, use the returned `artifactId` as input for the next tool
64
+
65
+ Example manual pipe:
66
+ 1. `run_tool(openai-transcribe, artifact audio)`
67
+ 2. take the returned text `artifactId`
68
+ 3. `run_tool(openai-tts, artifact text)` or `send_audio_reply(text)`
69
+
70
+ ## Missing config flow
71
+ If `run_tool` returns `missingConfig`, the agent should:
72
+ 1. ask the user naturally in Telegram for the missing value
73
+ 2. write the value into `cli/<tool>/config.js` with `set_tool_config`
74
+ 3. retry the tool
75
+
76
+ Do not assume a rigid question/answer protocol. Continue the conversation naturally and infer the config value from the user reply when possible.
77
+
78
+ ## Telegram security
79
+ - The first chat that messages the bot is authorized if `telegram.maxChatIds` allows it.
80
+ - Do not authorize more chats than configured.
81
+ - Access control is based on chat ids, not usernames.
82
+
83
+ ## Tool creation
84
+ Do not assume specific future tools such as YouTube support exist.
85
+ If the user asks for a capability that is not currently available, first check whether an existing registered tool can satisfy the task.
86
+ If no existing tool can do it, the default attitude should be to propose creating a new CLI tool under `cli/<tool-name>` following the project conventions.
87
+ All newly created tools must document their help text, usage instructions, manifests, and user-facing operational strings in English.
88
+ Do not stop at "I cannot do that" when the task is realistically implementable through a new tool.
89
+ Prefer responses like:
90
+ - identify that no current tool satisfies the request
91
+ - state that the missing capability can be added
92
+ - propose or start creating the tool needed to fulfill the request
93
+
94
+ For example, if the user asks for live weather and no weather tool exists, the correct attitude is to propose building a weather tool for the bot rather than only saying real-time access is unavailable.
95
+
96
+ Consult the local skill for that workflow when building new tools.
97
+
98
+ ## Safety
99
+ - Do not install or run arbitrary tools outside registered `cli/*` manifests in V1.
100
+ - Prefer tool manifests and CLI help over assumptions.
101
+ - Keep tool configs inside `cli/<tool>/config.js`.
102
+ - Be proactive about extending capabilities, but do it through the project's tool architecture, not through ad hoc one-off behavior.
package/README.md CHANGED
@@ -1,215 +1,170 @@
1
- # What is Arisa
1
+ # Arisa
2
2
 
3
- Arisa is a Bun + TypeScript agent runtime with a two-process architecture: **Daemon** (stable channel I/O) and **Core** (message processing, media, scheduling, CLI routing). Telegram is one access channel, not the identity of the system.
3
+ Arisa is a modular Telegram assistant powered by Pi Agent.
4
4
 
5
- Inspired by the architecture of [`jlia0/tinyclaw`](https://github.com/jlia0/tinyclaw).
5
+ It is designed around a simple idea:
6
6
 
7
- Arisa is intentionally dynamic: the project grows as the user builds a relationship with it. Many capabilities are added live during real conversations (for example, Whisper support), so the system evolves through use instead of staying static.
7
+ - **Telegram is the human interface**
8
+ - **Pi Agent is the reasoning engine**
9
+ - **everything is an artifact**
10
+ - **capabilities live in isolated CLI tools**
11
+ - **tools can be chained through pipes**
8
12
 
9
- ## Security Notice
13
+ Arisa is meant to grow like Lego blocks. If a capability does not exist yet, the system should prefer adding a new tool instead of stopping at "I can't do that".
10
14
 
11
- Arisa can execute actions with operational control over the system where it runs. Before deploying it, make sure you understand and accept the associated security risks. It is strongly recommended to run Arisa in an isolated environment (for example, a Docker container or a dedicated VPS) that does not store sensitive information or critical assets.
15
+ ## Core concept
12
16
 
13
- ## Requirements and Installation
17
+ Arisa separates two different kinds of pipes:
14
18
 
15
- ```bash
16
- curl -fsSL https://bun.sh/install | bash # Install Bun https://bun.sh
17
- bun add -g arisa # Install Arisa CLI
19
+ 1. **Pre-reasoning normalization pipes**
20
+ - These happen before Pi Agent reasons.
21
+ - Example: a Telegram voice message is transcribed first.
22
+ - Pi Agent then reasons over the transcript, not over the raw audio.
18
23
 
19
- # @anthropic-ai/claude-code and @openai/codex (auto-installed if missing)
20
- ```
24
+ 2. **Reasoned action pipes**
25
+ - These happen after Pi Agent starts reasoning.
26
+ - Example: text -> TTS audio.
27
+ - Future tools can form larger chains.
21
28
 
22
- ## Commands
29
+ This distinction is important. Some transformations belong to the transport/input layer, not to the agent's runtime decision making.
23
30
 
24
- ```bash
25
- arisa # Foreground daemon mode (Ctrl+C to stop)
26
- arisa start # Start as service (enables autostart with systemd --user)
27
- arisa stop # Stop service
28
- arisa status # Service status
29
- arisa restart # Restart service
30
- arisa daemon # Foreground daemon mode (manual/dev)
31
- arisa core # Foreground core-only mode
32
- arisa dev # Foreground core watch mode
33
- ```
31
+ ## Current behavior
34
32
 
33
+ ### Telegram input
34
+ - text messages go directly to Pi Agent
35
+ - audio/voice messages are transcribed first, then passed to Pi Agent as text
36
+ - media is stored as artifacts
35
37
 
36
- On Linux with `systemd --user`, `arisa start` enables auto-start on reboot. To keep it running even without an active login session:
38
+ ### Tool model
39
+ Each tool lives in its own folder under `cli/<tool-name>` and contains:
37
40
 
38
- ```bash
39
- sudo loginctl enable-linger "$USER"
40
- ```
41
+ - `package.json`
42
+ - `config.js`
43
+ - `tool.manifest.json`
44
+ - `index.js`
41
45
 
42
- ## Architecture: Daemon + Core
46
+ Each tool is isolated from the root project and from other tools.
47
+ That isolation is part of the architecture:
43
48
 
44
- ```
45
- Daemon (:51778) Core (:51777)
46
- ├── Telegram adapter (grammy) ├── HTTP server /message, /health
47
- ├── HTTP server /send (for scheduler) ├── Claude CLI with model routing
48
- ├── Bridge: HTTP client to Core ├── Media: voice (Whisper), vision, speech (ElevenLabs)
49
- ├── Lifecycle: spawn Core --watch ├── Scheduler (croner)
50
- └── In-memory queue if Core is down ├── Format: HTML + chunking
51
- └── File detection in responses
52
- ```
53
-
54
- **Message flow:**
55
- 1. Telegram → Daemon receives message (text/voice/photo)
56
- 2. Daemon → POST Core:51777/message (media as base64)
57
- 3. Core processes media → routes model → calls `claude CLI` → formats response
58
- 4. Core returns response → Daemon sends to Telegram
49
+ - each tool has its own folder
50
+ - each tool keeps its own `config.js`
51
+ - each tool can have its own dependencies
52
+ - one tool can be changed or replaced without tightly coupling the rest of the system
59
53
 
60
- **Scheduler flow:**
61
- Scheduled task fires → Core POSTs to Daemon:51778/send → Telegram
54
+ Each tool must support:
62
55
 
63
- ### Principle of separation
56
+ ```bash
57
+ node index.js --help
58
+ node index.js run --request-file <json>
59
+ ```
64
60
 
65
- - **Daemon** = Channel I/O only. Receives/sends messages. Never processes content. Stable process that never needs restarting.
66
- - **Core** = Everything else. Media processing, Claude CLI, formatting, scheduling. Runs with `bun --watch` for hot-reload when code changes.
61
+ ### Configuration model
62
+ - Telegram runtime config is stored in `data/state/config.json`
63
+ - tool-specific secrets/config live in `cli/<tool>/config.js`
64
+ - Pi authentication can use either:
65
+ - an API key entered during bootstrap
66
+ - or Pi's existing OAuth login when supported, such as `openai-codex`
67
67
 
68
- ## File Structure
68
+ ## Install globally
69
69
 
70
- ```
71
- src/
72
- ├── daemon/
73
- │ ├── index.ts # Entry: channel + HTTP server + spawn Core
74
- │ ├── channels/
75
- │ │ ├── base.ts # Re-exports Channel interface
76
- │ │ └── telegram.ts # Telegram adapter (grammy)
77
- │ ├── bridge.ts # HTTP client to Core with retry + in-memory queue
78
- │ └── lifecycle.ts # Spawn Core with --watch, auto-restart
79
-
80
- ├── core/
81
- │ ├── index.ts # HTTP server with /message and /health endpoints
82
- │ ├── processor.ts # Executes claude CLI with model routing
83
- │ ├── router.ts # Selects model (haiku/sonnet/opus) by message pattern
84
- │ ├── media.ts # Voice transcription (Whisper), image analysis (Vision), speech synthesis (ElevenLabs)
85
- │ ├── scheduler.ts # Cron + one-time tasks with croner, persists via deepbase
86
- │ ├── format.ts # Telegram chunking (4096 char limit)
87
- │ ├── file-detector.ts # Detect file paths in responses for auto-sending
88
- │ └── context.ts # Manage -c flag and reset_flag
89
-
90
- └── shared/
91
- ├── types.ts # All shared interfaces
92
- ├── config.ts # Env vars, ports, paths
93
- ├── logger.ts # Logger → .arisa/logs/
94
- └── db.ts # Unified persistence layer (deepbase)
70
+ ```bash
71
+ npm install -g arisa
95
72
  ```
96
73
 
97
- ## Model Routing
74
+ Then run:
98
75
 
99
- The router (`src/core/router.ts`) selects Claude models based on message patterns:
100
- - **Haiku**: Reminders, acknowledgments, simple yes/no
101
- - **Sonnet** (default): General conversation, queries
102
- - **Opus**: Code changes, debugging, complex multi-step tasks
76
+ ```bash
77
+ arisa
78
+ ```
103
79
 
104
- ## Bot Commands
80
+ ## Bootstrap flow
105
81
 
106
- Available Telegram bot commands:
107
- - `/reset` — Clear conversation history and start fresh
108
- - `/cancel` — Cancel all scheduled tasks for this chat
109
- - `/claude` — Switch to Claude backend (default)
110
- - `/codex` — Switch to Codex backend
111
- - `/speak <text>` — Generate speech from text using ElevenLabs (requires ELEVENLABS_API_KEY)
82
+ On first run, Arisa will:
112
83
 
113
- ## Adding a New Channel
84
+ 1. ask for a Telegram bot token
85
+ 2. ask for the maximum number of authorized chat ids
86
+ 3. show a list of Pi models
87
+ 4. resolve authentication for the selected Pi provider
88
+ 5. validate that Pi Agent works
89
+ 6. only then start listening to Telegram
114
90
 
115
- Implement the `Channel` interface from `src/shared/types.ts` and register it in `src/daemon/index.ts`. The interface requires: `connect()`, `onMessage()`, `send()`, `sendFile()`.
91
+ Telegram bot tokens can be created with:
116
92
 
117
- ## Hooks
93
+ - https://t.me/BotFather
118
94
 
119
- Configured in `.claude/settings.json`:
120
- - **SessionStart**: Runs `session-start.sh` — outputs Arisa context reminder
121
- - **PostToolUse** (async): Runs `log-activity.sh` — logs tool usage to `.arisa/logs/activity.log`
95
+ ## Using Pi authentication
122
96
 
123
- ## Runtime Data
97
+ For providers with internal Pi login support, such as Codex, leaving the API key empty during bootstrap will start the internal login flow automatically if no existing auth is found.
124
98
 
125
- All runtime data lives under `~/.arisa/` (with automatic migration from legacy project-local `.tinyclaw/` or `.arisa/`):
126
- - `logs/` — per-component log files (core, daemon, telegram, scheduler)
127
- - `db/arisa.json` — unified persistence with deepbase
128
- - `attachments/` — saved media files organized by `{chatId}/`
129
- - `.env` — TELEGRAM_BOT_TOKEN, OPENAI_API_KEY, ELEVENLABS_API_KEY
130
- - `voice_temp/` — temporary directory for voice transcription
131
- - `reset_flag` — conversation reset marker
99
+ For example, selecting:
132
100
 
133
- ### Persistence with DeepBase
101
+ - `openai-codex/gpt-5.4`
134
102
 
135
- All persistent data is managed by **deepbase** (`src/shared/db.ts`). Location: `~/.arisa/db/arisa.json`.
103
+ allows Arisa to authenticate through Pi's Codex OAuth flow instead of requiring a normal OpenAI API key.
136
104
 
137
- | Collection | Key | Value type | Description |
138
- |-----------------|---------------|--------------------|------------------------------------------|
139
- | `tasks` | `task.id` | `ScheduledTask` | Cron and one-time scheduled tasks |
140
- | `authorized` | `chatId` | `{ userId }` | Authorized Telegram chats |
141
- | `onboarded` | `chatId` | `{ userId }` | Chats that completed onboarding |
142
- | `queue` | `message.id` | queue message | In-memory queue overflow (Daemon→Core) |
143
- | `attachments` | `chatId_file` | `AttachmentRecord` | Metadata for saved media (files on disk) |
144
- | `messages` | `chatId_msgId`| `MessageRecord` | Message ledger for reply context |
145
- | `settings` | key name | `{ value }` | App settings (auth_token, etc.) |
105
+ ## Running model
146
106
 
147
- - **API**: `db.get(collection, key)`, `db.set(collection, key, data)`, `db.del(collection, key)`
148
- - **Helper functions**: `src/shared/db.ts` provides type-safe wrappers per collection
107
+ Arisa keeps one Pi session per authorized Telegram chat.
149
108
 
150
- ## Response Formatting
109
+ If a message arrives while Pi Agent is still processing another one:
151
110
 
152
- Telegram responses are sent with `parse_mode: 'HTML'`. When composing responses that will be sent through Telegram, use HTML formatting instead of Markdown. For example, use `<b>bold</b>` instead of `**bold**`, `<code>inline code</code>` instead of backticks, and `<pre>code block</pre>` instead of triple backticks.
111
+ - the current message keeps running
112
+ - the new message is appended to a queued buffer
113
+ - additional incoming messages are concatenated to that same buffer
114
+ - once the current processing finishes, the buffered messages are sent together as the next prompt
153
115
 
154
- ## Workflow Orchestration
116
+ Conceptually:
155
117
 
156
- ### 1. Plan Mode (On Request Only)
157
- - Do NOT enter plan mode automatically — only when the user explicitly asks for it
158
- - If something goes sideways, STOP and re-assess, but don't force plan mode
159
- - When user requests planning: write detailed specs upfront to reduce ambiguity
118
+ ```txt
119
+ message 1 is processing
120
+ message 2 arrives -> queued
121
+ message 3 arrives -> appended to queued
122
+ message 1 finishes
123
+ queued batch is processed next
124
+ ```
160
125
 
161
- ### 2. Subagent Strategy to keep main context window clean
162
- - Offload research, exploration, and parallel analysis to subagents
163
- - For complex problems, throw more compute at it via subagents
164
- - One task per subagent for focused execution
126
+ ## Project structure
165
127
 
166
- ### 3. Self-Improvement Loop
167
- - After ANY correction from the user: update 'tasks/lessons.md' with the pattern
168
- - Write rules for yourself that prevent the same mistake
169
- - Ruthlessly iterate on these lessons until mistake rate drops
170
- - Review lessons at session start for relevant project
128
+ ```txt
129
+ src/
130
+ runtime/ bootstrap + app startup
131
+ transport/ Telegram integration
132
+ core/ agent, tools, artifacts, config
133
+ cli/
134
+ openai-transcribe/
135
+ openai-tts/
136
+ data/
137
+ state/
138
+ artifacts/
139
+ chats/
140
+ ```
171
141
 
172
- ### 4. Verification Before Done
173
- - Never mark a task complete without proving it works
174
- - Diff behavior between main and your changes when relevant
175
- - Ask yourself: "Would a staff engineer approve this?"
176
- - Run tests, check logs, demonstrate correctness
142
+ ## Philosophy
177
143
 
178
- ### 5. Demand Elegance (Balanced)
179
- - For non-trivial changes: pause and ask "is there a more elegant way?"
180
- - If a fix feels hacky: "Knowing everything I know now, implement the elegant solution"
181
- - Skip this for simple, obvious fixes - don't over-engineer
182
- - Challenge your own work before presenting it
144
+ Arisa should not default to passive answers like "I can't do that" when a missing capability can realistically be implemented as a new tool.
183
145
 
184
- ### 6. Autonomous Bug Fixing
185
- - When given a bug report: just fix it. Don't ask for hand-holding
186
- - Point at logs, errors, failing tests -> then resolve them
187
- - Zero context switching required from the user
188
- - Go fix failing CI tests without being told how
146
+ The preferred behavior is:
189
147
 
190
- ## Task Management
191
- 1. **Plan First**: Write plan to 'tasks/todo.md' with checkable items
192
- 2. **Verify Plan**: Check in before starting implementation
193
- 3. **Track Progress**: Mark items complete as you go
194
- 4. **Explain Changes**: High-level summary at each step
195
- 5. **Document Results**: Add review to 'tasks/todo.md'
196
- 6. **Capture Lessons**: Update 'tasks/lessons.md' after corrections
148
+ 1. check whether an existing tool can solve the task
149
+ 2. if not, propose creating the missing tool
150
+ 3. keep the solution inside the tool architecture
197
151
 
198
- ## Voice Messages (ElevenLabs)
152
+ ## Notes
199
153
 
200
- When you want to send a voice message to the user, wrap the spoken text in `[VOICE]...[/VOICE]` tags:
154
+ - `AGENTS.md` defines the project-level behavioral rules for Pi Agent
155
+ - `src/transport/telegram/bot.js` builds the per-message runtime prompt
156
+ - tool help is part of the architecture and should be consulted before use when details are unclear
201
157
 
202
- ```
203
- [VOICE]Hello, this will be converted to audio[/VOICE]
204
- ```
158
+ ## Status
159
+
160
+ This is currently a functional V1 focused on:
205
161
 
206
- - The text inside `[VOICE]` gets synthesized via ElevenLabs and sent as a Telegram voice message
207
- - The `[VOICE]` tags are stripped from the text response — only the audio is sent
208
- - Use it when the user asks you to "hablame", "mandame un audio", "decime con voz", etc.
209
- - Keep voice texts concise — long texts cost more and take longer to generate
210
- - You can combine voice with text: write a text response AND include a `[VOICE]` block
162
+ - Telegram transport
163
+ - Pi Agent integration
164
+ - artifact-based message handling
165
+ - isolated CLI tools
166
+ - audio transcription before reasoning
167
+ - text-to-speech replies
168
+ - queued follow-up message batching
211
169
 
212
- ## Core Principles
213
- - **Simplicity First**: Make every change as simple as possible. Impact minimal code.
214
- - **No Laziness**: Find root causes. No temporary fixes. Senior developer standards.
215
- - **Minimal Impact**: Changes should only touch what's necessary. Avoid introducing bugs.
170
+ Future capabilities should be added as new tools and pipes, not as tightly coupled one-off code paths.
@@ -0,0 +1,51 @@
1
+ import { readFile, stat } from "node:fs/promises";
2
+ import path from "node:path";
3
+ import config from "./config.js";
4
+
5
+ function printHelp() {
6
+ console.log(`openai-transcribe\n\nUso:\n node index.js --help\n node index.js run --request-file <json>\n\nInput esperado:\n {\n \"artifact\": { \"path\": \"/abs/audio.ogg\", \"mimeType\": \"audio/ogg\" },\n \"args\": {}\n }\n\nConfig en cli/openai-transcribe/config.js:\n OPENAI_API_KEY\n MODEL\n`);
7
+ }
8
+
9
+ async function run(requestFile) {
10
+ if (!config.OPENAI_API_KEY) {
11
+ console.log(JSON.stringify({ ok: false, missingConfig: ["OPENAI_API_KEY"], configPath: path.resolve("config.js") }));
12
+ return;
13
+ }
14
+
15
+ const request = JSON.parse(await readFile(requestFile, "utf8"));
16
+ const artifact = request.artifact;
17
+ if (!artifact?.path) {
18
+ console.log(JSON.stringify({ ok: false, error: "artifact.path is required" }));
19
+ return;
20
+ }
21
+
22
+ await stat(artifact.path);
23
+ const form = new FormData();
24
+ const data = await readFile(artifact.path);
25
+ form.append("file", new Blob([data]), path.basename(artifact.path));
26
+ form.append("model", config.MODEL);
27
+
28
+ const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
29
+ method: "POST",
30
+ headers: { Authorization: `Bearer ${config.OPENAI_API_KEY}` },
31
+ body: form
32
+ });
33
+
34
+ const payload = await response.json();
35
+ if (!response.ok) {
36
+ console.log(JSON.stringify({ ok: false, error: payload.error?.message || "OpenAI transcription failed" }));
37
+ return;
38
+ }
39
+
40
+ console.log(JSON.stringify({ ok: true, output: { text: payload.text || "" } }));
41
+ }
42
+
43
+ const args = process.argv.slice(2);
44
+ if (!args.length || args.includes("--help") || args[0] === "help") {
45
+ printHelp();
46
+ } else if (args[0] === "run") {
47
+ const fileIndex = args.indexOf("--request-file");
48
+ await run(args[fileIndex + 1]);
49
+ } else {
50
+ printHelp();
51
+ }
@@ -0,0 +1,6 @@
1
+ {
2
+ "name": "openai-transcribe-cli",
3
+ "private": true,
4
+ "type": "module",
5
+ "version": "1.0.0"
6
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "openai-transcribe",
3
+ "description": "Transcribe audio files with OpenAI audio transcription API.",
4
+ "entry": "index.js",
5
+ "input": ["audio/ogg", "audio/mpeg", "audio/wav", "audio/mp4"],
6
+ "output": ["text/plain"],
7
+ "configSchema": {
8
+ "OPENAI_API_KEY": {
9
+ "type": "string",
10
+ "required": true,
11
+ "secret": true,
12
+ "prompt": "Necesito tu OPENAI_API_KEY para transcribir audio."
13
+ }
14
+ }
15
+ }
@@ -0,0 +1,58 @@
1
import { mkdir, readFile, writeFile } from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import config from "./config.js";

// Resolve paths relative to this tool's directory, not the process cwd,
// so config reporting and the out/ directory are stable regardless of launch dir.
const TOOL_DIR = path.dirname(fileURLToPath(import.meta.url));

// Print CLI usage. Per AGENTS.md, all user-facing tool strings are in English.
function printHelp() {
  console.log(`openai-tts

Usage:
  node index.js --help
  node index.js run --request-file <json>

Expected input:
  {
    "text": "hello",
    "artifact": { "text": "hello" },
    "args": { "voice": "alloy" }
  }

Config in cli/openai-tts/config.js:
  OPENAI_API_KEY
  MODEL
  VOICE
`);
}

// Synthesize MP3 speech from the text in the request file.
// Always emits a single JSON line on stdout: { ok: true, output } or { ok: false, ... }.
async function run(requestFile) {
  try {
    if (!config.OPENAI_API_KEY) {
      console.log(JSON.stringify({
        ok: false,
        missingConfig: ["OPENAI_API_KEY"],
        configPath: path.join(TOOL_DIR, "config.js")
      }));
      return;
    }

    const request = JSON.parse(await readFile(requestFile, "utf8"));
    const inputText = request.text || request.artifact?.text;
    if (!inputText) {
      console.log(JSON.stringify({ ok: false, error: "text or artifact.text is required" }));
      return;
    }

    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${config.OPENAI_API_KEY}`,
        "Content-Type": "application/json"
      },
      body: JSON.stringify({
        model: config.MODEL,
        voice: request.args?.voice || config.VOICE,
        input: inputText,
        // The OpenAI speech API parameter is `response_format`; `format` is ignored.
        response_format: "mp3"
      })
    });

    if (!response.ok) {
      const payload = await response.text();
      console.log(JSON.stringify({ ok: false, error: payload }));
      return;
    }

    const outDir = path.join(TOOL_DIR, "out");
    await mkdir(outDir, { recursive: true });
    const filePath = path.join(outDir, `speech-${Date.now()}.mp3`);
    const buffer = Buffer.from(await response.arrayBuffer());
    await writeFile(filePath, buffer);
    console.log(JSON.stringify({
      ok: true,
      output: { filePath, fileName: path.basename(filePath), mimeType: "audio/mpeg", kind: "audio" }
    }));
  } catch (err) {
    // Keep the machine-readable JSON contract even on unexpected failures
    // (bad request file, network errors, disk errors) instead of crashing.
    console.log(JSON.stringify({ ok: false, error: err?.message || String(err) }));
  }
}

const args = process.argv.slice(2);
if (!args.length || args.includes("--help") || args[0] === "help") {
  printHelp();
} else if (args[0] === "run") {
  const fileIndex = args.indexOf("--request-file");
  // Guard against a missing flag: indexOf -1 would otherwise make args[0] the "file".
  if (fileIndex === -1 || !args[fileIndex + 1]) {
    console.log(JSON.stringify({ ok: false, error: "--request-file <json> is required" }));
  } else {
    await run(args[fileIndex + 1]);
  }
} else {
  printHelp();
}
@@ -0,0 +1,6 @@
1
+ {
2
+ "name": "openai-tts-cli",
3
+ "private": true,
4
+ "type": "module",
5
+ "version": "1.0.0"
6
+ }
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "openai-tts",
3
+ "description": "Convert text into MP3 audio using OpenAI speech API.",
4
+ "entry": "index.js",
5
+ "input": ["text/plain"],
6
+ "output": ["audio/mpeg"],
7
+ "configSchema": {
8
+ "OPENAI_API_KEY": {
9
+ "type": "string",
10
+ "required": true,
11
+ "secret": true,
12
+ "prompt": "Necesito tu OPENAI_API_KEY para generar audio."
13
+ },
14
+ "VOICE": {
15
+ "type": "string",
16
+ "required": false,
17
+ "prompt": "Voz a usar, por ejemplo alloy."
18
+ }
19
+ }
20
+ }