arisa 2.3.55 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +102 -0
- package/README.md +120 -165
- package/cli/openai-transcribe/index.js +51 -0
- package/cli/openai-transcribe/package.json +6 -0
- package/cli/openai-transcribe/tool.manifest.json +15 -0
- package/cli/openai-tts/index.js +58 -0
- package/cli/openai-tts/package.json +6 -0
- package/cli/openai-tts/tool.manifest.json +20 -0
- package/cli/web-browser/index.js +146 -0
- package/cli/web-browser/package.json +6 -0
- package/cli/web-browser/tool.manifest.json +8 -0
- package/package.json +26 -44
- package/src/core/agent/agent-manager.js +218 -0
- package/src/core/artifacts/artifact-store.js +102 -0
- package/src/core/config/config-store.js +20 -0
- package/src/core/tools/tool-registry.js +117 -0
- package/src/index.js +27 -0
- package/src/runtime/bootstrap.js +213 -0
- package/src/runtime/create-app.js +22 -0
- package/src/transport/telegram/auth.js +13 -0
- package/src/transport/telegram/bot.js +214 -0
- package/src/transport/telegram/media.js +75 -0
- package/CLAUDE.md +0 -191
- package/SOUL.md +0 -36
- package/bin/arisa.js +0 -644
- package/scripts/dump-commands.ts +0 -26
- package/scripts/test-secrets.ts +0 -22
- package/src/core/attachments.ts +0 -104
- package/src/core/auth.ts +0 -58
- package/src/core/context.ts +0 -30
- package/src/core/file-detector.ts +0 -39
- package/src/core/format.ts +0 -159
- package/src/core/index.ts +0 -456
- package/src/core/intent.ts +0 -119
- package/src/core/media.ts +0 -144
- package/src/core/onboarding.ts +0 -102
- package/src/core/processor.ts +0 -305
- package/src/core/router.ts +0 -64
- package/src/core/scheduler.ts +0 -193
- package/src/daemon/agent-cli.ts +0 -130
- package/src/daemon/auto-install.ts +0 -158
- package/src/daemon/autofix.ts +0 -116
- package/src/daemon/bridge.ts +0 -166
- package/src/daemon/channels/base.ts +0 -10
- package/src/daemon/channels/telegram.ts +0 -306
- package/src/daemon/claude-login.ts +0 -218
- package/src/daemon/codex-login.ts +0 -172
- package/src/daemon/fallback.ts +0 -73
- package/src/daemon/index.ts +0 -272
- package/src/daemon/lifecycle.ts +0 -313
- package/src/daemon/setup.ts +0 -329
- package/src/shared/ai-cli.ts +0 -165
- package/src/shared/config.ts +0 -137
- package/src/shared/db.ts +0 -304
- package/src/shared/deepbase-secure.ts +0 -39
- package/src/shared/ink-shim.js +0 -14
- package/src/shared/logger.ts +0 -42
- package/src/shared/paths.ts +0 -90
- package/src/shared/ports.ts +0 -120
- package/src/shared/secrets.ts +0 -136
- package/src/shared/types.ts +0 -103
- package/tsconfig.json +0 -19
package/AGENTS.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Arisa AGENTS
|
|
2
|
+
|
|
3
|
+
## Architecture
|
|
4
|
+
- `src/transport/telegram/*`: Telegram inbound and outbound transport.
|
|
5
|
+
- `src/core/agent/*`: Pi Agent sessions, one per authorized chat.
|
|
6
|
+
- `src/core/artifacts/*`: every incoming or generated message/file becomes an artifact.
|
|
7
|
+
- `src/core/tools/*`: CLI tool registry, help lookup, config writes, execution.
|
|
8
|
+
- `cli/*`: isolated tools. Each tool has `package.json`, `config.js`, `tool.manifest.json`, and `index.js`.
|
|
9
|
+
|
|
10
|
+
## Main rule: everything is piped through artifacts
|
|
11
|
+
A pipe transforms one input artifact into one output artifact.
|
|
12
|
+
Examples:
|
|
13
|
+
- voice OGG -> transcript TXT
|
|
14
|
+
- text -> MP3 audio
|
|
15
|
+
- URL -> downloaded file -> derived file -> transcript
|
|
16
|
+
|
|
17
|
+
Each tool declares in `tool.manifest.json`:
|
|
18
|
+
- `input`: supported input types
|
|
19
|
+
- `output`: produced output types
|
|
20
|
+
- `configSchema`: required config fields
|
|
21
|
+
|
|
22
|
+
## Conceptual pipe model
|
|
23
|
+
There are two different moments where pipes can happen:
|
|
24
|
+
|
|
25
|
+
1. **Pre-reasoning normalization pipes**
|
|
26
|
+
- These happen before Pi Agent reasons.
|
|
27
|
+
- Their job is to convert raw inbound media into a form Pi Agent can reason about well.
|
|
28
|
+
- Example: incoming Telegram audio must be transcribed first.
|
|
29
|
+
- In that case, the transcript becomes the effective user message content for Pi Agent.
|
|
30
|
+
- Pi Agent should reason over the transcript, not treat the raw audio as the primary message.
|
|
31
|
+
|
|
32
|
+
2. **Reasoned action pipes**
|
|
33
|
+
- These happen after Pi Agent starts reasoning.
|
|
34
|
+
- Pi Agent may decide to chain tools to achieve a user goal.
|
|
35
|
+
- Example: text -> TTS audio, or future multi-step workflows.
|
|
36
|
+
|
|
37
|
+
This distinction is critical. Not every pipe should be decided by Pi Agent at runtime. Some pipes are part of the transport/input normalization layer and must happen before reasoning.
|
|
38
|
+
|
|
39
|
+
## Telegram inbound pipeline
|
|
40
|
+
Current conceptual behavior:
|
|
41
|
+
- text -> send directly to Pi Agent
|
|
42
|
+
- audio/voice -> transcribe first -> send transcript to Pi Agent
|
|
43
|
+
- image/document/other media -> keep as artifacts, and add normalization pipes when needed
|
|
44
|
+
|
|
45
|
+
If inbound media was normalized before reasoning, Pi Agent should use the normalized result as the actual message content.
|
|
46
|
+
For example, if a voice note was transcribed, Pi Agent should answer the meaning of the transcript, not simply return the raw transcript unless the user explicitly asked for transcription.
|
|
47
|
+
|
|
48
|
+
## How to inspect CLI tools
|
|
49
|
+
Before using a tool, inspect its help:
|
|
50
|
+
- via the custom tool: `tool_help`
|
|
51
|
+
- or by running the CLI with `--help`
|
|
52
|
+
|
|
53
|
+
Every CLI must support:
|
|
54
|
+
- `node index.js --help`
|
|
55
|
+
- `node index.js run --request-file <json>`
|
|
56
|
+
|
|
57
|
+
## Pipe behavior in V1
|
|
58
|
+
V1 does not have a full automatic planner yet. The agent should:
|
|
59
|
+
1. understand whether the needed pipe belongs to pre-reasoning normalization or post-reasoning tool chaining
|
|
60
|
+
2. use `list_tools`
|
|
61
|
+
3. use `tool_help` when it needs operational details
|
|
62
|
+
4. execute a tool with `run_tool`
|
|
63
|
+
5. if another step is needed, use the returned `artifactId` as input for the next tool
|
|
64
|
+
|
|
65
|
+
Example manual pipe:
|
|
66
|
+
1. `run_tool(openai-transcribe, artifact audio)`
|
|
67
|
+
2. take the returned text `artifactId`
|
|
68
|
+
3. `run_tool(openai-tts, artifact text)` or `send_audio_reply(text)`
|
|
69
|
+
|
|
70
|
+
## Missing config flow
|
|
71
|
+
If `run_tool` returns `missingConfig`, the agent should:
|
|
72
|
+
1. ask the user naturally in Telegram for the missing value
|
|
73
|
+
2. write the value into `cli/<tool>/config.js` with `set_tool_config`
|
|
74
|
+
3. retry the tool
|
|
75
|
+
|
|
76
|
+
Do not assume a rigid question/answer protocol. Continue the conversation naturally and infer the config value from the user reply when possible.
|
|
77
|
+
|
|
78
|
+
## Telegram security
|
|
79
|
+
- The first chat that messages the bot is authorized if `telegram.maxChatIds` allows it.
|
|
80
|
+
- Do not authorize more chats than configured.
|
|
81
|
+
- Access control is based on chat ids, not usernames.
|
|
82
|
+
|
|
83
|
+
## Tool creation
|
|
84
|
+
Do not assume specific future tools such as YouTube support exist.
|
|
85
|
+
If the user asks for a capability that is not currently available, first check whether an existing registered tool can satisfy the task.
|
|
86
|
+
If no existing tool can do it, the default attitude should be to propose creating a new CLI tool under `cli/<tool-name>` following the project conventions.
|
|
87
|
+
All newly created tools must document their help text, usage instructions, manifests, and user-facing operational strings in English.
|
|
88
|
+
Do not stop at "I cannot do that" when the task is realistically implementable through a new tool.
|
|
89
|
+
Prefer responses like:
|
|
90
|
+
- identify that no current tool satisfies the request
|
|
91
|
+
- state that the missing capability can be added
|
|
92
|
+
- propose or start creating the tool needed to fulfill the request
|
|
93
|
+
|
|
94
|
+
For example, if the user asks for live weather and no weather tool exists, the correct attitude is to propose building a weather tool for the bot rather than only saying real-time access is unavailable.
|
|
95
|
+
|
|
96
|
+
Consult the local skill for that workflow when building new tools.
|
|
97
|
+
|
|
98
|
+
## Safety
|
|
99
|
+
- Do not install or run arbitrary tools outside registered `cli/*` manifests in V1.
|
|
100
|
+
- Prefer tool manifests and CLI help over assumptions.
|
|
101
|
+
- Keep tool configs inside `cli/<tool>/config.js`.
|
|
102
|
+
- Be proactive about extending capabilities, but do it through the project's tool architecture, not through ad hoc one-off behavior.
|
package/README.md
CHANGED
|
@@ -1,215 +1,170 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Arisa
|
|
2
2
|
|
|
3
|
-
Arisa is a
|
|
3
|
+
Arisa is a modular Telegram assistant powered by Pi Agent.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
It is designed around a simple idea:
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
- **Telegram is the human interface**
|
|
8
|
+
- **Pi Agent is the reasoning engine**
|
|
9
|
+
- **everything is an artifact**
|
|
10
|
+
- **capabilities live in isolated CLI tools**
|
|
11
|
+
- **tools can be chained through pipes**
|
|
8
12
|
|
|
9
|
-
|
|
13
|
+
Arisa is meant to grow like Lego blocks. If a capability does not exist yet, the system should prefer adding a new tool instead of stopping at "I can't do that".
|
|
10
14
|
|
|
11
|
-
|
|
15
|
+
## Core concept
|
|
12
16
|
|
|
13
|
-
|
|
17
|
+
Arisa separates two different kinds of pipes:
|
|
14
18
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
19
|
+
1. **Pre-reasoning normalization pipes**
|
|
20
|
+
- These happen before Pi Agent reasons.
|
|
21
|
+
- Example: a Telegram voice message is transcribed first.
|
|
22
|
+
- Pi Agent then reasons over the transcript, not over the raw audio.
|
|
18
23
|
|
|
19
|
-
|
|
20
|
-
|
|
24
|
+
2. **Reasoned action pipes**
|
|
25
|
+
- These happen after Pi Agent starts reasoning.
|
|
26
|
+
- Example: text -> TTS audio.
|
|
27
|
+
- Future tools can form larger chains.
|
|
21
28
|
|
|
22
|
-
|
|
29
|
+
This distinction is important. Some transformations belong to the transport/input layer, not to the agent's runtime decision making.
|
|
23
30
|
|
|
24
|
-
|
|
25
|
-
arisa # Foreground daemon mode (Ctrl+C to stop)
|
|
26
|
-
arisa start # Start as service (enables autostart with systemd --user)
|
|
27
|
-
arisa stop # Stop service
|
|
28
|
-
arisa status # Service status
|
|
29
|
-
arisa restart # Restart service
|
|
30
|
-
arisa daemon # Foreground daemon mode (manual/dev)
|
|
31
|
-
arisa core # Foreground core-only mode
|
|
32
|
-
arisa dev # Foreground core watch mode
|
|
33
|
-
```
|
|
31
|
+
## Current behavior
|
|
34
32
|
|
|
33
|
+
### Telegram input
|
|
34
|
+
- text messages go directly to Pi Agent
|
|
35
|
+
- audio/voice messages are transcribed first, then passed to Pi Agent as text
|
|
36
|
+
- media is stored as artifacts
|
|
35
37
|
|
|
36
|
-
|
|
38
|
+
### Tool model
|
|
39
|
+
Each tool lives in its own folder under `cli/<tool-name>` and contains:
|
|
37
40
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
+
- `package.json`
|
|
42
|
+
- `config.js`
|
|
43
|
+
- `tool.manifest.json`
|
|
44
|
+
- `index.js`
|
|
41
45
|
|
|
42
|
-
|
|
46
|
+
Each tool is isolated from the root project and from other tools.
|
|
47
|
+
That isolation is part of the architecture:
|
|
43
48
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
├── Bridge: HTTP client to Core ├── Media: voice (Whisper), vision, speech (ElevenLabs)
|
|
49
|
-
├── Lifecycle: spawn Core --watch ├── Scheduler (croner)
|
|
50
|
-
└── In-memory queue if Core is down ├── Format: HTML + chunking
|
|
51
|
-
└── File detection in responses
|
|
52
|
-
```
|
|
53
|
-
|
|
54
|
-
**Message flow:**
|
|
55
|
-
1. Telegram → Daemon receives message (text/voice/photo)
|
|
56
|
-
2. Daemon → POST Core:51777/message (media as base64)
|
|
57
|
-
3. Core processes media → routes model → calls `claude CLI` → formats response
|
|
58
|
-
4. Core returns response → Daemon sends to Telegram
|
|
49
|
+
- each tool has its own folder
|
|
50
|
+
- each tool keeps its own `config.js`
|
|
51
|
+
- each tool can have its own dependencies
|
|
52
|
+
- one tool can be changed or replaced without tightly coupling the rest of the system
|
|
59
53
|
|
|
60
|
-
|
|
61
|
-
Scheduled task fires → Core POSTs to Daemon:51778/send → Telegram
|
|
54
|
+
Each tool must support:
|
|
62
55
|
|
|
63
|
-
|
|
56
|
+
```bash
|
|
57
|
+
node index.js --help
|
|
58
|
+
node index.js run --request-file <json>
|
|
59
|
+
```
|
|
64
60
|
|
|
65
|
-
|
|
66
|
-
-
|
|
61
|
+
### Configuration model
|
|
62
|
+
- Telegram runtime config is stored in `data/state/config.json`
|
|
63
|
+
- tool-specific secrets/config live in `cli/<tool>/config.js`
|
|
64
|
+
- Pi authentication can use either:
|
|
65
|
+
- an API key entered during bootstrap
|
|
66
|
+
- or Pi's existing OAuth login when supported, such as `openai-codex`
|
|
67
67
|
|
|
68
|
-
##
|
|
68
|
+
## Install globally
|
|
69
69
|
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
├── daemon/
|
|
73
|
-
│ ├── index.ts # Entry: channel + HTTP server + spawn Core
|
|
74
|
-
│ ├── channels/
|
|
75
|
-
│ │ ├── base.ts # Re-exports Channel interface
|
|
76
|
-
│ │ └── telegram.ts # Telegram adapter (grammy)
|
|
77
|
-
│ ├── bridge.ts # HTTP client to Core with retry + in-memory queue
|
|
78
|
-
│ └── lifecycle.ts # Spawn Core with --watch, auto-restart
|
|
79
|
-
│
|
|
80
|
-
├── core/
|
|
81
|
-
│ ├── index.ts # HTTP server with /message and /health endpoints
|
|
82
|
-
│ ├── processor.ts # Executes claude CLI with model routing
|
|
83
|
-
│ ├── router.ts # Selects model (haiku/sonnet/opus) by message pattern
|
|
84
|
-
│ ├── media.ts # Voice transcription (Whisper), image analysis (Vision), speech synthesis (ElevenLabs)
|
|
85
|
-
│ ├── scheduler.ts # Cron + one-time tasks with croner, persists via deepbase
|
|
86
|
-
│ ├── format.ts # Telegram chunking (4096 char limit)
|
|
87
|
-
│ ├── file-detector.ts # Detect file paths in responses for auto-sending
|
|
88
|
-
│ └── context.ts # Manage -c flag and reset_flag
|
|
89
|
-
│
|
|
90
|
-
└── shared/
|
|
91
|
-
├── types.ts # All shared interfaces
|
|
92
|
-
├── config.ts # Env vars, ports, paths
|
|
93
|
-
├── logger.ts # Logger → .arisa/logs/
|
|
94
|
-
└── db.ts # Unified persistence layer (deepbase)
|
|
70
|
+
```bash
|
|
71
|
+
npm install -g arisa
|
|
95
72
|
```
|
|
96
73
|
|
|
97
|
-
|
|
74
|
+
Then run:
|
|
98
75
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
- **Opus**: Code changes, debugging, complex multi-step tasks
|
|
76
|
+
```bash
|
|
77
|
+
arisa
|
|
78
|
+
```
|
|
103
79
|
|
|
104
|
-
##
|
|
80
|
+
## Bootstrap flow
|
|
105
81
|
|
|
106
|
-
|
|
107
|
-
- `/reset` — Clear conversation history and start fresh
|
|
108
|
-
- `/cancel` — Cancel all scheduled tasks for this chat
|
|
109
|
-
- `/claude` — Switch to Claude backend (default)
|
|
110
|
-
- `/codex` — Switch to Codex backend
|
|
111
|
-
- `/speak <text>` — Generate speech from text using ElevenLabs (requires ELEVENLABS_API_KEY)
|
|
82
|
+
On first run, Arisa will:
|
|
112
83
|
|
|
113
|
-
|
|
84
|
+
1. ask for a Telegram bot token
|
|
85
|
+
2. ask for the maximum number of authorized chat ids
|
|
86
|
+
3. show a list of Pi models
|
|
87
|
+
4. resolve authentication for the selected Pi provider
|
|
88
|
+
5. validate that Pi Agent works
|
|
89
|
+
6. only then start listening to Telegram
|
|
114
90
|
|
|
115
|
-
|
|
91
|
+
Telegram bot tokens can be created with:
|
|
116
92
|
|
|
117
|
-
|
|
93
|
+
- https://t.me/BotFather
|
|
118
94
|
|
|
119
|
-
|
|
120
|
-
- **SessionStart**: Runs `session-start.sh` — outputs Arisa context reminder
|
|
121
|
-
- **PostToolUse** (async): Runs `log-activity.sh` — logs tool usage to `.arisa/logs/activity.log`
|
|
95
|
+
## Using Pi authentication
|
|
122
96
|
|
|
123
|
-
|
|
97
|
+
For providers with internal Pi login support, such as Codex, leaving the API key empty during bootstrap will start the internal login flow automatically if no existing auth is found.
|
|
124
98
|
|
|
125
|
-
|
|
126
|
-
- `logs/` — per-component log files (core, daemon, telegram, scheduler)
|
|
127
|
-
- `db/arisa.json` — unified persistence with deepbase
|
|
128
|
-
- `attachments/` — saved media files organized by `{chatId}/`
|
|
129
|
-
- `.env` — TELEGRAM_BOT_TOKEN, OPENAI_API_KEY, ELEVENLABS_API_KEY
|
|
130
|
-
- `voice_temp/` — temporary directory for voice transcription
|
|
131
|
-
- `reset_flag` — conversation reset marker
|
|
99
|
+
For example, selecting:
|
|
132
100
|
|
|
133
|
-
|
|
101
|
+
- `openai-codex/gpt-5.4`
|
|
134
102
|
|
|
135
|
-
|
|
103
|
+
allows Arisa to authenticate through Pi's Codex OAuth flow instead of requiring a normal OpenAI API key.
|
|
136
104
|
|
|
137
|
-
|
|
138
|
-
|-----------------|---------------|--------------------|------------------------------------------|
|
|
139
|
-
| `tasks` | `task.id` | `ScheduledTask` | Cron and one-time scheduled tasks |
|
|
140
|
-
| `authorized` | `chatId` | `{ userId }` | Authorized Telegram chats |
|
|
141
|
-
| `onboarded` | `chatId` | `{ userId }` | Chats that completed onboarding |
|
|
142
|
-
| `queue` | `message.id` | queue message | In-memory queue overflow (Daemon→Core) |
|
|
143
|
-
| `attachments` | `chatId_file` | `AttachmentRecord` | Metadata for saved media (files on disk) |
|
|
144
|
-
| `messages` | `chatId_msgId`| `MessageRecord` | Message ledger for reply context |
|
|
145
|
-
| `settings` | key name | `{ value }` | App settings (auth_token, etc.) |
|
|
105
|
+
## Running model
|
|
146
106
|
|
|
147
|
-
|
|
148
|
-
- **Helper functions**: `src/shared/db.ts` provides type-safe wrappers per collection
|
|
107
|
+
Arisa keeps one Pi session per authorized Telegram chat.
|
|
149
108
|
|
|
150
|
-
|
|
109
|
+
If a message arrives while Pi Agent is still processing another one:
|
|
151
110
|
|
|
152
|
-
|
|
111
|
+
- the current message keeps running
|
|
112
|
+
- the new message is appended to a queued buffer
|
|
113
|
+
- additional incoming messages are concatenated to that same buffer
|
|
114
|
+
- once the current processing finishes, the buffered messages are sent together as the next prompt
|
|
153
115
|
|
|
154
|
-
|
|
116
|
+
Conceptually:
|
|
155
117
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
118
|
+
```txt
|
|
119
|
+
message 1 is processing
|
|
120
|
+
message 2 arrives -> queued
|
|
121
|
+
message 3 arrives -> appended to queued
|
|
122
|
+
message 1 finishes
|
|
123
|
+
queued batch is processed next
|
|
124
|
+
```
|
|
160
125
|
|
|
161
|
-
|
|
162
|
-
- Offload research, exploration, and parallel analysis to subagents
|
|
163
|
-
- For complex problems, throw more compute at it via subagents
|
|
164
|
-
- One task per subagent for focused execution
|
|
126
|
+
## Project structure
|
|
165
127
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
128
|
+
```txt
|
|
129
|
+
src/
|
|
130
|
+
runtime/ bootstrap + app startup
|
|
131
|
+
transport/ Telegram integration
|
|
132
|
+
core/ agent, tools, artifacts, config
|
|
133
|
+
cli/
|
|
134
|
+
openai-transcribe/
|
|
135
|
+
openai-tts/
|
|
136
|
+
data/
|
|
137
|
+
state/
|
|
138
|
+
artifacts/
|
|
139
|
+
chats/
|
|
140
|
+
```
|
|
171
141
|
|
|
172
|
-
|
|
173
|
-
- Never mark a task complete without proving it works
|
|
174
|
-
- Diff behavior between main and your changes when relevant
|
|
175
|
-
- Ask yourself: "Would a staff engineer approve this?"
|
|
176
|
-
- Run tests, check logs, demonstrate correctness
|
|
142
|
+
## Philosophy
|
|
177
143
|
|
|
178
|
-
|
|
179
|
-
- For non-trivial changes: pause and ask "is there a more elegant way?"
|
|
180
|
-
- If a fix feels hacky: "Knowing everything I know now, implement the elegant solution"
|
|
181
|
-
- Skip this for simple, obvious fixes - don't over-engineer
|
|
182
|
-
- Challenge your own work before presenting it
|
|
144
|
+
Arisa should not default to passive answers like "I can't do that" when a missing capability can realistically be implemented as a new tool.
|
|
183
145
|
|
|
184
|
-
|
|
185
|
-
- When given a bug report: just fix it. Don't ask for hand-holding
|
|
186
|
-
- Point at logs, errors, failing tests -> then resolve them
|
|
187
|
-
- Zero context switching required from the user
|
|
188
|
-
- Go fix failing CI tests without being told how
|
|
146
|
+
The preferred behavior is:
|
|
189
147
|
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
3. **Track Progress**: Mark items complete as you go
|
|
194
|
-
4. **Explain Changes**: High-level summary at each step
|
|
195
|
-
5. **Document Results**: Add review to 'tasks/todo.md'
|
|
196
|
-
6. **Capture Lessons**: Update 'tasks/lessons.md' after corrections
|
|
148
|
+
1. check whether an existing tool can solve the task
|
|
149
|
+
2. if not, propose creating the missing tool
|
|
150
|
+
3. keep the solution inside the tool architecture
|
|
197
151
|
|
|
198
|
-
##
|
|
152
|
+
## Notes
|
|
199
153
|
|
|
200
|
-
|
|
154
|
+
- `AGENTS.md` defines the project-level behavioral rules for Pi Agent
|
|
155
|
+
- `src/transport/telegram/bot.js` builds the per-message runtime prompt
|
|
156
|
+
- tool help is part of the architecture and should be consulted before use when details are unclear
|
|
201
157
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
158
|
+
## Status
|
|
159
|
+
|
|
160
|
+
This is currently a functional V1 focused on:
|
|
205
161
|
|
|
206
|
-
-
|
|
207
|
-
-
|
|
208
|
-
-
|
|
209
|
-
-
|
|
210
|
-
-
|
|
162
|
+
- Telegram transport
|
|
163
|
+
- Pi Agent integration
|
|
164
|
+
- artifact-based message handling
|
|
165
|
+
- isolated CLI tools
|
|
166
|
+
- audio transcription before reasoning
|
|
167
|
+
- text-to-speech replies
|
|
168
|
+
- queued follow-up message batching
|
|
211
169
|
|
|
212
|
-
|
|
213
|
-
- **Simplicity First**: Make every change as simple as possible. Impact minimal code.
|
|
214
|
-
- **No Laziness**: Find root causes. No temporary fixes. Senior developer standards.
|
|
215
|
-
- **Minimal Impact**: Changes should only touch what's necessary. Avoid introducing bugs.
|
|
170
|
+
Future capabilities should be added as new tools and pipes, not as tightly coupled one-off code paths.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { readFile, stat } from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import config from "./config.js";

// Absolute path of this tool's directory, so reported config paths are
// correct no matter which working directory the CLI is launched from.
const toolDir = path.dirname(fileURLToPath(import.meta.url));

// Print the CLI usage/help text for this tool.
function printHelp() {
  console.log(`openai-transcribe\n\nUso:\n node index.js --help\n node index.js run --request-file <json>\n\nInput esperado:\n {\n \"artifact\": { \"path\": \"/abs/audio.ogg\", \"mimeType\": \"audio/ogg\" },\n \"args\": {}\n }\n\nConfig en cli/openai-transcribe/config.js:\n OPENAI_API_KEY\n MODEL\n`);
}

/**
 * Execute one transcription request.
 *
 * Reads a JSON request file of shape { artifact: { path, mimeType }, args },
 * uploads the audio file to the OpenAI transcription endpoint and prints a
 * single JSON result line: { ok, output|error|missingConfig }.
 *
 * @param {string} requestFile - path to the JSON request file
 */
async function run(requestFile) {
  if (!config.OPENAI_API_KEY) {
    // Report missing config through the structured tool protocol.
    // configPath is anchored to the tool directory, not process.cwd().
    console.log(JSON.stringify({ ok: false, missingConfig: ["OPENAI_API_KEY"], configPath: path.join(toolDir, "config.js") }));
    return;
  }

  const request = JSON.parse(await readFile(requestFile, "utf8"));
  const artifact = request.artifact;
  if (!artifact?.path) {
    console.log(JSON.stringify({ ok: false, error: "artifact.path is required" }));
    return;
  }

  // Fail early (as a caught error below) if the audio file does not exist.
  await stat(artifact.path);
  const form = new FormData();
  const data = await readFile(artifact.path);
  // Preserve the artifact's declared mime type when available so the API
  // can detect the audio container format reliably.
  form.append(
    "file",
    artifact.mimeType ? new Blob([data], { type: artifact.mimeType }) : new Blob([data]),
    path.basename(artifact.path)
  );
  form.append("model", config.MODEL);

  const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
    method: "POST",
    headers: { Authorization: `Bearer ${config.OPENAI_API_KEY}` },
    body: form
  });

  const payload = await response.json();
  if (!response.ok) {
    console.log(JSON.stringify({ ok: false, error: payload.error?.message || "OpenAI transcription failed" }));
    return;
  }

  console.log(JSON.stringify({ ok: true, output: { text: payload.text || "" } }));
}

const args = process.argv.slice(2);
if (!args.length || args.includes("--help") || args[0] === "help") {
  printHelp();
} else if (args[0] === "run") {
  const fileIndex = args.indexOf("--request-file");
  // Guard: without this check a missing flag makes indexOf return -1 and
  // args[0] ("run") would silently be treated as the request file path.
  if (fileIndex === -1 || !args[fileIndex + 1]) {
    console.log(JSON.stringify({ ok: false, error: "--request-file <json> is required" }));
  } else {
    try {
      await run(args[fileIndex + 1]);
    } catch (err) {
      // Keep output machine-readable even on unexpected failures
      // (bad JSON, missing file, network error) instead of a stack trace.
      console.log(JSON.stringify({ ok: false, error: err?.message || String(err) }));
    }
  }
} else {
  printHelp();
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "openai-transcribe",
|
|
3
|
+
"description": "Transcribe audio files with OpenAI audio transcription API.",
|
|
4
|
+
"entry": "index.js",
|
|
5
|
+
"input": ["audio/ogg", "audio/mpeg", "audio/wav", "audio/mp4"],
|
|
6
|
+
"output": ["text/plain"],
|
|
7
|
+
"configSchema": {
|
|
8
|
+
"OPENAI_API_KEY": {
|
|
9
|
+
"type": "string",
|
|
10
|
+
"required": true,
|
|
11
|
+
"secret": true,
|
|
12
|
+
"prompt": "Necesito tu OPENAI_API_KEY para transcribir audio."
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
import path from "node:path";
import { fileURLToPath } from "node:url";
import config from "./config.js";

// Absolute path of this tool's directory, so config and output paths are
// stable regardless of the process working directory.
const toolDir = path.dirname(fileURLToPath(import.meta.url));

// Print the CLI usage/help text for this tool.
function printHelp() {
  console.log(`openai-tts\n\nUso:\n node index.js --help\n node index.js run --request-file <json>\n\nInput esperado:\n {\n \"text\": \"hola\",\n \"artifact\": { \"text\": \"hola\" },\n \"args\": { \"voice\": \"alloy\" }\n }\n\nConfig en cli/openai-tts/config.js:\n OPENAI_API_KEY\n MODEL\n VOICE\n`);
}

/**
 * Execute one text-to-speech request.
 *
 * Reads a JSON request file of shape { text | artifact.text, args.voice },
 * calls the OpenAI speech endpoint and writes the MP3 to <tool>/out/.
 * Prints a single JSON result line: { ok, output|error|missingConfig }.
 *
 * @param {string} requestFile - path to the JSON request file
 */
async function run(requestFile) {
  if (!config.OPENAI_API_KEY) {
    // configPath is anchored to the tool directory, not process.cwd().
    console.log(JSON.stringify({ ok: false, missingConfig: ["OPENAI_API_KEY"], configPath: path.join(toolDir, "config.js") }));
    return;
  }

  const request = JSON.parse(await readFile(requestFile, "utf8"));
  const inputText = request.text || request.artifact?.text;
  if (!inputText) {
    console.log(JSON.stringify({ ok: false, error: "text or artifact.text is required" }));
    return;
  }

  const response = await fetch("https://api.openai.com/v1/audio/speech", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${config.OPENAI_API_KEY}`,
      "Content-Type": "application/json"
    },
    body: JSON.stringify({
      model: config.MODEL,
      voice: request.args?.voice || config.VOICE,
      input: inputText,
      // The OpenAI speech API parameter is "response_format";
      // "format" is silently ignored by the endpoint.
      response_format: "mp3"
    })
  });

  if (!response.ok) {
    const payload = await response.text();
    console.log(JSON.stringify({ ok: false, error: payload }));
    return;
  }

  // Write the generated audio into the tool's own out/ directory so
  // output files do not scatter across arbitrary working directories.
  const outDir = path.join(toolDir, "out");
  await mkdir(outDir, { recursive: true });
  const filePath = path.join(outDir, `speech-${Date.now()}.mp3`);
  const buffer = Buffer.from(await response.arrayBuffer());
  await writeFile(filePath, buffer);
  console.log(JSON.stringify({ ok: true, output: { filePath, fileName: path.basename(filePath), mimeType: "audio/mpeg", kind: "audio" } }));
}

const args = process.argv.slice(2);
if (!args.length || args.includes("--help") || args[0] === "help") {
  printHelp();
} else if (args[0] === "run") {
  const fileIndex = args.indexOf("--request-file");
  // Guard: a missing flag would make indexOf return -1 and args[0]
  // ("run") would be treated as the request file path.
  if (fileIndex === -1 || !args[fileIndex + 1]) {
    console.log(JSON.stringify({ ok: false, error: "--request-file <json> is required" }));
  } else {
    try {
      await run(args[fileIndex + 1]);
    } catch (err) {
      // Emit protocol-shaped JSON instead of an uncaught stack trace.
      console.log(JSON.stringify({ ok: false, error: err?.message || String(err) }));
    }
  }
} else {
  printHelp();
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "openai-tts",
|
|
3
|
+
"description": "Convert text into MP3 audio using OpenAI speech API.",
|
|
4
|
+
"entry": "index.js",
|
|
5
|
+
"input": ["text/plain"],
|
|
6
|
+
"output": ["audio/mpeg"],
|
|
7
|
+
"configSchema": {
|
|
8
|
+
"OPENAI_API_KEY": {
|
|
9
|
+
"type": "string",
|
|
10
|
+
"required": true,
|
|
11
|
+
"secret": true,
|
|
12
|
+
"prompt": "Necesito tu OPENAI_API_KEY para generar audio."
|
|
13
|
+
},
|
|
14
|
+
"VOICE": {
|
|
15
|
+
"type": "string",
|
|
16
|
+
"required": false,
|
|
17
|
+
"prompt": "Voz a usar, por ejemplo alloy."
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
}
|