@ducci/jarvis 1.0.38 → 1.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/docs/agent.md +43 -4
  2. package/docs/crons.md +100 -0
  3. package/docs/identity.md +38 -0
  4. package/docs/skills.md +77 -0
  5. package/docs/system-prompt.md +25 -13
  6. package/docs/telegram.md +61 -2
  7. package/package.json +2 -1
  8. package/src/channels/telegram/index.js +65 -0
  9. package/src/server/agent.js +59 -19
  10. package/src/server/app.js +125 -2
  11. package/src/server/config.js +43 -0
  12. package/src/server/cron-scheduler.js +35 -0
  13. package/src/server/crons.js +106 -0
  14. package/src/server/tools.js +234 -72
  15. package/docs/findings/001-context-explosion.md +0 -116
  16. package/docs/findings/002-handoff-edge-cases.md +0 -84
  17. package/docs/findings/003-event-loop-blocking-and-reliability.md +0 -120
  18. package/docs/findings/004-agent-reliability-improvements.md +0 -162
  19. package/docs/findings/005-installation-timeout.md +0 -128
  20. package/docs/findings/006-malformed-tool-schema.md +0 -118
  21. package/docs/findings/007-telegram-errors-and-handoff-stalling.md +0 -271
  22. package/docs/findings/008-exec-timeout-architecture.md +0 -118
  23. package/docs/findings/009-non-string-response-field.md +0 -153
  24. package/docs/findings/010-checkpoint-field-type-safety.md +0 -121
  25. package/docs/findings/011-empty-model-response.md +0 -157
  26. package/docs/findings/012-empty-nudge-loses-recovery-text.md +0 -121
  27. package/docs/findings/013-stderr-visibility-and-truncation.md +0 -59
  28. package/docs/findings/014-exec-stderr-artifact-and-malformed-tool-args.md +0 -202
  29. package/docs/findings/015-failed-run-context-strip.md +0 -142
  30. package/docs/findings/016-file-writing-corruption-and-stderr-loop.md +0 -119
  31. package/docs/findings/017-looping-intervention-and-lossy-checkpoint.md +0 -110
  32. package/docs/findings/018-anthropic-oauth-token-support.md +0 -72
package/docs/agent.md CHANGED
@@ -32,13 +32,22 @@ Respond with your normal JSON, but add a checkpoint field:
32
32
  "logSummary": "Human-readable summary of what happened in this run.",
33
33
  "checkpoint": {
34
34
  "progress": "What has been fully completed so far.",
35
- "remaining": "What still needs to be done to finish the task."
35
+ "remaining": "What still needs to be done to finish the task.",
36
+ "failedApproaches": "Comma-separated list of approaches already tried that did not work.",
37
+ "state": { "key": "value" }
36
38
  }
37
39
  }
38
40
 
39
41
  The checkpoint field will be used to automatically resume the task in the next run.]
40
42
  ```
41
43
 
44
+ The checkpoint object has four fields:
45
+
46
+ - `progress` — what has been fully completed so far.
47
+ - `remaining` — what still needs to be done to finish the task. The server uses this as the starting prompt for the next run.
48
+ - `failedApproaches` — a record of approaches already attempted that did not work, so the next run does not repeat them. This is preserved in `session.metadata` and injected into each subsequent resume prompt.
49
+ - `state` — a flat key-value JSON object for concrete facts confirmed by tool output (file paths, binary locations, config values, etc.). It is merged into `session.metadata.checkpointState` across handoffs and injected as known facts into the next resume prompt, so the agent does not need to re-discover information it already found.
50
+
42
51
  2. The server reads `checkpoint.remaining` from the response and uses it as the starting prompt for a fresh agent run.
43
52
  3. The server marks the run status as `checkpoint_reached`.
44
53
 
@@ -138,17 +147,36 @@ Interaction flow:
138
147
 
139
148
  The authoritative system prompt text lives in [docs/system-prompt.md](./system-prompt.md). It is sent as the first message (`role: "system"`) in every session and stored verbatim in the conversation history.
140
149
 
150
+ Four placeholders are injected at runtime before the system prompt is sent to the model — none of them are ever written back to disk or stored in conversation history:
151
+
152
+ - `{{identity}}` — replaced with the full contents of `~/.jarvis/data/identity.md`. This is freeform text that describes the agent's persona and behavior.
153
+ - `{{skills}}` — replaced with a rendered list of available skills (name + description) loaded from `~/.jarvis/data/skills/`. This lets the model know which skills exist and what they do without embedding full skill content in every request.
154
+ - `{{session_id}}` — replaced with the current session UUID.
155
+ - `{{user_info}}` — replaced with the current contents of `user-info.json`. If no user info exists, replaced with `(none yet)`.
156
+
141
157
  ## Tools
142
158
 
143
159
  All tools — built-in and user-defined — live in a single registry file (`tools.json`) and are executed via the same `new Function()` path. There is no separate execution mechanism for built-ins.
144
160
 
145
161
  **Built-in tools** (seeded into `tools.json` on first server start if missing):
146
162
 
147
- - `get_recent_sessions` — returns the most recent sessions (default: last 2, configurable via `limit`)
148
- - `read_session_log` — returns JSONL log entries for a given session; the agent-accessible way to inspect failures and previous run summaries
163
+ - `list_dir` — lists directory contents (ls -la)
164
+ - `exec` — runs arbitrary shell commands; 5-minute timeout
165
+ - `write_file` — writes a file directly via fs.promises.writeFile, bypassing shell escaping; supports optional `mode` parameter for executable scripts
149
166
  - `save_user_info` — persists user facts to `user-info.json`
150
167
  - `read_user_info` — returns all stored user facts
151
- - `exec` — runs arbitrary shell commands as the server user; no safeguards
168
+ - `get_recent_sessions` — returns the most recent sessions
169
+ - `read_session_log` — returns JSONL log entries for a given session
170
+ - `npm_install` — installs an npm package into the jarvis project directory
171
+ - `system_install` — installs a system binary via brew/apt-get/snap; 5-minute timeout
172
+ - `perplexity_search` — web search via Perplexity AI
173
+ - `read_skill` — reads the full content of a skill by name from `~/.jarvis/data/skills/<name>/skill.md`
174
+ - `get_current_time` — returns current server time; used before scheduling crons with relative times
175
+ - `create_cron` — creates a scheduled cron job and writes to `crons.json`; activates immediately without restart
176
+ - `list_crons` — lists all scheduled cron jobs
177
+ - `delete_cron` — removes a cron job by name or id
178
+ - `send_telegram_message` — sends a proactive message to the Telegram user; used inside cron prompts
179
+ - `read_cron_log` — reads the JSONL execution log for a given cron id
152
180
 
153
181
  If a built-in entry is missing from `tools.json` at startup, the server re-seeds it from its default definition. This means built-ins can be inspected and edited in place, and will be restored if accidentally deleted.
154
182
 
@@ -306,6 +334,10 @@ To support persistent tracking (like `handoffCount`), each file contains a JSON
306
334
 
307
335
  The system prompt is stored as the first message in the `messages` array. The full turn sequence — user → assistant (with tool_calls) → tool → assistant (final) — is stored verbatim so that subsequent requests can be sent to the provider without any transformation.
308
336
 
337
+ ## Sliding Window
338
+
339
+ `prepareMessages()` applies a sliding window before every model call: it always includes the system prompt (`messages[0]`) plus the most recent `contextWindow` messages (default 100, configurable via `settings.json`). The full message history is always preserved on disk — only what is sent to the model is trimmed. This prevents context overflow on long sessions without losing data.
340
+
309
341
  ## Provider Message Format
310
342
 
311
343
  When sending the conversation to OpenRouter, messages must follow the OpenAI-compatible chat format.
@@ -599,3 +631,10 @@ Tool inputs/outputs:
599
631
  - `get_recent_sessions`
600
632
  - Input: `{ "limit": 2 }`
601
633
  - Output: `{ "status": "ok", "sessions": [{ "sessionId": "...", "title": "...", "lastTs": "..." }] }`
634
+
635
+ ## See Also
636
+
637
+ - [docs/system-prompt.md](./system-prompt.md) — the authoritative system prompt text
638
+ - [docs/identity.md](./identity.md) — the agent's persona and identity configuration (`~/.jarvis/data/identity.md`)
639
+ - [docs/skills.md](./skills.md) — the skills system (per-skill `skill.md` files, how they are listed and read)
640
+ - [docs/crons.md](./crons.md) — the cron scheduler (job format, `crons.json`, execution loop, logging)
package/docs/crons.md ADDED
@@ -0,0 +1,100 @@
1
+ # Crons
2
+
3
+ Crons let you schedule recurring or one-time tasks. The agent executes the task autonomously and optionally notifies you via Telegram.
4
+
5
+ ## Storage
6
+
7
+ All cron jobs are stored in `~/.jarvis/data/crons.json`:
8
+
9
+ ```json
10
+ [
11
+ {
12
+ "id": "550e8400-e29b-41d4-a716-446655440000",
13
+ "name": "backup-nightly",
14
+ "schedule": "0 3 * * *",
15
+ "prompt": "Backup folder /home/xyz to /backups/xyz. When done, use send_telegram_message to notify the user with the result.",
16
+ "once": false,
17
+ "createdAt": "2026-03-11T10:00:00.000Z"
18
+ }
19
+ ]
20
+ ```
21
+
22
+ ## How a Cron Runs
23
+
24
+ When a cron fires:
25
+
26
+ 1. A **fresh agent run** starts with no prior conversation context — only the stored `prompt`
27
+ 2. The agent executes the task, optionally calling `send_telegram_message` to notify you
28
+ 3. The result is logged to `~/.jarvis/logs/cron-<id>.jsonl`
29
+ 4. A synthetic message is appended to your Telegram session so the agent has context if you reply:
30
+
31
+ ```
32
+ [Cron "backup-nightly" | 2026-03-11 03:00] Backup completed. 2.3GB written to /backups/xyz.
33
+ ```
34
+
35
+ 5. If `once: true`, the cron deletes itself after firing
36
+
37
+ ## Scheduling
38
+
39
+ Crons use standard cron expressions:
40
+
41
+ | Expression | Meaning |
42
+ |---|---|
43
+ | `0 3 * * *` | Every day at 3am |
44
+ | `0 */2 * * *` | Every 2 hours |
45
+ | `0 9 * * 1` | Every Monday at 9am |
46
+ | `30 14 11 3 *` | March 11 at 14:30 (fires only once when combined with `once: true`; otherwise repeats every year) |
47
+
48
+ For one-time tasks specified as relative times ("in 2 hours", "at 3pm today"), the agent calls `get_current_time` first, calculates the exact schedule, and sets `once: true`.
49
+
50
+ ## Notifications
51
+
52
+ Notification is opt-in via the prompt. Include this in the prompt when you want a notification:
53
+
54
+ > "When done, use `send_telegram_message` to notify the user with the result."
55
+
56
+ If you don't want a notification, omit it. The agent follows the prompt literally — conditional notifications work naturally:
57
+
58
+ > "Check disk usage. If any partition is above 90%, use `send_telegram_message` to alert the user. Otherwise do nothing."
59
+
60
+ ## Dynamic Scheduling
61
+
62
+ When `create_cron` runs successfully, the agent loop immediately registers the new cron in the in-memory scheduler — no server restart required. `delete_cron` unregisters it immediately as well.
63
+
64
+ On server restart, all crons in `crons.json` are re-loaded and rescheduled. `once: true` crons that already fired (and deleted themselves) are gone from the file and will not re-run.
65
+
66
+ ## Logs
67
+
68
+ Each cron has its own JSONL log at `~/.jarvis/logs/cron-<id>.jsonl`. One entry per run:
69
+
70
+ ```json
71
+ {
72
+ "ts": "2026-03-11T03:00:01.234Z",
73
+ "cronName": "backup-nightly",
74
+ "status": "ok",
75
+ "response": "Backup completed. 2.3GB written to /backups/xyz.",
76
+ "logSummary": "Ran rsync from /home/xyz to /backups/xyz. Exit code 0."
77
+ }
78
+ ```
79
+
80
+ Use `read_cron_log` to inspect past runs. Ask Jarvis "did my backup run last night?" and it will call `list_crons` + `read_cron_log`.
81
+
82
+ ## Tools
83
+
84
+ | Tool | Purpose |
85
+ |---|---|
86
+ | `create_cron` | Schedule a new cron job |
87
+ | `list_crons` | List all active crons |
88
+ | `delete_cron` | Remove a cron by name or id |
89
+ | `read_cron_log` | Read execution history for a cron |
90
+ | `get_current_time` | Get current server time for relative scheduling |
91
+ | `send_telegram_message` | Send a proactive message to the Telegram user |
92
+
93
+ ## Triggering Without Saying "Cron"
94
+
95
+ The system prompt instructs the agent to recognise scheduling intent from natural language. Examples that will create a cron:
96
+
97
+ - "every night at 3am, backup my projects folder"
98
+ - "remind me in 2 hours"
99
+ - "check my server disk usage every day and alert me if it's getting full"
100
+ - "send me a good morning message every day at 8am"
@@ -0,0 +1,38 @@
1
+ # Identity
2
+
3
+ This document describes how Jarvis's identity is defined and injected.
4
+
5
+ ## What It Is
6
+
7
+ `~/.jarvis/data/identity.md` is a plain Markdown file that defines who the agent is — its name, purpose, tone, and communication style. It is loaded at runtime and injected into the system prompt via the `{{identity}}` placeholder on every request.
8
+
9
+ This means you can change how Jarvis behaves without touching the system prompt or restarting the server. Editing `identity.md` takes effect on the next message.
10
+
11
+ ## Default Content
12
+
13
+ Created automatically on first server start if the file does not exist:
14
+
15
+ ```md
16
+ # Identity
17
+
18
+ You are Jarvis, a fully autonomous agent running on a local server. You have access to tools and can execute shell commands on the machine you run on.
19
+
20
+ Be concise and direct in your responses. Avoid unnecessary filler. When a task is done, say so clearly.
21
+ ```
22
+
23
+ ## How It Is Injected
24
+
25
+ `resolveSystemPrompt()` in `src/server/config.js` reads `identity.md` at call time and substitutes it for `{{identity}}` in the system prompt template. The resolved prompt is sent to the model but never written to disk — the placeholder is always preserved in the stored session history.
26
+
27
+ ## Customisation
28
+
29
+ Edit `~/.jarvis/data/identity.md` directly. Examples of what you can change:
30
+
31
+ - **Name** — rename the agent to anything
32
+ - **Tone** — formal, casual, verbose, terse
33
+ - **Domain** — focus the agent on a specific area (e.g. "You are a security researcher...")
34
+ - **Personality** — add quirks, communication preferences, or constraints
35
+
36
+ ## What Belongs Here vs. the System Prompt
37
+
38
+ `identity.md` is for **who the agent is**. The system prompt (`docs/system-prompt.md`) is for **how the agent must behave** — response format, tool use rules, exec safety, failure recovery. Keep technical rules in the system prompt where they cannot be accidentally deleted.
package/docs/skills.md ADDED
@@ -0,0 +1,77 @@
1
+ # Skills
2
+
3
+ Skills are predefined workflows that guide how the agent approaches specific tasks. Unlike tools (which execute code), skills are instructions written in Markdown — they tell the agent how to do something rather than doing it directly.
4
+
5
+ ## Folder Structure
6
+
7
+ Each skill lives in its own subdirectory under `~/.jarvis/data/skills/`:
8
+
9
+ ```
10
+ ~/.jarvis/data/skills/
11
+ <skill-name>/
12
+ skill.md ← required: frontmatter + instructions
13
+ *.js / *.sh ← optional: bundled scripts the skill references
14
+ ```
15
+
16
+ ## skill.md Format
17
+
18
+ Every `skill.md` starts with YAML frontmatter:
19
+
20
+ ```yaml
21
+ ---
22
+ name: skill-name
23
+ description: What this skill does and when to use it. Use this when the user asks to...
24
+ ---
25
+
26
+ # Skill Title
27
+
28
+ Instructions for the agent...
29
+ ```
30
+
31
+ The `description` field is the only signal the agent has to decide whether to load the skill. Write it so the agent reliably recognises when the skill applies — be specific about the task type and include a "Use this when..." clause.
32
+
33
+ Bad: `"Manages ports."`
34
+ Good: `"Scan a target host for open ports using nmap and return a structured report. Use this when the user asks to scan ports or check what services are running on a host."`
35
+
36
+ ## How Skills Are Used
37
+
38
+ At runtime, `resolveSystemPrompt()` reads all skill directories and builds a list of available skills (name + description only) injected via the `{{skills}}` placeholder in the system prompt. The agent sees this list on every request and decides which skill (if any) is relevant.
39
+
40
+ When the agent decides to use a skill, it calls the `read_skill` tool to fetch the full instructions:
41
+
42
+ ```json
43
+ { "name": "skill-name" }
44
+ ```
45
+
46
+ The tool returns the full `skill.md` content. The agent then follows the instructions.
47
+
48
+ This two-step approach (list in system prompt → full content on demand) keeps the prompt small while making all skills discoverable.
49
+
50
+ ## Bundled Scripts
51
+
52
+ A skill folder can contain scripts that the skill's instructions reference. Scripts are called via `exec`:
53
+
54
+ ```sh
55
+ node ~/.jarvis/data/skills/<name>/script.js <args>
56
+ ```
57
+
58
+ Always reference scripts by their absolute path. Use `write_file` to create scripts — never `exec+echo`.
59
+
60
+ ## Seed Skills
61
+
62
+ Two skills are created on first server start if they do not exist:
63
+
64
+ - **`add-two-integers`** — example skill demonstrating the skill + bundled script pattern
65
+ - **`manage-skill`** — create, edit, or delete skills; includes guidance on what makes a good skill
66
+
67
+ ## Creating and Managing Skills
68
+
69
+ Use the `manage-skill` skill. The agent will read it when asked to create, edit, or list skills.
70
+
71
+ ## What Makes a Good Skill
72
+
73
+ - Describes a **workflow or approach**, not a single command
74
+ - Name is specific and lowercase with hyphens (`scan-open-ports`, not `scanning`)
75
+ - Description reliably signals to the agent when to use it (see example above)
76
+ - Instructions are written for the agent, not the user
77
+ - Uses `write_file` for any file creation inside the skill workflow
@@ -2,21 +2,43 @@
2
2
 
3
3
  This is the authoritative system prompt sent to the model at the start of every session. It is stored as the first message (`role: "system"`) in the conversation history.
4
4
 
5
- Before sending to the model, the server replaces the `{{user_info}}` and `{{session_id}}` placeholders at runtime on every request — these are never stored in the conversation history.
5
+ Before sending to the model, the server replaces the `{{identity}}`, `{{skills}}`, `{{user_info}}` and `{{session_id}}` placeholders at runtime on every request — the resolved values are never stored in the conversation history.
6
6
 
7
7
  ---
8
8
 
9
9
  ```
10
- You are Jarvis, a fully autonomous agent running on a local server. You have access to tools and can execute shell commands on the machine you run on.
10
+ ## Identity
11
+
12
+ {{identity}}
11
13
 
12
14
  ## Session
13
15
 
14
16
  Current session ID: {{session_id}}
15
17
 
18
+ Only the most recent messages are included in your context (sliding window). Older messages are stored on disk but not sent to you. If the user references something you cannot find in the conversation, explain that it may have scrolled out of your context window and ask them to repeat the relevant detail.
19
+
16
20
  ## Known User Context
17
21
 
18
22
  {{user_info}}
19
23
 
24
+ ## Crons
25
+
26
+ You can schedule recurring or one-time tasks using cron jobs.
27
+
28
+ - Use `create_cron` when the user wants to schedule something — even if they don't say "cron". Triggers: "every night", "every 2 hours", "remind me at 3pm", "notify me in 2 hours", "check X every Monday", etc.
29
+ - Call `get_current_time` first when the user specifies a relative time (e.g. "in 2 hours") so you can calculate the correct cron expression.
30
+ - The `prompt` stored in the cron is executed by a fresh agent with no prior conversation context. Write it as a complete, self-contained instruction.
31
+ - If the user wants to be notified, include "use send_telegram_message to notify the user with the result" in the prompt. If they explicitly don't want a notification, omit it.
32
+ - For one-time tasks, set `once: true` — the cron deletes itself after firing.
33
+ - Use `list_crons` to show active crons, `delete_cron` to remove one, `read_cron_log` to inspect past runs.
34
+
35
+ ## Skills
36
+
37
+ Skills are predefined workflows that guide how you approach specific tasks. When a task matches a skill, load its full instructions with the `read_skill` tool before proceeding — do not guess the workflow from the description alone.
38
+
39
+ Available skills:
40
+ {{skills}}
41
+
20
42
  ## Response Format
21
43
 
22
44
  There are two types of responses depending on whether you need to use tools:
@@ -40,7 +62,7 @@ You have access to a set of tools. Each tool has a name and description that tel
40
62
 
41
63
  - Always use a tool to perform an action. Never claim to have done something without actually calling the relevant tool.
42
64
  - Call tools one at a time. You will receive the result before deciding on the next step.
43
- - After a tool call, verify the result before declaring the task done.
65
+ - After a tool call, verify the result before declaring the task done. Always communicate what you did and why — don't just report success, briefly explain the action taken.
44
66
  - Stop as soon as the task is complete and verified. Do not do extra work that was not asked for.
45
67
  - If a tool fails, record the error in `logSummary` and decide whether to retry with a corrected call or explain the failure to the user.
46
68
  - If the user shares personal information, persist it using the appropriate tool.
@@ -88,16 +110,6 @@ When a tool or command fails:
88
110
  - **Use `perplexity_search` sparingly.** At most 3 searches per topic per session. If the first search didn't give you what you need, try a different query angle once — then stop searching and work with what you have or report the gap.
89
111
  - **Escalate cleanly.** If you cannot make progress after two distinct approaches, give the user a clear explanation of what was attempted, what failed, and what they can do manually. A useful failure report is better than an infinite retry loop.
90
112
 
91
- ## Tool Creation
92
-
93
- When building a custom tool with `save_tool`:
94
-
95
- - **Prefer npm packages** over reimplementing functionality from scratch. If a well-known package exists for the task (e.g. an API SDK, a parser, a utility library), use it.
96
- - **Installing an npm package**: use the `npm_install` tool — it handles the correct install directory automatically. Then create the tool with `save_tool`. The tool code can `require('<package-name>')` directly.
97
- - **Installing a system binary** (e.g. nuclei, jq, ffmpeg, git): use the `system_install` tool — never use exec for this. It auto-detects the available package manager (brew/apt-get/snap) and has a 5-minute timeout sized for real downloads.
98
- - **Available bindings in tool code**: `args`, `fs`, `path`, `process`, `require`, `__jarvisDir` (absolute path to the jarvis server directory).
99
- - **Long-running custom tools**: if your tool wraps an operation that takes more than 60 seconds (e.g. a network call, a slow computation), pass `timeout` in milliseconds to `save_tool` (max 600000 = 10 minutes). Example: `save_tool({ name: "run_scan", timeout: 300000, ... })`.
100
-
101
113
  ## logSummary Guidelines
102
114
 
103
115
  The `logSummary` is written for a human observer, not for the user. It must:
package/docs/telegram.md CHANGED
@@ -18,7 +18,7 @@ The channel calls the agent layer directly (no HTTP hop) — it imports and call
18
18
 
19
19
  ```
20
20
  Telegram user
21
- ↓ (text message)
21
+ ↓ (text or photo message)
22
22
  Telegram Bot API ←→ grammy-runner (long polling)
23
23
 
24
24
  Channel adapter (src/channels/telegram/index.js)
@@ -196,6 +196,20 @@ Log lines use a simple prefix format, written to stdout (captured by PM2 alongsi
196
196
 
197
197
  No JSONL session logging — that is handled by the agent layer for every run.
198
198
 
199
+ ## Proactive Notifications
200
+
201
+ The Telegram channel supports proactive outbound messages initiated by the agent, not by the user. This is used by the cron system.
202
+
203
+ **`send_telegram_message` tool**: any agent run (including cron runs) can call this tool to send a message directly to the configured Telegram user. The tool reads the bot token from the `TELEGRAM_BOT_TOKEN` environment variable and the chat_id from `channels.telegram.allowedUserIds[0]` in `settings.json`. For private Telegram chats, `chat_id === user_id`.
204
+
205
+ **Synthetic cron messages**: after a cron run completes, the cron runner appends a synthetic assistant message to the user's normal Telegram session so the agent has context if the user replies:
206
+
207
+ ```
208
+ [Cron "backup-nightly" | 2026-03-11 03:00] Backup completed successfully. 2.3GB written to /backups/xyz.
209
+ ```
210
+
211
+ This uses the session queue (`withSessionLock`) to avoid race conditions if the user is chatting simultaneously.
212
+
199
213
  ## Commands
200
214
 
201
215
  ### `/new` — Start a fresh session
@@ -204,6 +218,10 @@ Sending `/new` resets the conversation. The `chat_id → sessionId` mapping for
204
218
 
205
219
  The next text message after `/new` will create a new session as if the user were messaging for the first time.
206
220
 
221
+ ### `/usage` — Show token usage
222
+
223
+ Sending `/usage` displays the token usage for the current session: input tokens, output tokens, total, and (if non-zero) Anthropic prompt cache read/write tokens. If no session exists or no tokens have been recorded yet, a short message is shown instead.
224
+
207
225
  **Command registration**
208
226
 
209
227
  Commands are registered with the Telegram Bot API at startup via `bot.api.setMyCommands()`. This makes them visible to users in two places:
@@ -216,6 +234,7 @@ Without registration the command still works if typed manually, but users would
216
234
  ```js
217
235
  await bot.api.setMyCommands([
218
236
  { command: 'new', description: 'Start a fresh session' },
237
+ { command: 'usage', description: 'Show token usage for this session' },
219
238
  ]);
220
239
  ```
221
240
 
@@ -227,9 +246,49 @@ await bot.api.setMyCommands([
227
246
  | User sends `/new`, no session exists yet | No-op, same confirmation sent |
228
247
  | Next text message after `/new` | New session created, mapped to `chat_id` |
229
248
 
249
+ ## Photo Support
250
+
251
+ The bot handles incoming photos (`message:photo`) in addition to text. When a user sends a photo, the adapter selects the highest resolution that is at most 800px wide to keep token usage reasonable, downloads it, and passes the image (as a base64 data URL) together with the optional caption to the agent as multimodal content.
252
+
253
+ ### Photo selection
254
+
255
+ Telegram always delivers multiple resolutions of every photo as an array of `PhotoSize` objects, sorted ascending by resolution. The adapter picks the last entry with `width <= 800`:
256
+
257
+ ```js
258
+ const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
259
+ ?? ctx.message.photo[0]; // fallback: smallest if all variants exceed 800px
260
+ ```
261
+
262
+ This gives the highest quality image at or below the 800px threshold. Sending the full-resolution original would consume significantly more tokens for no practical benefit in most tasks.
263
+
264
+ ### Download and base64 encoding
265
+
266
+ The image is downloaded immediately at receive time using the Telegram file URL (`https://api.telegram.org/file/bot<token>/<file_path>`) and converted to a base64 data URL (`data:image/jpeg;base64,...`). The data URL is stored directly in the session message, so the image remains available across handoffs and future conversation turns without depending on a Telegram URL that would expire after ~1 hour. Base64 encoding does not cost more tokens than a URL — image token cost is based on pixel dimensions, not transport format.
267
+
268
+ ### Agent call
269
+
270
+ Photos are passed to the agent as a multimodal content array instead of a plain string:
271
+
272
+ ```js
273
+ const content = [
274
+ { type: 'image_url', url: fileUrl },
275
+ ];
276
+ if (caption) content.push({ type: 'text', text: caption });
277
+ ```
278
+
279
+ The agent layer must support receiving `content` as either a string or a content array and pass it through to the model accordingly.
280
+
281
+ ### Caption
282
+
283
+ If the user attaches a caption to the photo (`ctx.message.caption`), it is included as a text block alongside the image. If there is no caption, only the image block is sent.
284
+
285
+ ### Unsupported media types
286
+
287
+ Documents, audio, video, stickers, and other non-photo media types are not handled — the bot silently ignores them (same as unauthorized messages).
288
+
230
289
  ## Non-Goals (v1)
231
290
 
232
- - No support for photos, files, or other media types (text only)
291
+ - No support for documents, audio, video, or other non-photo media types
233
292
  - No inline keyboards or callback queries
234
293
  - No group chat support (only private chats)
235
294
  - No message editing or deletion handling
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ducci/jarvis",
3
- "version": "1.0.38",
3
+ "version": "1.0.40",
4
4
  "description": "A fully automated agent system that lives on a server.",
5
5
  "main": "./src/index.js",
6
6
  "type": "module",
@@ -44,6 +44,7 @@
44
44
  "express": "^5.2.1",
45
45
  "grammy": "^1.40.1",
46
46
  "inquirer": "^12.11.1",
47
+ "node-cron": "^4.2.1",
47
48
  "openai": "^6.22.0",
48
49
  "pm2": "^6.0.14"
49
50
  },
@@ -60,6 +60,71 @@ export async function startTelegramChannel(config) {
60
60
  await ctx.reply('New session started.');
61
61
  });
62
62
 
63
+ bot.on('message:photo', async (ctx) => {
64
+ const userId = ctx.from?.id;
65
+ if (!allowedUserIds.includes(userId)) return;
66
+
67
+ const chatId = ctx.chat.id;
68
+ const sessionId = sessions[chatId] || null;
69
+
70
+ console.log(`[telegram] incoming photo chat_id=${chatId}`);
71
+
72
+ await ctx.api.sendChatAction(chatId, 'typing');
73
+ const typingInterval = setInterval(() => {
74
+ ctx.api.sendChatAction(chatId, 'typing').catch(() => {});
75
+ }, 4000);
76
+
77
+ let result;
78
+ try {
79
+ const photo = ctx.message.photo.filter(p => p.width <= 800).at(-1)
80
+ ?? ctx.message.photo[0];
81
+ const file = await ctx.api.getFile(photo.file_id);
82
+ const fileUrl = `https://api.telegram.org/file/bot${token}/${file.file_path}`;
83
+ const imgResponse = await fetch(fileUrl);
84
+ const buffer = await imgResponse.arrayBuffer();
85
+ const base64 = Buffer.from(buffer).toString('base64');
86
+ const dataUrl = `data:image/jpeg;base64,${base64}`;
87
+ const caption = ctx.message.caption || '';
88
+ result = await handleChat(config, sessionId, caption, [{ url: dataUrl }]);
89
+ } catch (e) {
90
+ console.error(`[telegram] agent error chat_id=${chatId}: ${e.message}`);
91
+ const errText = e.message
92
+ ? `Sorry, something went wrong: ${e.message}`
93
+ : 'Sorry, something went wrong. Please try again.';
94
+ await ctx.reply(errText).catch(() => {});
95
+ clearInterval(typingInterval);
96
+ return;
97
+ }
98
+
99
+ if (!sessions[chatId]) {
100
+ sessions[chatId] = result.sessionId;
101
+ save(sessions);
102
+ console.log(`[telegram] session created sessionId=${result.sessionId.slice(0, 8)}`);
103
+ }
104
+
105
+ try {
106
+ const MAX_TG = 4096;
107
+ const rawResponse = typeof result.response === 'string'
108
+ ? result.response
109
+ : result.response != null ? JSON.stringify(result.response, null, 2) : '';
110
+ const text = rawResponse.trim()
111
+ || 'The agent encountered an error and could not produce a response. Please try again.';
112
+ if (text.length <= MAX_TG) {
113
+ await ctx.reply(text);
114
+ } else {
115
+ for (let i = 0; i < text.length; i += MAX_TG) {
116
+ await ctx.reply(text.slice(i, i + MAX_TG));
117
+ }
118
+ }
119
+ console.log(`[telegram] response sent chat_id=${chatId} length=${text.length}`);
120
+ } catch (e) {
121
+ console.error(`[telegram] delivery error chat_id=${chatId}: ${e.message}`);
122
+ await ctx.reply('Sorry, something went wrong sending the response. Please try again.').catch(() => {});
123
+ } finally {
124
+ clearInterval(typingInterval);
125
+ }
126
+ });
127
+
63
128
  bot.on('message:text', async (ctx) => {
64
129
  const userId = ctx.from?.id;
65
130