milaidy 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253) hide show
  1. package/LICENSE +8 -0
  2. package/README.md +538 -0
  3. package/dist/argv-CfSowvEA.js +63 -0
  4. package/dist/config-B-mboG4v.js +4 -0
  5. package/dist/eliza-CPJjgw-e.js +1491 -0
  6. package/dist/eliza.js +2192 -0
  7. package/dist/entry.js +232 -0
  8. package/dist/index.js +209 -0
  9. package/dist/links-BFKlWqSe.js +15 -0
  10. package/dist/paths-D_yh1DEJ.js +69 -0
  11. package/dist/plugins-cli-B7kSre2c.js +134 -0
  12. package/dist/program-6KwWwKKh.js +510 -0
  13. package/dist/register.agents-CPVmSjMG.js +17 -0
  14. package/dist/register.browser-B2ooXxNx.js +15 -0
  15. package/dist/register.channels-CMYQ6K6Y.js +42 -0
  16. package/dist/register.cron-D91lY1_Y.js +9 -0
  17. package/dist/register.devices-rU5I5L_y.js +13 -0
  18. package/dist/register.gateway-82SLAvw3.js +22 -0
  19. package/dist/register.hooks-B_XTBEkt.js +9 -0
  20. package/dist/register.logs-BgEGcPd8.js +10 -0
  21. package/dist/register.models-BJt9eVgZ.js +26 -0
  22. package/dist/register.nodes-B5xY1s8a.js +9 -0
  23. package/dist/register.skills-SFQqYIhg.js +10 -0
  24. package/dist/register.subclis-uF_AsbWR.js +187 -0
  25. package/dist/run-main-XODklzS-.js +56 -0
  26. package/dist/theme-DBvtuGeq.js +36 -0
  27. package/dist/utils-C1AUpp_V.js +42 -0
  28. package/dist/version-Cpn3yr5D.js +26 -0
  29. package/dist/workspace-Co3Wul2D.js +206 -0
  30. package/dist/workspace-DCA6MNVK.js +350 -0
  31. package/docs/.i18n/README.md +31 -0
  32. package/docs/.i18n/glossary.zh-CN.json +210 -0
  33. package/docs/.i18n/zh-CN.tm.jsonl +1329 -0
  34. package/docs/CNAME +1 -0
  35. package/docs/automation/cron-jobs.md +468 -0
  36. package/docs/automation/cron-vs-heartbeat.md +254 -0
  37. package/docs/automation/gmail-pubsub.md +256 -0
  38. package/docs/automation/poll.md +69 -0
  39. package/docs/automation/webhook.md +163 -0
  40. package/docs/bedrock.md +176 -0
  41. package/docs/brave-search.md +41 -0
  42. package/docs/broadcast-groups.md +442 -0
  43. package/docs/cli/acp.md +170 -0
  44. package/docs/cli/agent.md +24 -0
  45. package/docs/cli/agents.md +75 -0
  46. package/docs/cli/approvals.md +50 -0
  47. package/docs/cli/browser.md +107 -0
  48. package/docs/cli/channels.md +79 -0
  49. package/docs/cli/config.md +50 -0
  50. package/docs/cli/configure.md +33 -0
  51. package/docs/cli/cron.md +42 -0
  52. package/docs/cli/dashboard.md +16 -0
  53. package/docs/cli/devices.md +67 -0
  54. package/docs/cli/directory.md +63 -0
  55. package/docs/cli/dns.md +23 -0
  56. package/docs/cli/docs.md +15 -0
  57. package/docs/cli/doctor.md +41 -0
  58. package/docs/cli/gateway.md +199 -0
  59. package/docs/cli/health.md +21 -0
  60. package/docs/cli/hooks.md +291 -0
  61. package/docs/cli/index.md +1029 -0
  62. package/docs/cli/logs.md +24 -0
  63. package/docs/cli/memory.md +45 -0
  64. package/docs/cli/message.md +239 -0
  65. package/docs/cli/models.md +79 -0
  66. package/docs/cli/node.md +112 -0
  67. package/docs/cli/nodes.md +73 -0
  68. package/docs/cli/onboard.md +29 -0
  69. package/docs/cli/pairing.md +21 -0
  70. package/docs/cli/plugins.md +62 -0
  71. package/docs/cli/reset.md +17 -0
  72. package/docs/cli/sandbox.md +152 -0
  73. package/docs/cli/security.md +26 -0
  74. package/docs/cli/sessions.md +16 -0
  75. package/docs/cli/setup.md +29 -0
  76. package/docs/cli/skills.md +26 -0
  77. package/docs/cli/status.md +26 -0
  78. package/docs/cli/system.md +60 -0
  79. package/docs/cli/tui.md +23 -0
  80. package/docs/cli/uninstall.md +17 -0
  81. package/docs/cli/update.md +98 -0
  82. package/docs/cli/voicecall.md +34 -0
  83. package/docs/cli/webhooks.md +25 -0
  84. package/docs/concepts/agent-loop.md +146 -0
  85. package/docs/concepts/agent-workspace.md +229 -0
  86. package/docs/concepts/agent.md +122 -0
  87. package/docs/concepts/architecture.md +129 -0
  88. package/docs/concepts/channel-routing.md +114 -0
  89. package/docs/concepts/compaction.md +61 -0
  90. package/docs/concepts/context.md +159 -0
  91. package/docs/concepts/features.md +53 -0
  92. package/docs/concepts/group-messages.md +84 -0
  93. package/docs/concepts/groups.md +373 -0
  94. package/docs/concepts/markdown-formatting.md +130 -0
  95. package/docs/concepts/memory.md +546 -0
  96. package/docs/concepts/messages.md +154 -0
  97. package/docs/concepts/model-failover.md +149 -0
  98. package/docs/concepts/model-providers.md +315 -0
  99. package/docs/concepts/models.md +208 -0
  100. package/docs/concepts/multi-agent.md +376 -0
  101. package/docs/concepts/oauth.md +145 -0
  102. package/docs/concepts/plugins.md +454 -0
  103. package/docs/concepts/presence.md +102 -0
  104. package/docs/concepts/queue.md +89 -0
  105. package/docs/concepts/retry.md +69 -0
  106. package/docs/concepts/secrets.md +300 -0
  107. package/docs/concepts/session-pruning.md +122 -0
  108. package/docs/concepts/session-tool.md +193 -0
  109. package/docs/concepts/session.md +188 -0
  110. package/docs/concepts/sessions.md +10 -0
  111. package/docs/concepts/skills.md +392 -0
  112. package/docs/concepts/streaming.md +135 -0
  113. package/docs/concepts/system-prompt.md +114 -0
  114. package/docs/concepts/timezone.md +91 -0
  115. package/docs/concepts/typebox.md +289 -0
  116. package/docs/concepts/typing-indicators.md +68 -0
  117. package/docs/concepts/usage-tracking.md +35 -0
  118. package/docs/custom.css +4 -0
  119. package/docs/date-time.md +128 -0
  120. package/docs/debugging.md +162 -0
  121. package/docs/docs.json +1599 -0
  122. package/docs/environment.md +81 -0
  123. package/docs/hooks.md +876 -0
  124. package/docs/index.md +179 -0
  125. package/docs/install/ansible.md +208 -0
  126. package/docs/install/bun.md +59 -0
  127. package/docs/install/development-channels.md +75 -0
  128. package/docs/install/docker.md +567 -0
  129. package/docs/install/index.md +185 -0
  130. package/docs/install/installer.md +123 -0
  131. package/docs/install/migrating.md +192 -0
  132. package/docs/install/nix.md +96 -0
  133. package/docs/install/node.md +78 -0
  134. package/docs/install/uninstall.md +128 -0
  135. package/docs/install/updating.md +228 -0
  136. package/docs/logging.md +350 -0
  137. package/docs/multi-agent-sandbox-tools.md +395 -0
  138. package/docs/network.md +54 -0
  139. package/docs/nodes/audio.md +114 -0
  140. package/docs/nodes/camera.md +156 -0
  141. package/docs/nodes/images.md +72 -0
  142. package/docs/nodes/index.md +341 -0
  143. package/docs/nodes/location-command.md +113 -0
  144. package/docs/nodes/media-understanding.md +379 -0
  145. package/docs/nodes/talk.md +90 -0
  146. package/docs/nodes/voicewake.md +65 -0
  147. package/docs/northflank.mdx +53 -0
  148. package/docs/perplexity.md +80 -0
  149. package/docs/platforms/android.md +129 -0
  150. package/docs/platforms/digitalocean.md +262 -0
  151. package/docs/platforms/exe-dev.md +125 -0
  152. package/docs/platforms/fly.md +486 -0
  153. package/docs/platforms/gcp.md +503 -0
  154. package/docs/platforms/hetzner.md +330 -0
  155. package/docs/platforms/index.md +53 -0
  156. package/docs/platforms/ios.md +106 -0
  157. package/docs/platforms/linux.md +94 -0
  158. package/docs/platforms/mac/bundled-gateway.md +73 -0
  159. package/docs/platforms/mac/canvas.md +125 -0
  160. package/docs/platforms/mac/child-process.md +69 -0
  161. package/docs/platforms/mac/dev-setup.md +102 -0
  162. package/docs/platforms/mac/health.md +34 -0
  163. package/docs/platforms/mac/icon.md +31 -0
  164. package/docs/platforms/mac/logging.md +57 -0
  165. package/docs/platforms/mac/menu-bar.md +81 -0
  166. package/docs/platforms/mac/peekaboo.md +65 -0
  167. package/docs/platforms/mac/permissions.md +44 -0
  168. package/docs/platforms/mac/release.md +85 -0
  169. package/docs/platforms/mac/remote.md +83 -0
  170. package/docs/platforms/mac/signing.md +47 -0
  171. package/docs/platforms/mac/skills.md +33 -0
  172. package/docs/platforms/mac/voice-overlay.md +60 -0
  173. package/docs/platforms/mac/voicewake.md +67 -0
  174. package/docs/platforms/mac/webchat.md +41 -0
  175. package/docs/platforms/mac/xpc.md +61 -0
  176. package/docs/platforms/macos-vm.md +281 -0
  177. package/docs/platforms/macos.md +203 -0
  178. package/docs/platforms/oracle.md +303 -0
  179. package/docs/platforms/raspberry-pi.md +358 -0
  180. package/docs/platforms/windows.md +159 -0
  181. package/docs/plugin.md +651 -0
  182. package/docs/plugins/agent-tools.md +99 -0
  183. package/docs/plugins/manifest.md +71 -0
  184. package/docs/plugins/voice-call.md +273 -0
  185. package/docs/plugins/zalouser.md +70 -0
  186. package/docs/providers/anthropic.md +152 -0
  187. package/docs/providers/claude-max-api-proxy.md +148 -0
  188. package/docs/providers/cloudflare-ai-gateway.md +71 -0
  189. package/docs/providers/deepgram.md +93 -0
  190. package/docs/providers/glm.md +33 -0
  191. package/docs/providers/index.md +63 -0
  192. package/docs/providers/minimax.md +208 -0
  193. package/docs/providers/models.md +51 -0
  194. package/docs/providers/moonshot.md +142 -0
  195. package/docs/providers/ollama.md +223 -0
  196. package/docs/providers/openai.md +62 -0
  197. package/docs/providers/opencode.md +36 -0
  198. package/docs/providers/openrouter.md +37 -0
  199. package/docs/providers/qwen.md +53 -0
  200. package/docs/providers/synthetic.md +99 -0
  201. package/docs/providers/venice.md +267 -0
  202. package/docs/providers/vercel-ai-gateway.md +50 -0
  203. package/docs/providers/xiaomi.md +64 -0
  204. package/docs/providers/zai.md +36 -0
  205. package/docs/railway.mdx +99 -0
  206. package/docs/reference/templates/AGENTS.md +9 -0
  207. package/docs/reference/templates/BOOTSTRAP.md +3 -0
  208. package/docs/reference/templates/HEARTBEAT.md +3 -0
  209. package/docs/reference/templates/IDENTITY.md +3 -0
  210. package/docs/reference/templates/TOOLS.md +3 -0
  211. package/docs/reference/templates/USER.md +3 -0
  212. package/docs/render.mdx +165 -0
  213. package/docs/start/docs-directory.md +63 -0
  214. package/docs/start/getting-started.md +212 -0
  215. package/docs/start/milaidy.md +247 -0
  216. package/docs/start/onboarding.md +258 -0
  217. package/docs/start/pairing.md +86 -0
  218. package/docs/start/quickstart.md +81 -0
  219. package/docs/start/setup.md +149 -0
  220. package/docs/start/showcase.md +416 -0
  221. package/docs/start/wizard.md +418 -0
  222. package/docs/testing.md +368 -0
  223. package/docs/token-use.md +112 -0
  224. package/docs/tools/agent-send.md +53 -0
  225. package/docs/tools/apply-patch.md +50 -0
  226. package/docs/tools/browser-linux-troubleshooting.md +139 -0
  227. package/docs/tools/browser-login.md +68 -0
  228. package/docs/tools/browser.md +576 -0
  229. package/docs/tools/chrome-extension.md +178 -0
  230. package/docs/tools/clawhub.md +257 -0
  231. package/docs/tools/creating-skills.md +54 -0
  232. package/docs/tools/elevated.md +57 -0
  233. package/docs/tools/exec-approvals.md +246 -0
  234. package/docs/tools/exec.md +179 -0
  235. package/docs/tools/firecrawl.md +61 -0
  236. package/docs/tools/index.md +508 -0
  237. package/docs/tools/llm-task.md +115 -0
  238. package/docs/tools/reactions.md +22 -0
  239. package/docs/tools/skills-config.md +76 -0
  240. package/docs/tools/skills.md +300 -0
  241. package/docs/tools/slash-commands.md +196 -0
  242. package/docs/tools/subagents.md +151 -0
  243. package/docs/tools/thinking.md +73 -0
  244. package/docs/tools/web.md +261 -0
  245. package/docs/tui.md +159 -0
  246. package/docs/vps.md +43 -0
  247. package/docs/web/control-ui.md +221 -0
  248. package/docs/web/dashboard.md +46 -0
  249. package/docs/web/index.md +116 -0
  250. package/docs/web/webchat.md +49 -0
  251. package/milaidy.mjs +14 -0
  252. package/package.json +271 -0
  253. package/skills/.cache/catalog.json +88519 -0
@@ -0,0 +1,379 @@
1
+ ---
2
+ summary: "Inbound image/audio/video understanding (optional) with provider + CLI fallbacks"
3
+ read_when:
4
+ - Designing or refactoring media understanding
5
+ - Tuning inbound audio/video/image preprocessing
6
+ title: "Media Understanding"
7
+ ---
8
+
9
+ # Media Understanding (Inbound) — 2026-01-17
10
+
11
+ Milaidy can **summarize inbound media** (image/audio/video) before the reply pipeline runs. It auto‑detects when local tools or provider keys are available, and can be disabled or customized. If understanding is off, models still receive the original files/URLs as usual.
12
+
13
+ ## Goals
14
+
15
+ - Optional: pre‑digest inbound media into short text for faster routing + better command parsing.
16
+ - Preserve original media delivery to the model (always).
17
+ - Support **provider APIs** and **CLI fallbacks**.
18
+ - Allow multiple models with ordered fallback (error/size/timeout).
19
+
20
+ ## High‑level behavior
21
+
22
+ 1. Collect inbound attachments (`MediaPaths`, `MediaUrls`, `MediaTypes`).
23
+ 2. For each enabled capability (image/audio/video), select attachments per policy (default: **first**).
24
+ 3. Choose the first eligible model entry (size + capability + auth).
25
+ 4. If a model fails or the media is too large, **fall back to the next entry**.
26
+ 5. On success:
27
+ - `Body` becomes an `[Image]`, `[Audio]`, or `[Video]` block.
28
+ - Audio sets `{{Transcript}}`; command parsing uses caption text when present,
29
+ otherwise the transcript.
30
+ - Captions are preserved as `User text:` inside the block.
31
+
32
+ If understanding fails or is disabled, **the reply flow continues** with the original body + attachments.
33
+
34
+ ## Config overview
35
+
36
+ `tools.media` supports **shared models** plus per‑capability overrides:
37
+
38
+ - `tools.media.models`: shared model list (use `capabilities` to gate).
39
+ - `tools.media.image` / `tools.media.audio` / `tools.media.video`:
40
+ - defaults (`prompt`, `maxChars`, `maxBytes`, `timeoutSeconds`, `language`)
41
+ - provider overrides (`baseUrl`, `headers`, `providerOptions`)
42
+ - Deepgram audio options via `tools.media.audio.providerOptions.deepgram`
43
+ - optional **per‑capability `models` list** (preferred before shared models)
44
+ - `attachments` policy (`mode`, `maxAttachments`, `prefer`)
45
+ - `scope` (optional gating by channel/chatType/session key)
46
+ - `tools.media.concurrency`: max concurrent capability runs (default **2**).
47
+
48
+ ```json5
49
+ {
50
+ tools: {
51
+ media: {
52
+ models: [
53
+ /* shared list */
54
+ ],
55
+ image: {
56
+ /* optional overrides */
57
+ },
58
+ audio: {
59
+ /* optional overrides */
60
+ },
61
+ video: {
62
+ /* optional overrides */
63
+ },
64
+ },
65
+ },
66
+ }
67
+ ```
68
+
69
+ ### Model entries
70
+
71
+ Each `models[]` entry can be **provider** or **CLI**:
72
+
73
+ ```json5
74
+ {
75
+ type: "provider", // default if omitted
76
+ provider: "openai",
77
+ model: "gpt-5.2",
78
+ prompt: "Describe the image in <= 500 chars.",
79
+ maxChars: 500,
80
+ maxBytes: 10485760,
81
+ timeoutSeconds: 60,
82
+ capabilities: ["image"], // optional, used for multi‑modal entries
83
+ profile: "vision-profile",
84
+ preferredProfile: "vision-fallback",
85
+ }
86
+ ```
87
+
88
+ ```json5
89
+ {
90
+ type: "cli",
91
+ command: "gemini",
92
+ args: [
93
+ "-m",
94
+ "gemini-3-flash",
95
+ "--allowed-tools",
96
+ "read_file",
97
+ "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters.",
98
+ ],
99
+ maxChars: 500,
100
+ maxBytes: 52428800,
101
+ timeoutSeconds: 120,
102
+ capabilities: ["video", "image"],
103
+ }
104
+ ```
105
+
106
+ CLI templates can also use:
107
+
108
+ - `{{MediaDir}}` (directory containing the media file)
109
+ - `{{OutputDir}}` (scratch dir created for this run)
110
+ - `{{OutputBase}}` (scratch file base path, no extension)
111
+
112
+ ## Defaults and limits
113
+
114
+ Recommended defaults:
115
+
116
+ - `maxChars`: **500** for image/video (short, command‑friendly)
117
+ - `maxChars`: **unset** for audio (full transcript unless you set a limit)
118
+ - `maxBytes`:
119
+ - image: **10MB**
120
+ - audio: **20MB**
121
+ - video: **50MB**
122
+
123
+ Rules:
124
+
125
+ - If media exceeds `maxBytes`, that model is skipped and the **next model is tried**.
126
+ - If the model returns more than `maxChars`, output is trimmed.
127
+ - `prompt` defaults to a simple “Describe the {media}.” plus the `maxChars` guidance (image/video only).
128
+ - If `<capability>.enabled` is `true` but no models are configured, Milaidy tries the
129
+ **active reply model** when its provider supports the capability.
130
+
131
+ ### Auto-detect media understanding (default)
132
+
133
+ If `tools.media.<capability>.enabled` is **not** set to `false` and you haven’t
134
+ configured models, Milaidy auto-detects in this order and **stops at the first
135
+ working option**:
136
+
137
+ 1. **Local CLIs** (audio only; if installed)
138
+ - `sherpa-onnx-offline` (requires `SHERPA_ONNX_MODEL_DIR` with encoder/decoder/joiner/tokens)
139
+ - `whisper-cli` (`whisper-cpp`; uses `WHISPER_CPP_MODEL` or the bundled tiny model)
140
+ - `whisper` (Python CLI; downloads models automatically)
141
+ 2. **Gemini CLI** (`gemini`) using `read_many_files`
142
+ 3. **Provider keys**
143
+ - Audio: OpenAI → Groq → Deepgram → Google
144
+ - Image: OpenAI → Anthropic → Google → MiniMax
145
+ - Video: Google
146
+
147
+ To disable auto-detection, set:
148
+
149
+ ```json5
150
+ {
151
+ tools: {
152
+ media: {
153
+ audio: {
154
+ enabled: false,
155
+ },
156
+ },
157
+ },
158
+ }
159
+ ```
160
+
161
+ Note: Binary detection is best-effort across macOS/Linux/Windows; ensure the CLI is on `PATH` (we expand `~`), or set an explicit CLI model with a full command path.
162
+
163
+ ## Capabilities (optional)
164
+
165
+ If you set `capabilities`, the entry only runs for those media types. For shared
166
+ lists, Milaidy can infer defaults:
167
+
168
+ - `openai`, `anthropic`, `minimax`: **image**
169
+ - `google` (Gemini API): **image + audio + video**
170
+ - `groq`: **audio**
171
+ - `deepgram`: **audio**
172
+
173
+ For CLI entries, **set `capabilities` explicitly** to avoid surprising matches.
174
+ If you omit `capabilities`, the entry is eligible for the list it appears in.
175
+
176
+ ## Provider support matrix (Milaidy integrations)
177
+
178
+ | Capability | Provider integration | Notes |
179
+ | ---------- | ------------------------------------------------ | ------------------------------------------------- |
180
+ | Image | OpenAI / Anthropic / Google / others via `pi-ai` | Any image-capable model in the registry works. |
181
+ | Audio | OpenAI, Groq, Deepgram, Google | Provider transcription (Whisper/Deepgram/Gemini). |
182
+ | Video | Google (Gemini API) | Provider video understanding. |
183
+
184
+ ## Recommended providers
185
+
186
+ **Image**
187
+
188
+ - Prefer your active model if it supports images.
189
+ - Good defaults: `openai/gpt-5.2`, `anthropic/claude-opus-4-5`, `google/gemini-3-pro-preview`.
190
+
191
+ **Audio**
192
+
193
+ - `openai/gpt-5-mini-transcribe`, `groq/whisper-large-v3-turbo`, or `deepgram/nova-3`.
194
+ - CLI fallback: `whisper-cli` (whisper-cpp) or `whisper`.
195
+ - Deepgram setup: [Deepgram (audio transcription)](/providers/deepgram).
196
+
197
+ **Video**
198
+
199
+ - `google/gemini-3-flash-preview` (fast), `google/gemini-3-pro-preview` (richer).
200
+ - CLI fallback: `gemini` CLI (supports `read_file` on video/audio).
201
+
202
+ ## Attachment policy
203
+
204
+ Per‑capability `attachments` controls which attachments are processed:
205
+
206
+ - `mode`: `first` (default) or `all`
207
+ - `maxAttachments`: cap the number processed (default **1**)
208
+ - `prefer`: `first`, `last`, `path`, `url`
209
+
210
+ When `mode: "all"`, outputs are labeled `[Image 1/2]`, `[Audio 2/2]`, etc.
211
+
212
+ ## Config examples
213
+
214
+ ### 1) Shared models list + overrides
215
+
216
+ ```json5
217
+ {
218
+ tools: {
219
+ media: {
220
+ models: [
221
+ { provider: "openai", model: "gpt-5.2", capabilities: ["image"] },
222
+ {
223
+ provider: "google",
224
+ model: "gemini-3-flash-preview",
225
+ capabilities: ["image", "audio", "video"],
226
+ },
227
+ {
228
+ type: "cli",
229
+ command: "gemini",
230
+ args: [
231
+ "-m",
232
+ "gemini-3-flash",
233
+ "--allowed-tools",
234
+ "read_file",
235
+ "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters.",
236
+ ],
237
+ capabilities: ["image", "video"],
238
+ },
239
+ ],
240
+ audio: {
241
+ attachments: { mode: "all", maxAttachments: 2 },
242
+ },
243
+ video: {
244
+ maxChars: 500,
245
+ },
246
+ },
247
+ },
248
+ }
249
+ ```
250
+
251
+ ### 2) Audio + Video only (image off)
252
+
253
+ ```json5
254
+ {
255
+ tools: {
256
+ media: {
257
+ audio: {
258
+ enabled: true,
259
+ models: [
260
+ { provider: "openai", model: "gpt-5-mini-transcribe" },
261
+ {
262
+ type: "cli",
263
+ command: "whisper",
264
+ args: ["--model", "base", "{{MediaPath}}"],
265
+ },
266
+ ],
267
+ },
268
+ video: {
269
+ enabled: true,
270
+ maxChars: 500,
271
+ models: [
272
+ { provider: "google", model: "gemini-3-flash-preview" },
273
+ {
274
+ type: "cli",
275
+ command: "gemini",
276
+ args: [
277
+ "-m",
278
+ "gemini-3-flash",
279
+ "--allowed-tools",
280
+ "read_file",
281
+ "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters.",
282
+ ],
283
+ },
284
+ ],
285
+ },
286
+ },
287
+ },
288
+ }
289
+ ```
290
+
291
+ ### 3) Optional image understanding
292
+
293
+ ```json5
294
+ {
295
+ tools: {
296
+ media: {
297
+ image: {
298
+ enabled: true,
299
+ maxBytes: 10485760,
300
+ maxChars: 500,
301
+ models: [
302
+ { provider: "openai", model: "gpt-5.2" },
303
+ { provider: "anthropic", model: "claude-opus-4-5" },
304
+ {
305
+ type: "cli",
306
+ command: "gemini",
307
+ args: [
308
+ "-m",
309
+ "gemini-3-flash",
310
+ "--allowed-tools",
311
+ "read_file",
312
+ "Read the media at {{MediaPath}} and describe it in <= {{MaxChars}} characters.",
313
+ ],
314
+ },
315
+ ],
316
+ },
317
+ },
318
+ },
319
+ }
320
+ ```
321
+
322
+ ### 4) Multi‑modal single entry (explicit capabilities)
323
+
324
+ ```json5
325
+ {
326
+ tools: {
327
+ media: {
328
+ image: {
329
+ models: [
330
+ {
331
+ provider: "google",
332
+ model: "gemini-3-pro-preview",
333
+ capabilities: ["image", "video", "audio"],
334
+ },
335
+ ],
336
+ },
337
+ audio: {
338
+ models: [
339
+ {
340
+ provider: "google",
341
+ model: "gemini-3-pro-preview",
342
+ capabilities: ["image", "video", "audio"],
343
+ },
344
+ ],
345
+ },
346
+ video: {
347
+ models: [
348
+ {
349
+ provider: "google",
350
+ model: "gemini-3-pro-preview",
351
+ capabilities: ["image", "video", "audio"],
352
+ },
353
+ ],
354
+ },
355
+ },
356
+ },
357
+ }
358
+ ```
359
+
360
+ ## Status output
361
+
362
+ When media understanding runs, `/status` includes a short summary line:
363
+
364
+ ```
365
+ 📎 Media: image ok (openai/gpt-5.2) · audio skipped (maxBytes)
366
+ ```
367
+
368
+ This shows per‑capability outcomes and the chosen provider/model when applicable.
369
+
370
+ ## Notes
371
+
372
+ - Understanding is **best‑effort**. Errors do not block replies.
373
+ - Attachments are still passed to models even when understanding is disabled.
374
+ - Use `scope` to limit where understanding runs (e.g. only DMs).
375
+
376
+ ## Related docs
377
+
378
+ - [Configuration](/gateway/configuration)
379
+ - [Image & Media Support](/nodes/images)
@@ -0,0 +1,90 @@
1
+ ---
2
+ summary: "Talk mode: continuous speech conversations with ElevenLabs TTS"
3
+ read_when:
4
+ - Implementing Talk mode on macOS/iOS/Android
5
+ - Changing voice/TTS/interrupt behavior
6
+ title: "Talk Mode"
7
+ ---
8
+
9
+ # Talk Mode
10
+
11
+ Talk mode is a continuous voice conversation loop:
12
+
13
+ 1. Listen for speech
14
+ 2. Send transcript to the model (main session, chat.send)
15
+ 3. Wait for the response
16
+ 4. Speak it via ElevenLabs (streaming playback)
17
+
18
+ ## Behavior (macOS)
19
+
20
+ - **Always-on overlay** while Talk mode is enabled.
21
+ - **Listening → Thinking → Speaking** phase transitions.
22
+ - On a **short pause** (silence window), the current transcript is sent.
23
+ - Replies are **written to WebChat** (same as typing).
24
+ - **Interrupt on speech** (default on): if the user starts talking while the assistant is speaking, we stop playback and note the interruption timestamp for the next prompt.
25
+
26
+ ## Voice directives in replies
27
+
28
+ The assistant may prefix its reply with a **single JSON line** to control voice:
29
+
30
+ ```json
31
+ { "voice": "<voice-id>", "once": true }
32
+ ```
33
+
34
+ Rules:
35
+
36
+ - First non-empty line only.
37
+ - Unknown keys are ignored.
38
+ - `once: true` applies to the current reply only.
39
+ - Without `once`, the voice becomes the new default for Talk mode.
40
+ - The JSON line is stripped before TTS playback.
41
+
42
+ Supported keys:
43
+
44
+ - `voice` / `voice_id` / `voiceId`
45
+ - `model` / `model_id` / `modelId`
46
+ - `speed`, `rate` (WPM), `stability`, `similarity`, `style`, `speakerBoost`
47
+ - `seed`, `normalize`, `lang`, `output_format`, `latency_tier`
48
+ - `once`
49
+
50
+ ## Config (`~/.milaidy/milaidy.json`)
51
+
52
+ ```json5
53
+ {
54
+ talk: {
55
+ voiceId: "elevenlabs_voice_id",
56
+ modelId: "eleven_v3",
57
+ outputFormat: "mp3_44100_128",
58
+ apiKey: "elevenlabs_api_key",
59
+ interruptOnSpeech: true,
60
+ },
61
+ }
62
+ ```
63
+
64
+ Defaults:
65
+
66
+ - `interruptOnSpeech`: true
67
+ - `voiceId`: falls back to `ELEVENLABS_VOICE_ID` / `SAG_VOICE_ID` (or first ElevenLabs voice when API key is available)
68
+ - `modelId`: defaults to `eleven_v3` when unset
69
+ - `apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available)
70
+ - `outputFormat`: defaults to `pcm_44100` on macOS/iOS and `pcm_24000` on Android (set `mp3_*` to force MP3 streaming)
71
+
72
+ ## macOS UI
73
+
74
+ - Menu bar toggle: **Talk**
75
+ - Config tab: **Talk Mode** group (voice id + interrupt toggle)
76
+ - Overlay:
77
+ - **Listening**: cloud pulses with mic level
78
+ - **Thinking**: sinking animation
79
+ - **Speaking**: radiating rings
80
+ - Click cloud: stop speaking
81
+ - Click X: exit Talk mode
82
+
83
+ ## Notes
84
+
85
+ - Requires Speech + Microphone permissions.
86
+ - Uses `chat.send` against session key `main`.
87
+ - TTS uses ElevenLabs streaming API with `ELEVENLABS_API_KEY` and incremental playback on macOS/iOS/Android for lower latency.
88
+ - `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`.
89
+ - `latency_tier` is validated to `0..4` when set.
90
+ - Android supports `pcm_16000`, `pcm_22050`, `pcm_24000`, and `pcm_44100` output formats for low-latency AudioTrack streaming.
@@ -0,0 +1,65 @@
1
+ ---
2
+ summary: "Global voice wake words (Gateway-owned) and how they sync across nodes"
3
+ read_when:
4
+ - Changing voice wake words behavior or defaults
5
+ - Adding new node platforms that need wake word sync
6
+ title: "Voice Wake"
7
+ ---
8
+
9
+ # Voice Wake (Global Wake Words)
10
+
11
+ Milaidy treats **wake words as a single global list** owned by the **Gateway**.
12
+
13
+ - There are **no per-node custom wake words**.
14
+ - **Any node/app UI may edit** the list; changes are persisted by the Gateway and broadcast to everyone.
15
+ - Each device still keeps its own **Voice Wake enabled/disabled** toggle (local UX + permissions differ).
16
+
17
+ ## Storage (Gateway host)
18
+
19
+ Wake words are stored on the gateway machine at:
20
+
21
+ - `~/.milaidy/settings/voicewake.json`
22
+
23
+ Shape:
24
+
25
+ ```json
26
+ { "triggers": ["milaidy", "claude", "computer"], "updatedAtMs": 1730000000000 }
27
+ ```
28
+
29
+ ## Protocol
30
+
31
+ ### Methods
32
+
33
+ - `voicewake.get` → `{ triggers: string[] }`
34
+ - `voicewake.set` with params `{ triggers: string[] }` → `{ triggers: string[] }`
35
+
36
+ Notes:
37
+
38
+ - Triggers are normalized (trimmed, empties dropped). Empty lists fall back to defaults.
39
+ - Limits are enforced for safety (count/length caps).
40
+
41
+ ### Events
42
+
43
+ - `voicewake.changed` payload `{ triggers: string[] }`
44
+
45
+ Who receives it:
46
+
47
+ - All WebSocket clients (macOS app, WebChat, etc.)
48
+ - All connected nodes (iOS/Android); each node also receives it on connect as an initial “current state” push.
49
+
50
+ ## Client behavior
51
+
52
+ ### macOS app
53
+
54
+ - Uses the global list to gate `VoiceWakeRuntime` triggers.
55
+ - Editing “Trigger words” in Voice Wake settings calls `voicewake.set` and then relies on the broadcast to keep other clients in sync.
56
+
57
+ ### iOS node
58
+
59
+ - Uses the global list for `VoiceWakeManager` trigger detection.
60
+ - Editing Wake Words in Settings calls `voicewake.set` (over the Gateway WS) and also keeps local wake-word detection responsive.
61
+
62
+ ### Android node
63
+
64
+ - Exposes a Wake Words editor in Settings.
65
+ - Calls `voicewake.set` over the Gateway WS so edits sync everywhere.
@@ -0,0 +1,53 @@
1
+ ---
2
+ title: Deploy on Northflank
3
+ ---
4
+
5
+ Deploy Milaidy on Northflank with a one-click template and finish setup in your browser.
6
+ This is the easiest “no terminal on the server” path: Northflank runs the Gateway for you,
7
+ and you configure everything via the `/setup` web wizard.
8
+
9
+ ## How to get started
10
+
11
+ 1. Click [Deploy Milaidy](https://northflank.com/stacks/deploy-milaidy) to open the template.
12
+ 2. Create an [account on Northflank](https://app.northflank.com/signup) if you don’t already have one.
13
+ 3. Click **Deploy Milaidy now**.
14
+ 4. Set the required environment variable: `SETUP_PASSWORD`.
15
+ 5. Click **Deploy stack** to build and run the Milaidy template.
16
+ 6. Wait for the deployment to complete, then click **View resources**.
17
+ 7. Open the Milaidy service.
18
+ 8. Open the public Milaidy URL and complete setup at `/setup`.
19
+ 9. Open the Control UI at `/milaidy`.
20
+
21
+ ## What you get
22
+
23
+ - Hosted Milaidy Gateway + Control UI
24
+ - Web setup wizard at `/setup` (no terminal commands)
25
+ - Persistent storage via Northflank Volume (`/data`) so config/credentials/workspace survive redeploys
26
+
27
+ ## Setup flow
28
+
29
+ 1. Visit `https://<your-northflank-domain>/setup` and enter your `SETUP_PASSWORD`.
30
+ 2. Choose a model/auth provider and paste your key.
31
+ 3. (Optional) Add Telegram/Discord/Slack tokens.
32
+ 4. Click **Run setup**.
33
+ 5. Open the Control UI at `https://<your-northflank-domain>/milaidy`.
34
+
35
+ If Telegram DMs are set to pairing, the setup wizard can approve the pairing code.
36
+
37
+ ## Getting chat tokens
38
+
39
+ ### Telegram bot token
40
+
41
+ 1. Message `@BotFather` in Telegram
42
+ 2. Run `/newbot`
43
+ 3. Copy the token (looks like `123456789:AA...`)
44
+ 4. Paste it into `/setup`
45
+
46
+ ### Discord bot token
47
+
48
+ 1. Go to https://discord.com/developers/applications
49
+ 2. **New Application** → choose a name
50
+ 3. **Bot** → **Add Bot**
51
+ 4. **Enable MESSAGE CONTENT INTENT** under Bot → Privileged Gateway Intents (required or the bot will crash on startup)
52
+ 5. Copy the **Bot Token** and paste into `/setup`
53
+ 6. Invite the bot to your server (OAuth2 URL Generator; scopes: `bot`, `applications.commands`)
@@ -0,0 +1,80 @@
1
+ ---
2
+ summary: "Perplexity Sonar setup for web_search"
3
+ read_when:
4
+ - You want to use Perplexity Sonar for web search
5
+ - You need PERPLEXITY_API_KEY or OpenRouter setup
6
+ title: "Perplexity Sonar"
7
+ ---
8
+
9
+ # Perplexity Sonar
10
+
11
+ Milaidy can use Perplexity Sonar for the `web_search` tool. You can connect
12
+ through Perplexity’s direct API or via OpenRouter.
13
+
14
+ ## API options
15
+
16
+ ### Perplexity (direct)
17
+
18
+ - Base URL: https://api.perplexity.ai
19
+ - Environment variable: `PERPLEXITY_API_KEY`
20
+
21
+ ### OpenRouter (alternative)
22
+
23
+ - Base URL: https://openrouter.ai/api/v1
24
+ - Environment variable: `OPENROUTER_API_KEY`
25
+ - Supports prepaid/crypto credits.
26
+
27
+ ## Config example
28
+
29
+ ```json5
30
+ {
31
+ tools: {
32
+ web: {
33
+ search: {
34
+ provider: "perplexity",
35
+ perplexity: {
36
+ apiKey: "pplx-...",
37
+ baseUrl: "https://api.perplexity.ai",
38
+ model: "perplexity/sonar-pro",
39
+ },
40
+ },
41
+ },
42
+ },
43
+ }
44
+ ```
45
+
46
+ ## Switching from Brave
47
+
48
+ ```json5
49
+ {
50
+ tools: {
51
+ web: {
52
+ search: {
53
+ provider: "perplexity",
54
+ perplexity: {
55
+ apiKey: "pplx-...",
56
+ baseUrl: "https://api.perplexity.ai",
57
+ },
58
+ },
59
+ },
60
+ },
61
+ }
62
+ ```
63
+
64
+ If both `PERPLEXITY_API_KEY` and `OPENROUTER_API_KEY` are set, set
65
+ `tools.web.search.perplexity.baseUrl` (or `tools.web.search.perplexity.apiKey`)
66
+ to disambiguate.
67
+
68
+ If no base URL is set, Milaidy chooses a default based on the API key source:
69
+
70
+ - `PERPLEXITY_API_KEY` or `pplx-...` → direct Perplexity (`https://api.perplexity.ai`)
71
+ - `OPENROUTER_API_KEY` or `sk-or-...` → OpenRouter (`https://openrouter.ai/api/v1`)
72
+ - Unknown key formats → OpenRouter (safe fallback)
73
+
74
+ ## Models
75
+
76
+ - `perplexity/sonar` — fast Q&A with web search
77
+ - `perplexity/sonar-pro` (default) — multi-step reasoning + web search
78
+ - `perplexity/sonar-reasoning-pro` — deep research
79
+
80
+ See [Web tools](/tools/web) for the full web_search configuration.