shmakk 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/.env.example +11 -0
  2. package/README.md +75 -1
  3. package/docs/index.html +154 -16
  4. package/docs/mcp.md +78 -0
  5. package/docs/ssh.md +82 -0
  6. package/docs/vibedit-analysis.md +375 -0
  7. package/docs/vim.md +110 -0
  8. package/docs/voice.md +4 -0
  9. package/package.json +9 -5
  10. package/scripts/test-vibedit.js +45 -0
  11. package/scripts/vibedit-demo.sh +52 -0
  12. package/skills/shmakk-skill-creator.md +269 -0
  13. package/src/_check.js +7 -0
  14. package/src/_check_schema.js +5 -0
  15. package/src/_cleanup.js +18 -0
  16. package/src/_fix.js +9 -0
  17. package/src/_test_import.js +15 -0
  18. package/src/agent.js +11 -4
  19. package/src/browser-daemon.js +209 -0
  20. package/src/browser.js +10 -0
  21. package/src/cli/browserDaemon.js +60 -0
  22. package/src/cli/connectBrowser.js +137 -0
  23. package/src/cli.js +235 -8
  24. package/src/completions.js +8 -0
  25. package/src/control.js +273 -1
  26. package/src/core/browserConnector.js +523 -0
  27. package/src/correction.js +6 -0
  28. package/src/electron.js +305 -0
  29. package/src/endpoints.js +74 -9
  30. package/src/index.js +24 -1
  31. package/src/llm.js +501 -61
  32. package/src/mobile.js +307 -0
  33. package/src/notify.js +51 -3
  34. package/src/orchestrator.js +35 -1
  35. package/src/pty.js +11 -6
  36. package/src/review.js +45 -11
  37. package/src/self-commands.js +153 -0
  38. package/src/session-convert.js +508 -0
  39. package/src/session-search.js +31 -0
  40. package/src/session.js +392 -46
  41. package/src/skills/browserActions.ts +984 -0
  42. package/src/skills.js +451 -24
  43. package/src/system-prompt.js +31 -25
  44. package/src/tools.js +81 -0
  45. package/src/vibedit/control.js +534 -0
  46. package/src/vibedit/electron.js +108 -0
  47. package/src/vibedit/files.js +171 -0
  48. package/src/vibedit/index.js +298 -0
  49. package/src/vibedit/overlay.js +1482 -0
  50. package/src/vibedit/prompts.js +245 -0
  51. package/src/vibedit/state.js +32 -0
  52. package/src/vim.js +410 -0
@@ -0,0 +1,269 @@
1
+ ---
2
+ name: shmakk-skill-creator
3
+ description: "Create or convert skills for shmakk. Two modes: (1) CREATE — guided authoring of a new shmakk skill from a plain description; (2) CONVERT — take a Claude Code .skill zip and produce a shmakk-native skill directory. Use CREATE when the user describes a new workflow or capability they want to package. Use CONVERT when the user drops a .skill file or references a .skill path. Both modes output an installable shmakk skill directory ready for ~/.config/shmakk/skills/ and shmakk-desktop."
4
+ category: workflow
5
+ ---
6
+
7
+ # Shmakk Skill Creator
8
+
9
+ Two entry points. Read the user's message and pick one:
10
+
11
+ | Signal | Mode |
12
+ |---|---|
13
+ | User describes a new capability/workflow in plain language | → **CREATE** |
14
+ | User provides a `.skill` file path or drops a `.skill` zip | → **CONVERT** |
15
+
16
+ ---
17
+
18
+ ## MODE A — CREATE
19
+
20
+ Build a new shmakk skill from scratch. The output is a single `SKILL.md` (or a directory if the
21
+ skill has sub-agents) that works with the shmakk runtime and renders correctly in shmakk-desktop's
22
+ Skills Browser and Workflows view.
23
+
24
+ ### Step 1 — Capture intent
25
+
26
+ Ask (or infer from context) only what changes the output meaningfully:
27
+
28
+ 1. **Name** — short kebab-case identifier (`campaign-planner`, `pr-reviewer`)
29
+ 2. **What it does** — a one-sentence trigger description
30
+ 3. **Single-step or multi-agent?** — does this need parallel/pipeline sub-agents, or is it one focused prompt?
31
+ 4. **Category** — `dev`, `workflow`, `backend`, `frontend`, `media`, `docs`, `system`, `business`, `productivity`, `security`, `planning`, `research`, `general`
32
+ 5. **Argument hint** — what the user passes when invoking (`<business description>`, `<PR number>`, etc.)
33
+
34
+ Do not ask for things you can infer. If the description clearly implies multi-agent (research → synthesis,
35
+ audit → fix → verify, plan → execute → check), call it multi-agent without asking.
36
+
37
+ ### Step 2 — Skill shape decision
38
+
39
+ **Single-step skill** — one `SKILL.md` with a strong system-prompt body. Use when the task is
40
+ self-contained and doesn't need separate agents for sub-phases.
41
+
42
+ **Multi-agent skill** — a directory with `SKILL.md` as the orchestrator plus agent role files. Use
43
+ when phases need isolation (independent research branches, synthesis agents that must not see each other's
44
+ drafts mid-flight, a verification step that shouldn't share context with the execution step).
45
+
46
+ ```
47
+ <name>/
48
+ SKILL.md ← orchestrator prompt + workflow metadata
49
+ agents/
50
+ <phase>/
51
+ <NN>-<role>.md ← one role file per sub-agent; NN = execution order
52
+ references/ ← shared docs injected into sub-agents as context
53
+ assets/ ← templates, schemas, examples
54
+ ```
55
+
56
+ ### Step 3 — Write the SKILL.md
57
+
58
+ Every SKILL.md starts with this frontmatter:
59
+
60
+ ```yaml
61
+ ---
62
+ name: <kebab-case-name>
63
+ description: '<trigger description — when to use this skill, what it does, key phrases that should load it>'
64
+ category: <category>
65
+ argument-hint: '<what the user passes>'
66
+ # For multi-agent skills only:
67
+ skill-type: orchestration
68
+ workflow:
69
+ topology: <parallel|pipeline|staged>
70
+ phases:
71
+ - name: <phase-name>
72
+ topology: <parallel|pipeline>
73
+ agents: [<role-file-paths>]
74
+ ---
75
+ ```
76
+
77
+ The body of SKILL.md is the **orchestration prompt** — instructions for the agent that runs this
78
+ skill. Write it to the same standard as the rest of the shmakk skill library:
79
+
80
+ - Lead with a one-paragraph summary of what the skill produces.
81
+ - Describe each phase: what agents run, in what order, what each receives and returns.
82
+ - State quality gates explicitly (what a failing output looks like and what to do — bounce back, not
83
+ silently accept).
84
+ - Name the final deliverable precisely: file path, shape, content contract.
85
+ - End with anti-patterns: the most common ways this skill produces bad output.
86
+
87
+ For sub-agent role files (`agents/<phase>/<NN>-<role>.md`), write each as a focused brief:
88
+ - Role in one line
89
+ - Inputs (what the orchestrator passes)
90
+ - What to produce (file path, format)
91
+ - Effort floor or quality bar
92
+ - Anti-fluff rule specific to this role
93
+
94
+ ### Step 4 — Output the skill
95
+
96
+ For a **single-step skill**: write `SKILL.md` directly to the target path.
97
+
98
+ For a **multi-agent skill**: write the full directory tree. Then print:
99
+
100
+ ```
101
+ ✓ Skill created: <name>/
102
+ Install: cp -r <name>/ ~/.config/shmakk/skills/<category>/<name>/
103
+ Or via: shmakk install <name>.skill (after packaging with: zip -r <name>.skill <name>/)
104
+ ```
105
+
106
+ ---
107
+
108
+ ## MODE B — CONVERT
109
+
110
+ Take a Claude Code `.skill` zip and produce a shmakk-native skill that:
111
+ - Works with `shmakk run skill <name>`
112
+ - Renders in shmakk-desktop's Skills Browser (card with name, description, category, status)
113
+ - Shows phases and steps in shmakk-desktop's Workflows view
114
+
115
+ ### Step 1 — Ingest the zip
116
+
117
+ ```bash
118
+ unzip <path>.skill -d /tmp/skill-convert/
119
+ ```
120
+
121
+ Read the extracted tree. Expected shape:
122
+
123
+ ```
124
+ <name>/
125
+ SKILL.md ← Claude Code orchestration prompt (frontmatter + prose)
126
+ agents/
127
+ research/ ← parallel research sub-agents (optional)
128
+ synthesis/ ← synthesis sub-agents; last one = assembler (optional)
129
+ <other-phase>/ ← any other phase name
130
+ references/ ← shared context docs
131
+ assets/ ← templates, schemas
132
+ ```
133
+
134
+ If the shape differs (flat directory, non-standard phase names, etc.) — adapt rather than fail.
135
+ The structure is a convention, not a contract.
136
+
137
+ ### Step 2 — Analyse phases
138
+
139
+ Read `SKILL.md` body to understand the workflow. Then read each agent file header (first 20 lines)
140
+ to understand its role. Build a phase map:
141
+
142
+ | Phase dir | Topology | Notes |
143
+ |---|---|---|
144
+ | `agents/research/` | **parallel** | All research agents run concurrently |
145
+ | `agents/synthesis/` | **staged** | All agents except the last run in parallel; the last one (the assembler) runs after they finish |
146
+ | Single `agents/` flat | infer from filenames | NN- prefix → pipeline order; no prefix → parallel |
147
+ | Custom phase dirs | read SKILL.md | The orchestrator prose describes the order |
148
+
149
+ ### Step 3 — Map Claude Code → shmakk conventions
150
+
151
+ Apply these substitutions throughout the orchestration prompt and agent role files:
152
+
153
+ | Claude Code | shmakk equivalent | Notes |
154
+ |---|---|---|
155
+ | `Task(prompt, ...)` / "spawn a sub-agent" | `subagent(role, task, context)` | shmakk's team.js dispatch |
156
+ | `WebSearch(query)` | `WebSearch` | Same name, keep as-is |
157
+ | `WebFetch(url)` / `web_fetch` | `WebFetch` | Same name, keep as-is |
158
+ | `Write(path, content)` | `Write` | Same |
159
+ | `Read(path)` | `Read` | Same |
160
+ | `Bash(cmd)` | `Bash` | Same |
161
+ | "read `references/X.md` first" | inject as `context` field in subagent call | shmakk passes context docs to sub-agents explicitly |
162
+ | "run in parallel in the same turn" | `topology: parallel` in workflow metadata | shmakk team.js runs parallel steps via Promise.all |
163
+ | "run sequentially, each sees prior output" | `topology: pipeline` | shmakk passes prior step output to next |
164
+ | `fallback.py` / stdlib fallback | note in SKILL.md as optional; shmakk uses LLM fallback | Strip if it references Claude-specific APIs |
165
+
166
+ Do **not** strip role files, references, or assets — they carry domain knowledge. Only touch the
167
+ tool-call syntax and the spawn patterns.
168
+
169
+ ### Step 4 — Rewrite the orchestration SKILL.md
170
+
171
+ Keep the original prose and domain logic. Change only:
172
+
173
+ 1. Replace the frontmatter entirely with shmakk frontmatter (see Step 3 of CREATE mode).
174
+ - Detect `name` from the folder name or original frontmatter.
175
+ - Detect `category` from the content (marketing → `business`; code → `dev`; etc.).
176
+ - Copy `description` from the original frontmatter; trim it if it exceeds 400 characters.
177
+ - Add `skill-type: orchestration` and a `workflow:` block derived from the phase map.
178
+
179
+ 2. In the body, replace every Claude Code spawn pattern with shmakk's:
180
+
181
+ **Before (Claude Code):**
182
+ ```
183
+ Spawn three sub-agents in the same turn. Pass each: the path to its role file, references/research-standards.md, the brief.
184
+ ```
185
+
186
+ **After (shmakk):**
187
+ ```
188
+ Run three sub-agents in parallel (topology: parallel). For each, pass: its role file content, references/research-standards.md as context, the brief. shmakk will run these concurrently via the team runner.
189
+ ```
190
+
191
+ 3. Replace any reference to Claude Code tools by name (`claude`, `claude-code`, `/skill`, slash
192
+ commands) with shmakk equivalents (`shmakk run skill`, `shmakk`). Keep all domain logic intact.
193
+
194
+ ### Step 5 — Generate workflow.json
195
+
196
+ This file drives shmakk-desktop's Workflows view. One JSON object per workflow (most skills have one).
197
+
198
+ ```json
199
+ {
200
+ "id": "<name>",
201
+ "description": "<one-line description>",
202
+ "topology": "staged",
203
+ "stages": [
204
+ {
205
+ "name": "<phase-name>",
206
+ "topology": "parallel",
207
+ "steps": [
208
+ { "role": "<role>", "task": "<one-line task description>", "agentFile": "agents/<phase>/<file>.md" }
209
+ ]
210
+ }
211
+ ]
212
+ }
213
+ ```
214
+
215
+ For a simple pipeline (no parallel phases), use flat `steps` array instead of `stages`, matching
216
+ the format in `src/workflows.js`.
217
+
218
+ ### Step 6 — Write the output directory
219
+
220
+ ```
221
+ <name>/ ← drop-in shmakk skill directory
222
+ SKILL.md ← rewritten orchestration prompt
223
+ workflow.json ← desktop Workflows view descriptor
224
+ agents/ ← agent role files (kept verbatim, paths unchanged)
225
+ references/ ← reference docs (kept verbatim)
226
+ assets/ ← asset templates (kept verbatim)
227
+ ```
228
+
229
+ Then print:
230
+
231
+ ```
232
+ ✓ Converted: <original-name>.skill → <name>/
233
+
234
+ What changed:
235
+ - Frontmatter: replaced with shmakk format
236
+ - Spawn patterns: Claude Code Task() → shmakk subagent dispatch
237
+ - workflow.json: generated for shmakk-desktop Workflows view
238
+ - Tool names: <list any that were remapped>
239
+ - Kept intact: agent role files, references, assets
240
+
241
+ Install:
242
+ cp -r <name>/ ~/.config/shmakk/skills/<category>/<name>/
243
+
244
+ Or package and install:
245
+ zip -r <name>.skill <name>/
246
+ shmakk install <name>.skill
247
+ ```
248
+
249
+ If anything couldn't be cleanly mapped (custom tool calls, Claude-specific APIs, platform-specific
250
+ slash commands), list them explicitly under **Manual review needed** so the user knows what to check.
251
+
252
+ ---
253
+
254
+ ## Output quality bar (both modes)
255
+
256
+ A skill is ready to ship when:
257
+
258
+ - `SKILL.md` frontmatter is valid YAML with `name`, `description`, `category`.
259
+ - The description is specific enough that the shmakk dispatcher will match it correctly — not "does stuff", but the actual trigger phrases.
260
+ - Every sub-agent file has a clear role, explicit inputs, and a defined output (file path + format).
261
+ - Quality gates are named: what failing output looks like and what the orchestrator does about it.
262
+ - `workflow.json` (if present) is valid JSON and its `agentFile` paths resolve in the directory.
263
+ - The skill installs cleanly: `cp -r` to `~/.config/shmakk/skills/<category>/<name>/` and `shmakk run skill <name>` finds it.
264
+
265
+ A skill is **not** ready when:
266
+ - The description would match the wrong user intent (too broad) or never match (too narrow/technical).
267
+ - Sub-agent role files reference paths that don't exist in the directory.
268
+ - The orchestration prompt tells agents to "just do their best" with no quality gate — this produces variable-quality output that can't be improved systematically.
269
+ - `workflow.json` has hardcoded absolute paths or references files outside the skill directory.
package/src/_check.js ADDED
@@ -0,0 +1,7 @@
1
// Debug helper: prints one session row and its turn count from the local
// shmakk sessions database.
//
// Usage: node _check.js [session-id]
const os = require('os');
const path = require('path');
const Database = require('better-sqlite3');

// Resolve the database under the current user's home directory rather than a
// hard-coded /home/<user> path, so the script works on any machine.
const DB_PATH = path.join(os.homedir(), '.config', 'shmakk', 'sessions.db');
// Session to inspect; defaults to the id this script was originally written for.
const SESSION_ID = process.argv[2] || 'import-2026-06-28';

// fileMustExist: an inspection script must not create an empty database as a
// side effect when the path is wrong.
const db = new Database(DB_PATH, { fileMustExist: true });
try {
  const sessions = db.prepare('SELECT * FROM sessions WHERE id = ?').all(SESSION_ID);
  console.log('Sessions:', JSON.stringify(sessions));
  const turns = db.prepare('SELECT COUNT(*) as c FROM turns WHERE session_id = ?').get(SESSION_ID);
  console.log('Turns:', turns.c);
} finally {
  // Release the handle even when a query throws (e.g. a missing table).
  db.close();
}
@@ -0,0 +1,5 @@
1
// Debug helper: prints the CREATE TABLE statement of every table in the local
// shmakk sessions database.
const os = require('os');
const path = require('path');
const Database = require('better-sqlite3');

// Resolve the database under the current user's home directory rather than a
// hard-coded /home/<user> path, so the script works on any machine.
const DB_PATH = path.join(os.homedir(), '.config', 'shmakk', 'sessions.db');

// fileMustExist: a schema dump must not create an empty database as a side
// effect when the path is wrong.
const db = new Database(DB_PATH, { fileMustExist: true });
try {
  const tables = db.prepare("SELECT sql FROM sqlite_master WHERE type='table'").all();
  for (const table of tables) {
    console.log(table.sql);
  }
} finally {
  // Release the handle even when the query throws.
  db.close();
}
@@ -0,0 +1,18 @@
1
// Debug helper: removes a fixed list of throw-away sessions (and their
// dependent rows) from the local shmakk sessions database.
const os = require('os');
const path = require('path');
const Database = require('better-sqlite3');

// Resolve the database under the current user's home directory rather than a
// hard-coded /home/<user> path, so the script works on any machine.
const DB_PATH = path.join(os.homedir(), '.config', 'shmakk', 'sessions.db');

const toDelete = ['debug-cli', 'debug-cli2', 'debug-cli3', 'debug-cli4', 'debug-import', 'import-2026-06-28'];

const db = new Database(DB_PATH, { fileMustExist: true });
try {
  // Prepare each statement once instead of once per session id.
  const deleteTurns = db.prepare('DELETE FROM turns WHERE session_id = ?');
  const deleteSession = db.prepare('DELETE FROM sessions WHERE id = ?');
  const deleteFilesTouched = db.prepare('DELETE FROM files_touched WHERE session_id = ?');
  const deleteProjectSessions = db.prepare('DELETE FROM project_sessions WHERE session_id = ?');

  // One transaction for the whole cleanup: either every listed session is
  // fully removed or nothing changes.
  const purge = db.transaction((ids) =>
    ids.map((sid) => {
      // Turns go first, as in the original script (its comment notes that FTS
      // is expected to update via triggers — not verifiable from here).
      const turns = deleteTurns.run(sid);
      const sessions = deleteSession.run(sid);
      deleteFilesTouched.run(sid);
      deleteProjectSessions.run(sid);
      return { sid, turns: turns.changes, sessions: sessions.changes };
    })
  );

  // Report only after the transaction has committed so the log never
  // describes deletions that were rolled back.
  for (const { sid, turns, sessions } of purge(toDelete)) {
    console.log(`Deleted ${sid}: ${turns} turns, ${sessions} sessions`);
  }
} finally {
  db.close();
}
console.log('Done.');
package/src/_fix.js ADDED
@@ -0,0 +1,9 @@
1
// Debug helper: deletes one stale row from the `sessions` table of the local
// shmakk sessions database.
//
// Usage: node _fix.js [session-id]
const os = require('os');
const path = require('path');
const Database = require('better-sqlite3');

// Resolve the database under the current user's home directory rather than a
// hard-coded /home/<user> path, so the script works on any machine.
const DB_PATH = path.join(os.homedir(), '.config', 'shmakk', 'sessions.db');
// Session to delete; defaults to the id this script was originally written for.
const SESSION_ID = process.argv[2] || 'import-2026-06-28';

const db = new Database(DB_PATH, { fileMustExist: true });
try {
  // Delete stale session record
  const info = db.prepare('DELETE FROM sessions WHERE id = ?').run(SESSION_ID);
  console.log('Deleted sessions row:', info.changes);
  // TODO: the original script also intended to check for a matching
  // sessions.jsonl record, but never implemented that step.
} finally {
  db.close();
}
@@ -0,0 +1,15 @@
1
// Manual smoke test: runs `claude2shmakk` from ./session-convert against one
// session directory and reports success or failure.
//
// Usage: node _test_import.js [source-session-dir] [target-session-id]
const os = require('os');
const path = require('path');
const { claude2shmakk } = require('./session-convert');

// Defaults reproduce the session this script was originally written for, but
// are resolved under the current user's home directory instead of /home/<user>.
const DEFAULT_SOURCE = path.join(
  os.homedir(),
  '.config',
  'Claude',
  'local-agent-mode-sessions',
  '2cebf85a-f27d-41a6-888e-6cff059551a8',
  'f7d639de-d492-4e65-a14b-4d6cd8dfab7a',
  'local_2a11b98e-e761-4401-8b43-0c0ab1fb04f1'
);
const DEFAULT_SESSION_ID = 'import-2026-06-28';

async function main() {
  const [source = DEFAULT_SOURCE, sessionId = DEFAULT_SESSION_ID] = process.argv.slice(2);
  try {
    await claude2shmakk(source, sessionId);
    console.log("SUCCESS");
  } catch (e) {
    console.error("FAILED:", e.message);
    console.error(e.stack);
    // Signal the failure to the shell; previously the script exited with 0.
    process.exitCode = 1;
  }
}
main();
package/src/agent.js CHANGED
@@ -7,7 +7,7 @@
7
7
 
8
8
  const fs = require('fs');
9
9
  const path = require('path');
10
- const { makeClient, modelFor, isConfigured, getDeepSeekOptions, supportsVision } = require('./llm');
10
+ const { makeClient, modelFor, isConfigured, getDeepSeekOptions, supportsVision, describeImages } = require('./llm');
11
11
  const {
12
12
  sanitizeAssistantContent,
13
13
  isLeakedToolMarkup,
@@ -561,9 +561,16 @@ async function runAgent({ input, roots, glossary, confirmTool, write, signal, hi
561
561
  );
562
562
  let toolContent = (toolText + (Object.keys(toolMeta).length ? ' ' + JSON.stringify(toolMeta) : '')).trim();
563
563
  if (toolImages.length > 0 && !supportsVision()) {
564
- // Endpoint doesn't support vision — include image metadata as text
565
- const imgDesc = toolImages.map((img, i) => `[Image #${i + 1}: ${img.mimeType}, base64=${img.dataLength} chars${img.truncated ? ', truncated' : ''}]`).join(', ');
566
- toolContent = toolContent ? `${toolContent} ${imgDesc}` : imgDesc;
564
+ // Endpoint doesn't support vision — call a vision-capable endpoint
565
+ // to describe the images as text for the non-vision model.
566
+ const visionDesc = await describeImages(toolImages, signal);
567
+ if (visionDesc) {
568
+ toolContent = toolContent ? `${toolContent}\n${visionDesc}` : visionDesc;
569
+ } else {
570
+ // Fallback: include image metadata as text
571
+ const imgDesc = toolImages.map((img, i) => `[Image #${i + 1}: ${img.mimeType}, base64=${img.dataLength} chars${img.truncated ? ', truncated' : ''}]`).join(', ');
572
+ toolContent = toolContent ? `${toolContent} ${imgDesc}` : imgDesc;
573
+ }
567
574
  }
568
575
  messages.push({ role: 'tool', tool_call_id: c.id, content: toolContent.slice(0, 8000) });
569
576
  if (toolImages.length > 0 && supportsVision()) {
@@ -0,0 +1,209 @@
1
+ const fs = require('fs');
2
+ const http = require('http');
3
+ const os = require('os');
4
+ const path = require('path');
5
+ const { WebSocketServer } = require('ws');
6
+ const llm = require('./llm');
7
+ const { getModelRegistry, getVisionSupport } = require('./endpoints');
8
+ const { automationSystem, automationUser } = require('./vibedit/prompts');
9
+
10
+ const DEFAULT_PORT = 3947;
11
+ const STATE_PATH = path.join(os.homedir(), '.config', 'shmakk', 'browser-daemon.json');
12
+
13
+ function stripFences(s) {
14
+ const match = String(s || '').match(/\{[\s\S]*\}/);
15
+ if (match) return match[0].trim();
16
+ return String(s || '').replace(/^\s*```(?:json)?\s*/i, '').replace(/\s*```\s*$/, '').trim();
17
+ }
18
+
19
+ function saveState(state) {
20
+ try {
21
+ fs.mkdirSync(path.dirname(STATE_PATH), { recursive: true });
22
+ fs.writeFileSync(STATE_PATH, JSON.stringify({ ...state, updatedAt: Date.now() }, null, 2));
23
+ } catch {}
24
+ }
25
+
26
+ function findVisionClient() {
27
+ const registry = getModelRegistry();
28
+ for (const [name, cfg] of Object.entries(registry.models)) {
29
+ if (cfg.vision) return llm.makeClientForEndpoint(name);
30
+ }
31
+ // Fall back to top-level visionSupport key
32
+ const vs = getVisionSupport();
33
+ if (vs) return llm.makeClientForEndpoint('visionSupport');
34
+ return null;
35
+ }
36
+
37
+ async function getClient(visionClient) {
38
+ if (visionClient) return visionClient.client;
39
+ const fast = llm.makeClientForEndpoint('fast');
40
+ if (fast) return fast.client;
41
+ if (!llm.isConfigured()) return null;
42
+ return llm.makeClient();
43
+ }
44
+
45
+ async function chatCompletion(client, messages, model, opts = {}) {
46
+ const response = await client.chat.completions.create({
47
+ model,
48
+ messages,
49
+ temperature: opts.temperature ?? 0.2,
50
+ max_tokens: opts.maxTokens || 2048,
51
+ });
52
+ return ((response?.choices?.[0]?.message?.content) || '').trim();
53
+ }
54
+
55
+ function send(ws, msg) {
56
+ try { ws.send(JSON.stringify(msg)); } catch {}
57
+ }
58
+
59
+ function daemonAutomationSystem() {
60
+ return automationSystem() + `
61
+
62
+ Additional browser-extension actions are allowed:
63
+ - "newTab": { "action": "newTab", "url": "https://...", "active": true, "description": "..." }
64
+ - "reload": { "action": "reload", "description": "..." }
65
+ - "closeTab": { "action": "closeTab", "description": "..." }
66
+ - "switchTab": { "action": "switchTab", "tabId": 123, "description": "..." }
67
+ - "createGroup": { "action": "createGroup", "title": "Group name", "color": "blue", "description": "..." }
68
+ - "moveToGroup": { "action": "moveToGroup", "groupId": 123, "description": "..." }
69
+ - "ungroup": { "action": "ungroup", "description": "..." }
70
+ Use these only when the user asks for tab or tab-group management.`;
71
+ }
72
+
73
+ async function handleAutomation(ws, msg, runtime) {
74
+ const directActions = Array.isArray(msg.directActions) ? msg.directActions : [];
75
+ if (directActions.length) {
76
+ send(ws, {
77
+ type: 'executeActions',
78
+ actions: directActions,
79
+ summary: `Replaying ${directActions.length} recorded action${directActions.length === 1 ? '' : 's'}`,
80
+ notes: '',
81
+ });
82
+ return;
83
+ }
84
+
85
+ const client = await getClient(runtime.visionClient);
86
+ if (!client) {
87
+ send(ws, { type: 'error', text: 'LLM not configured' });
88
+ return;
89
+ }
90
+
91
+ const fast = llm.makeClientForEndpoint('fast');
92
+ const model = runtime.visionModel || (fast ? fast.model : null) || llm.modelFor();
93
+ send(ws, { type: 'status', text: `Building browser automation with ${model}...` });
94
+
95
+ const shots = msg.screenshots && msg.screenshots.length ? msg.screenshots : [];
96
+ const userContent = automationUser(msg);
97
+ const vision = runtime.visionEnabled && shots.length;
98
+ let raw;
99
+
100
+ if (vision) {
101
+ const imageParts = shots.map((s) => ({ type: 'image_url', image_url: { url: `data:image/jpeg;base64,${s}`, detail: 'high' } }));
102
+ try {
103
+ raw = await chatCompletion(client, [
104
+ { role: 'system', content: daemonAutomationSystem() },
105
+ { role: 'user', content: [{ type: 'text', text: userContent }, ...imageParts] },
106
+ ], model);
107
+ } catch {
108
+ raw = await chatCompletion(client, [
109
+ { role: 'system', content: daemonAutomationSystem() },
110
+ { role: 'user', content: userContent },
111
+ ], model);
112
+ }
113
+ } else {
114
+ raw = await chatCompletion(client, [
115
+ { role: 'system', content: daemonAutomationSystem() },
116
+ { role: 'user', content: userContent },
117
+ ], model);
118
+ }
119
+
120
+ let parsed;
121
+ try {
122
+ parsed = JSON.parse(stripFences(raw));
123
+ } catch {
124
+ send(ws, {
125
+ type: 'automationResult',
126
+ ok: false,
127
+ summary: 'Failed to parse automation response.',
128
+ modelOutput: raw.slice(0, 1500),
129
+ });
130
+ return;
131
+ }
132
+
133
+ const actions = Array.isArray(parsed.actions) ? parsed.actions : [];
134
+ if (actions.length) {
135
+ send(ws, {
136
+ type: 'executeActions',
137
+ actions,
138
+ summary: parsed.summary || '',
139
+ notes: parsed.notes || '',
140
+ });
141
+ return;
142
+ }
143
+
144
+ send(ws, {
145
+ type: 'automationResult',
146
+ ok: true,
147
+ summary: parsed.summary || 'No executable actions were produced.',
148
+ notes: parsed.notes || '',
149
+ hasActions: false,
150
+ });
151
+ }
152
+
153
+ async function startBrowserDaemon(opts = {}) {
154
+ const port = Number(opts.port) || DEFAULT_PORT;
155
+ const visionClient = findVisionClient();
156
+ const fast = llm.makeClientForEndpoint('fast');
157
+ const runtime = {
158
+ visionClient,
159
+ visionModel: visionClient ? visionClient.model : null,
160
+ visionEnabled: !!visionClient,
161
+ };
162
+ const model = runtime.visionModel || (fast ? fast.model : null) || (llm.modelFor?.() || 'unknown');
163
+
164
+ const httpServer = http.createServer((req, res) => {
165
+ if (req.url === '/status') {
166
+ res.writeHead(200, { 'Content-Type': 'application/json' });
167
+ res.end(JSON.stringify({ ok: true, port, model, vision: runtime.visionEnabled }));
168
+ return;
169
+ }
170
+ res.writeHead(404);
171
+ res.end();
172
+ });
173
+
174
+ const wss = new WebSocketServer({ server: httpServer });
175
+
176
+ wss.on('connection', (ws) => {
177
+ saveState({ running: true, port, model, vision: runtime.visionEnabled, connectedAt: Date.now() });
178
+ send(ws, { type: 'hello', model, vision: runtime.visionEnabled, daemon: true });
179
+ ws.on('message', async (data) => {
180
+ let msg;
181
+ try { msg = JSON.parse(data.toString()); } catch { return; }
182
+ try {
183
+ if (msg.type === 'automation') await handleAutomation(ws, msg, runtime);
184
+ else if (msg.type === 'status') send(ws, { type: 'status', text: 'Browser daemon connected.' });
185
+ else if (msg.type === 'tabStatus') saveState({ running: true, port, model, vision: runtime.visionEnabled, activeTab: msg.tab || null });
186
+ } catch (err) {
187
+ send(ws, { type: 'error', text: err.message });
188
+ }
189
+ });
190
+ });
191
+
192
+ await new Promise((resolve) => httpServer.listen(port, '127.0.0.1', resolve));
193
+ saveState({ running: true, port, model, vision: runtime.visionEnabled, pid: process.pid, startedAt: Date.now() });
194
+ return {
195
+ port,
196
+ statePath: STATE_PATH,
197
+ close: () => {
198
+ saveState({ running: false, port, stoppedAt: Date.now() });
199
+ wss.close();
200
+ httpServer.close();
201
+ },
202
+ };
203
+ }
204
+
205
+ module.exports = {
206
+ DEFAULT_PORT,
207
+ STATE_PATH,
208
+ startBrowserDaemon,
209
+ };
package/src/browser.js CHANGED
@@ -184,13 +184,23 @@ async function screenshot(args) {
184
184
  const name = `screenshot-${Date.now()}.png`;
185
185
  const filePath = path.join(SCREENSHOT_DIR, name);
186
186
  await p.screenshot({ path: filePath, fullPage: false });
187
+
188
+ const buf = fs.readFileSync(filePath);
189
+ const b64 = buf.toString('base64');
187
190
  const stats = fs.statSync(filePath);
191
+
188
192
  return {
189
193
  ok: true,
190
194
  path: filePath,
191
195
  size: stats.size,
192
196
  url: p.url(),
193
197
  title: await p.title(),
198
+ images: [{
199
+ mimeType: 'image/png',
200
+ data: b64,
201
+ dataLength: b64.length,
202
+ truncated: false,
203
+ }],
194
204
  };
195
205
  } catch (e) {
196
206
  return { error: `screenshot failed: ${e.message}` };
@@ -0,0 +1,60 @@
1
+ const { startBrowserDaemon, DEFAULT_PORT, STATE_PATH } = require('../browser-daemon');
2
+
3
+ function parseArgs(argv) {
4
+ const args = { port: DEFAULT_PORT, help: false };
5
+ for (let i = 0; i < argv.length; i++) {
6
+ const a = argv[i];
7
+ if (a === 'browser-daemon') continue;
8
+ if (a === '--help' || a === '-h') args.help = true;
9
+ else if (a === '--port' || a === '-p') {
10
+ args.port = parseInt(argv[++i], 10);
11
+ if (isNaN(args.port) || args.port < 1 || args.port > 65535) {
12
+ process.stderr.write(`[shmakk] browser-daemon: invalid port: ${argv[i]}\n`);
13
+ process.exit(2);
14
+ }
15
+ } else {
16
+ process.stderr.write(`[shmakk] browser-daemon: unknown option: ${a}\n`);
17
+ args.help = true;
18
+ }
19
+ }
20
+ return args;
21
+ }
22
+
23
+ const HELP = `shmakk browser-daemon — extension automation backend
24
+
25
+ Usage:
26
+ shmakk browser-daemon [--port 3947]
27
+
28
+ Runs a single global WebSocket backend for the Chrome extension. State is
29
+ written to ${STATE_PATH}.
30
+ `;
31
+
32
+ async function main(argv = process.argv.slice(2)) {
33
+ const args = parseArgs(argv);
34
+ if (args.help) {
35
+ process.stdout.write(HELP);
36
+ return 0;
37
+ }
38
+
39
+ const daemon = await startBrowserDaemon({ port: args.port });
40
+ process.stdout.write(`[shmakk browser-daemon] listening on ws://127.0.0.1:${daemon.port}\n`);
41
+ process.stdout.write(`[shmakk browser-daemon] state: ${daemon.statePath}\n`);
42
+
43
+ const shutdown = () => {
44
+ process.stdout.write('\n[shmakk browser-daemon] shutting down\n');
45
+ daemon.close();
46
+ process.exit(0);
47
+ };
48
+ process.on('SIGINT', shutdown);
49
+ process.on('SIGTERM', shutdown);
50
+ return new Promise(() => {});
51
+ }
52
+
53
+ if (require.main === module) {
54
+ main().catch((err) => {
55
+ process.stderr.write(`[shmakk browser-daemon] fatal: ${err && err.stack || err}\n`);
56
+ process.exit(1);
57
+ });
58
+ }
59
+
60
+ module.exports = { main, HELP };