otoro-cli 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/otoro.js +29 -0
- package/lib/agent.js +65 -2
- package/lib/computer.js +74 -0
- package/lib/screen.js +281 -0
- package/lib/tools.js +69 -1
- package/package.json +1 -1
package/bin/otoro.js
CHANGED
|
@@ -68,6 +68,35 @@ program
|
|
|
68
68
|
await generateImage(prompt.join(' '))
|
|
69
69
|
})
|
|
70
70
|
|
|
71
|
+
// `otoro computer <task...>` (alias: `use`) — hands a free-form task to the
// computer-use loop, which looks at the screen and drives mouse/keyboard.
program
  .command('computer <task...>')
  .alias('use')
  .description('Computer Use — Otoro sees your screen and controls mouse/keyboard')
  .action(async (words) => {
    requireAuth()
    // Required inside the handler, so the module is only loaded when this command runs.
    const computer = require('../lib/computer')
    // The variadic <task...> argument arrives as an array of words.
    const taskText = words.join(' ')
    await computer.runComputerTask(taskText)
  })
|
|
80
|
+
|
|
81
|
+
// `otoro screen` — capture the screen once and print the vision model's description.
program
  .command('screen')
  .description('Take a screenshot and describe what Otoro sees')
  .action(async () => {
    requireAuth()
    const screenLib = require('../lib/screen')
    const chalk = require('chalk')
    const ora = require('ora')

    const spinner = ora({ text: chalk.gray('Looking at screen...'), color: 'cyan' }).start()
    const outcome = await screenLib.analyzeScreen()

    // Guard clause: report the failure on the spinner and stop.
    if (!outcome.success) {
      spinner.fail(chalk.red(outcome.error))
      return
    }

    spinner.succeed(chalk.green('Screen analyzed'))
    console.log(chalk.cyan('\n What Otoro sees:\n'))
    // Prefix every line of the description with one space of indentation.
    const indented = outcome.description
      .split('\n')
      .map((line) => ' ' + line)
      .join('\n')
    console.log(indented + '\n')
  })
|
|
99
|
+
|
|
71
100
|
program
|
|
72
101
|
.command('start')
|
|
73
102
|
.description('Start Otoro agent daemon — connects to server for remote tasks')
|
package/lib/agent.js
CHANGED
|
@@ -3,7 +3,9 @@ const os = require('os')
|
|
|
3
3
|
const path = require('path')
|
|
4
4
|
const chalk = require('chalk')
|
|
5
5
|
const { getConfig } = require('./config')
|
|
6
|
-
const { readFile, writeFile, editFile, listFiles, runCommand, searchCode } = require('./tools')
|
|
6
|
+
const { readFile, writeFile, editFile, listFiles, runCommand, searchCode, openApp, openUrl, getSystemInfo, takeScreenshot } = require('./tools')
|
|
7
|
+
const { mouseClick, mouseMove, typeText, pressKey, analyzeScreen } = require('./screen')
|
|
8
|
+
const { runComputerTask } = require('./computer')
|
|
7
9
|
const { chatCompletion } = require('./api')
|
|
8
10
|
|
|
9
11
|
class OtoroAgent {
|
|
@@ -117,6 +119,40 @@ class OtoroAgent {
|
|
|
117
119
|
console.log(chalk.yellow(` ⚡ Running: ${payload.cmd}`))
|
|
118
120
|
result = runCommand(payload.cmd, payload.timeout || 30000)
|
|
119
121
|
break
|
|
122
|
+
case 'open_app':
|
|
123
|
+
console.log(chalk.yellow(` 🚀 Opening: ${payload.app}`))
|
|
124
|
+
result = openApp(payload.app)
|
|
125
|
+
break
|
|
126
|
+
case 'open_url':
|
|
127
|
+
console.log(chalk.yellow(` 🌐 Opening: ${payload.url}`))
|
|
128
|
+
result = openUrl(payload.url)
|
|
129
|
+
break
|
|
130
|
+
case 'screenshot':
|
|
131
|
+
console.log(chalk.yellow(` 📸 Taking screenshot...`))
|
|
132
|
+
result = takeScreenshot()
|
|
133
|
+
break
|
|
134
|
+
case 'system_info':
|
|
135
|
+
result = getSystemInfo()
|
|
136
|
+
break
|
|
137
|
+
case 'mouse_click':
|
|
138
|
+
result = mouseClick(payload.x, payload.y, payload.button)
|
|
139
|
+
break
|
|
140
|
+
case 'mouse_move':
|
|
141
|
+
result = mouseMove(payload.x, payload.y)
|
|
142
|
+
break
|
|
143
|
+
case 'type_text':
|
|
144
|
+
result = typeText(payload.text)
|
|
145
|
+
break
|
|
146
|
+
case 'press_key':
|
|
147
|
+
result = pressKey(payload.key)
|
|
148
|
+
break
|
|
149
|
+
case 'analyze_screen':
|
|
150
|
+
result = await analyzeScreen(payload.question)
|
|
151
|
+
break
|
|
152
|
+
case 'computer_use':
|
|
153
|
+
console.log(chalk.cyan(` 🖥️ Computer Use: ${payload.task}`))
|
|
154
|
+
result = await runComputerTask(payload.task, payload.max_steps || 10)
|
|
155
|
+
break
|
|
120
156
|
case 'list_files':
|
|
121
157
|
result = listFiles(payload.dir || '.', payload.pattern || '')
|
|
122
158
|
break
|
|
@@ -158,9 +194,15 @@ Available tools (use XML tags in your response):
|
|
|
158
194
|
- <tool:read path="file"/> — read a file
|
|
159
195
|
- <tool:write path="file">content</tool:write> — write/create a file
|
|
160
196
|
- <tool:edit path="file" old="old text" new="new text"/> — edit a file
|
|
161
|
-
- <tool:run cmd="command"/> — run a shell command
|
|
197
|
+
- <tool:run cmd="command"/> — run a shell command (works on Mac/Windows/Linux)
|
|
162
198
|
- <tool:search pattern="regex"/> — search code
|
|
199
|
+
- <tool:open app="AppName"/> — open an application (Blender, Discord, VS Code, etc.)
|
|
200
|
+
- <tool:url href="https://..."/> — open a URL in the browser
|
|
201
|
+
- <tool:screenshot/> — take a screenshot of the screen
|
|
202
|
+
|
|
203
|
+
You are running LOCALLY on the user's machine. You have full access to their filesystem and can run any command. You can open apps, edit code, run builds, start servers, and interact with their development environment.
|
|
163
204
|
|
|
205
|
+
Platform: ${process.platform} (${process.arch})
|
|
164
206
|
Project context:
|
|
165
207
|
${projectContext}
|
|
166
208
|
|
|
@@ -238,6 +280,27 @@ Be direct. Execute tasks immediately. Don't ask for permission — just do it.`
|
|
|
238
280
|
results.push({ type: 'search', pattern: match[1], ...r })
|
|
239
281
|
}
|
|
240
282
|
|
|
283
|
+
// Process <tool:open>
|
|
284
|
+
for (const match of response.matchAll(/<tool:open\s+app="([^"]+)"\s*\/>/g)) {
|
|
285
|
+
console.log(chalk.yellow(` 🚀 Opening: ${match[1]}`))
|
|
286
|
+
const r = openApp(match[1])
|
|
287
|
+
results.push({ type: 'open_app', app: match[1], ...r })
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
// Process <tool:url>
|
|
291
|
+
for (const match of response.matchAll(/<tool:url\s+href="([^"]+)"\s*\/>/g)) {
|
|
292
|
+
console.log(chalk.yellow(` 🌐 Opening: ${match[1]}`))
|
|
293
|
+
const r = openUrl(match[1])
|
|
294
|
+
results.push({ type: 'open_url', url: match[1], ...r })
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
// Process <tool:screenshot>
|
|
298
|
+
for (const match of response.matchAll(/<tool:screenshot\s*\/>/g)) {
|
|
299
|
+
console.log(chalk.yellow(` 📸 Screenshot...`))
|
|
300
|
+
const r = takeScreenshot()
|
|
301
|
+
results.push({ type: 'screenshot', ...r })
|
|
302
|
+
}
|
|
303
|
+
|
|
241
304
|
return results
|
|
242
305
|
}
|
|
243
306
|
}
|
package/lib/computer.js
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
const chalk = require('chalk')
|
|
2
|
+
const ora = require('ora')
|
|
3
|
+
const { analyzeScreen, computerUseStep, executeComputerActions, takeScreenshot } = require('./screen')
|
|
4
|
+
|
|
5
|
+
/**
 * Run the interactive computer-use loop: describe the screen, ask the model for
 * the next actions, execute them, re-capture the screen, and repeat.
 *
 * The loop ends when the model emits a "done" action, when the model returns
 * nothing, when the screen can no longer be captured, or after `maxSteps` rounds.
 *
 * @param {string} task - Natural-language description of what to accomplish.
 * @param {number} [maxSteps=15] - Upper bound on decide/act rounds.
 * @returns {Promise<{success: boolean, steps: number, error?: string}>}
 *   Outcome summary. Earlier versions resolved to undefined; callers that
 *   ignore the value are unaffected.
 */
async function runComputerTask(task, maxSteps = 15) {
  console.log(chalk.cyan.bold('\n 🐙 Otoro Computer Use\n'))
  console.log(chalk.gray(` Task: ${task}`))
  console.log(chalk.gray(` Platform: ${process.platform}\n`))

  // Step 1: See the screen
  let spinner = ora({ text: chalk.gray('Looking at screen...'), color: 'cyan' }).start()
  let screen = await analyzeScreen()
  if (!screen.success) {
    spinner.fail(chalk.red(`Can't see screen: ${screen.error}`))
    console.log(chalk.yellow('\n Tips:'))
    if (process.platform === 'linux') console.log(chalk.gray(' • Install scrot or gnome-screenshot: sudo apt install scrot'))
    if (process.platform === 'darwin') console.log(chalk.gray(' • Grant Screen Recording permission: System Settings → Privacy → Screen Recording'))
    return { success: false, steps: 0, error: screen.error }
  }
  spinner.succeed(chalk.green('Screen captured'))
  console.log(chalk.gray(` I see: ${screen.description.slice(0, 150)}...\n`))

  // Step 2: Loop — AI sees screen, decides action, executes, repeat
  for (let step = 1; step <= maxSteps; step++) {
    console.log(chalk.cyan(` Step ${step}/${maxSteps}`))

    spinner = ora({ text: chalk.gray('Deciding action...'), color: 'cyan' }).start()
    let aiResponse
    try {
      aiResponse = await computerUseStep(task, screen.description)
    } finally {
      // Never leave the spinner running, even if the step throws.
      spinner.stop()
    }

    if (!aiResponse) {
      // Return here: falling out of the loop would also print the
      // "Reached max steps" message, which is not what happened.
      console.log(chalk.red(' No response from AI'))
      return { success: false, steps: step - 1, error: 'No response from AI' }
    }

    // Show what Otoro is thinking (strip action tags for display)
    const thinking = aiResponse.replace(/<action:[^>]*\/>/g, '').trim()
    if (thinking) console.log(chalk.gray(` ${thinking.split('\n')[0].slice(0, 100)}`))

    // Execute actions
    const { results, isDone } = await executeComputerActions(aiResponse)

    for (const r of results) {
      if (r.success) {
        if (r.x !== undefined) console.log(chalk.green(` ✓ Click (${r.x}, ${r.y})`))
        else if (r.typed) console.log(chalk.green(` ✓ Typed ${r.typed}`))
        else if (r.key) console.log(chalk.green(` ✓ Pressed ${r.key}`))
        else if (r.waited) console.log(chalk.green(` ✓ Waited ${r.waited}`))
      } else {
        console.log(chalk.red(` ✗ ${r.error}`))
      }
    }

    if (isDone) {
      console.log(chalk.green.bold('\n ✓ Task complete!\n'))
      return { success: true, steps: step }
    }

    // Wait a moment then re-capture screen
    await new Promise(r => setTimeout(r, 1000))
    spinner = ora({ text: chalk.gray('Looking at updated screen...'), color: 'cyan' }).start()
    const refreshed = await analyzeScreen()
    spinner.stop()

    if (!refreshed.success) {
      // Without a fresh description the next step would be decided blind
      // (the prompt would literally contain "undefined"), so stop instead.
      console.log(chalk.red(` Can't see screen: ${refreshed.error}`))
      return { success: false, steps: step, error: refreshed.error }
    }
    screen = refreshed
    console.log(chalk.gray(` Screen: ${screen.description.slice(0, 100)}...`))
    console.log()
  }

  console.log(chalk.yellow(`\n Reached max steps (${maxSteps}). Task may be incomplete.\n`))
  return { success: false, steps: maxSteps, error: `Reached max steps (${maxSteps})` }
}
|
|
73
|
+
|
|
74
|
+
module.exports = { runComputerTask }
|
package/lib/screen.js
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
const { execSync, exec } = require('child_process')
|
|
2
|
+
const fs = require('fs')
|
|
3
|
+
const path = require('path')
|
|
4
|
+
const os = require('os')
|
|
5
|
+
const chalk = require('chalk')
|
|
6
|
+
const http = require('http')
|
|
7
|
+
const { getConfig } = require('./config')
|
|
8
|
+
|
|
9
|
+
const SCREENSHOT_DIR = path.join(os.tmpdir(), 'otoro-screenshots')
|
|
10
|
+
fs.mkdirSync(SCREENSHOT_DIR, { recursive: true })
|
|
11
|
+
|
|
12
|
+
// ─── Screenshots ──────────────────────────────────────────────────────────────
|
|
13
|
+
|
|
14
|
+
/**
 * Capture the screen into a timestamped PNG inside SCREENSHOT_DIR.
 *
 * Uses `screencapture` on macOS, a PowerShell/System.Drawing script on Windows,
 * and on every other platform tries gnome-screenshot, then scrot, then
 * ImageMagick's `import`, in that order.
 *
 * @returns {{success: true, path: string, size: number} | {success: false, error: string}}
 */
function takeScreenshot() {
  const target = path.join(SCREENSHOT_DIR, `screen-${Date.now()}.png`)

  // Non-macOS/Windows capture: the first two tools may fail quietly; a failure
  // of the last one propagates to the caller's catch below.
  const captureWithFallbacks = () => {
    const quietAttempts = [
      `gnome-screenshot -f "${target}" 2>/dev/null`,
      `scrot "${target}" 2>/dev/null`,
    ]
    for (const attempt of quietAttempts) {
      try {
        execSync(attempt, { timeout: 5000 })
        return
      } catch {
        // This tool is missing or failed; move on to the next candidate.
      }
    }
    execSync(`import -window root "${target}" 2>/dev/null`, { timeout: 5000 })
  }

  try {
    switch (process.platform) {
      case 'darwin':
        execSync(`screencapture -x "${target}"`, { timeout: 5000 })
        break
      case 'win32':
        // PowerShell screenshot
        execSync(`powershell -command "Add-Type -AssemblyName System.Windows.Forms; $screen = [System.Windows.Forms.Screen]::PrimaryScreen.Bounds; $bitmap = New-Object System.Drawing.Bitmap($screen.Width, $screen.Height); $graphics = [System.Drawing.Graphics]::FromImage($bitmap); $graphics.CopyFromScreen(0, 0, 0, 0, $screen.Size); $bitmap.Save('${target.replace(/\\/g, '\\\\')}'); $graphics.Dispose(); $bitmap.Dispose()"`, { timeout: 10000 })
        break
      default:
        captureWithFallbacks()
    }

    // A tool can exit cleanly without writing anything, so confirm the file exists.
    if (!fs.existsSync(target)) {
      return { success: false, error: 'Screenshot not created' }
    }
    return { success: true, path: target, size: fs.statSync(target).size }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
|
|
37
|
+
|
|
38
|
+
// ─── Mouse Control ────────────────────────────────────────────────────────────
|
|
39
|
+
|
|
40
|
+
/**
 * Click at absolute screen coordinates.
 *
 * Coordinates are validated and rounded to integers BEFORE being placed into
 * any command line: they can originate from a remote payload or model output,
 * and the commands below are built by string interpolation.
 *
 * NOTE(review): the macOS branch issues the same `click at` command for every
 * button, so a 'right' click is performed as a plain click there — confirm
 * whether a different mechanism is wanted.
 *
 * @param {number|string} x - Horizontal coordinate (number or numeric string).
 * @param {number|string} y - Vertical coordinate (number or numeric string).
 * @param {string} [button='left'] - 'right' for a right-click; anything else is a left-click.
 * @returns {{success: true, x: number, y: number, button: string} | {success: false, error: string}}
 */
function mouseClick(x, y, button = 'left') {
  const platform = process.platform

  // Accept finite numbers or numeric strings only; everything else is rejected.
  const toCoord = (value, label) => {
    const n = typeof value === 'string' && value.trim() !== '' ? Number(value) : value
    if (typeof n !== 'number' || !Number.isFinite(n)) {
      throw new Error(`Invalid ${label} coordinate: ${String(value)}`)
    }
    return Math.round(n)
  }

  try {
    const px = toCoord(x, 'x')
    const py = toCoord(y, 'y')
    const isRight = button === 'right'

    if (platform === 'darwin') {
      execSync(`osascript -e 'tell application "System Events" to click at {${px}, ${py}}'`, { timeout: 3000 })
    } else if (platform === 'win32') {
      // mouse_event flags: 0x0002/0x0004 = left down/up, 0x0008/0x0010 = right down/up.
      execSync(`powershell -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.Cursor]::Position = New-Object System.Drawing.Point(${px}, ${py}); Add-Type -MemberDefinition '[DllImport(\\\"user32.dll\\\")] public static extern void mouse_event(int f,int x,int y,int d,int i);' -Name U -Namespace W; [W.U]::mouse_event(${isRight ? '0x0008' : '0x0002'},0,0,0,0); [W.U]::mouse_event(${isRight ? '0x0010' : '0x0004'},0,0,0,0)"`, { timeout: 5000 })
    } else {
      execSync(`xdotool mousemove ${px} ${py} click ${isRight ? '3' : '1'}`, { timeout: 3000 })
    }
    return { success: true, x: px, y: py, button }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
|
|
57
|
+
|
|
58
|
+
/**
 * Move the mouse pointer to absolute screen coordinates.
 *
 * Coordinates are validated and rounded to integers before being interpolated
 * into a command line, because they can come from a remote payload or model
 * output.
 *
 * NOTE(review): the macOS branch relies on System Events accepting
 * `set position of cursor` — confirm this works on supported macOS versions.
 *
 * @param {number|string} x - Horizontal coordinate (number or numeric string).
 * @param {number|string} y - Vertical coordinate (number or numeric string).
 * @returns {{success: true, x: number, y: number} | {success: false, error: string}}
 */
function mouseMove(x, y) {
  const platform = process.platform

  // Accept finite numbers or numeric strings only; everything else is rejected.
  const toCoord = (value, label) => {
    const n = typeof value === 'string' && value.trim() !== '' ? Number(value) : value
    if (typeof n !== 'number' || !Number.isFinite(n)) {
      throw new Error(`Invalid ${label} coordinate: ${String(value)}`)
    }
    return Math.round(n)
  }

  try {
    const px = toCoord(x, 'x')
    const py = toCoord(y, 'y')

    if (platform === 'darwin') {
      execSync(`osascript -e 'tell application "System Events" to set position of cursor to {${px}, ${py}}'`, { timeout: 3000 })
    } else if (platform === 'win32') {
      // The Forms assembly must be loaded first, otherwise the
      // [System.Windows.Forms.Cursor] type cannot be resolved in a fresh session.
      execSync(`powershell -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.Cursor]::Position = New-Object System.Drawing.Point(${px}, ${py})"`, { timeout: 3000 })
    } else {
      execSync(`xdotool mousemove ${px} ${py}`, { timeout: 3000 })
    }
    return { success: true, x: px, y: py }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
|
|
73
|
+
|
|
74
|
+
// ─── Keyboard Control ─────────────────────────────────────────────────────────
|
|
75
|
+
|
|
76
|
+
/**
 * Type a string of text into whatever currently has keyboard focus.
 *
 * The text may come from model output or a remote payload, so it is never
 * passed through a shell: every helper program is started with an argument
 * array (or, on Windows, receives the text through an environment variable).
 *
 * @param {string} text - Literal text to type.
 * @returns {{success: true, typed: string} | {success: false, error: string}}
 *   On success `typed` is a human-readable "<n> chars" summary.
 */
function typeText(text) {
  const platform = process.platform
  try {
    if (typeof text !== 'string') {
      throw new TypeError('Text to type must be a string')
    }
    // Nothing to send; skip spawning a helper process.
    if (text.length === 0) return { success: true, typed: '0 chars' }

    // Function-scope require: execFileSync starts the program directly, with no shell.
    const { execFileSync } = require('child_process')

    if (platform === 'darwin') {
      // Escape special chars for the AppleScript string literal
      const escaped = text.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
      execFileSync('osascript', ['-e', `tell application "System Events" to keystroke "${escaped}"`], { timeout: 5000 })
    } else if (platform === 'win32') {
      // SendKeys treats + ^ % ~ ( ) { } [ ] as control syntax; wrapping each in
      // braces makes it type the literal character instead.
      const literal = text.replace(/[+^%~(){}[\]]/g, '{$&}')
      // Passing the text via the environment avoids all command-line quoting.
      execFileSync(
        'powershell',
        ['-NoProfile', '-Command', 'Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait($env:OTORO_SENDKEYS)'],
        { timeout: 5000, env: { ...process.env, OTORO_SENDKEYS: literal } }
      )
    } else {
      // `--` ends option parsing so text starting with "-" is still typed.
      execFileSync('xdotool', ['type', '--clearmodifiers', '--', text], { timeout: 5000 })
    }
    return { success: true, typed: text.length + ' chars' }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
|
|
94
|
+
|
|
95
|
+
/**
 * Press a single key or a modifier combination.
 *
 * Accepted forms: a named key ("enter", "tab", "escape", "backspace", "space",
 * arrows, "f1".."f12"), a single character, or modifiers joined with "+"
 * ("ctrl+c", "cmd+s", "ctrl+shift+p"). The last "+"-separated token is the key;
 * all earlier tokens are modifiers.
 *
 * The key may come from model output or a remote payload, so it is validated
 * against known names and never passed through a shell.
 *
 * @param {string} key - Key or combination to press.
 * @returns {{success: true, key: string} | {success: false, error: string}}
 */
function pressKey(key) {
  const platform = process.platform
  try {
    if (typeof key !== 'string' || key.trim() === '') {
      throw new Error('Key must be a non-empty string')
    }
    const trimmed = key.trim()
    // A bare "+" is the plus key itself, not a separator.
    const tokens = trimmed === '+' ? ['+'] : trimmed.split('+').map((t) => t.trim()).filter(Boolean)
    if (tokens.length === 0) throw new Error(`Invalid key: ${key}`)

    const mainKey = tokens[tokens.length - 1]
    const mainName = mainKey.toLowerCase()
    const modifiers = tokens.slice(0, -1).map((m) => m.toLowerCase())
    const isSingleChar = [...mainKey].length === 1
    const isFunctionKey = /^f([1-9]|1[0-2])$/.test(mainName)

    // Translate modifier names through a per-platform table; unknown names are
    // rejected rather than forwarded to the helper program.
    const translateModifiers = (table) => modifiers.map((m) => {
      if (!table.has(m)) throw new Error(`Unsupported modifier: ${m}`)
      return table.get(m)
    })

    // Function-scope require: execFileSync starts the program directly, with no shell.
    const { execFileSync } = require('child_process')

    if (platform === 'darwin') {
      // Named keys must be sent as virtual key codes: `keystroke "return"`
      // would type the word "return" instead of pressing the key.
      const keyCodes = new Map(Object.entries({
        enter: 36, return: 36, tab: 48, space: 49, backspace: 51, delete: 51,
        escape: 53, esc: 53, left: 123, right: 124, down: 125, up: 126,
      }))
      const modNames = new Map(Object.entries({
        ctrl: 'control', control: 'control', cmd: 'command', command: 'command',
        alt: 'option', option: 'option', shift: 'shift',
      }))
      const using = translateModifiers(modNames).map((m) => `${m} down`)

      let stroke
      if (keyCodes.has(mainName)) {
        stroke = `key code ${keyCodes.get(mainName)}`
      } else if (isSingleChar) {
        const escaped = mainName.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
        stroke = `keystroke "${escaped}"`
      } else {
        throw new Error(`Unsupported key: ${mainKey}`)
      }
      const suffix = using.length > 0 ? ` using {${using.join(', ')}}` : ''
      execFileSync('osascript', ['-e', `tell application "System Events" to ${stroke}${suffix}`], { timeout: 3000 })
    } else if (platform === 'win32') {
      const named = new Map(Object.entries({
        enter: '{ENTER}', return: '{ENTER}', tab: '{TAB}', escape: '{ESC}', esc: '{ESC}',
        backspace: '{BS}', delete: '{DEL}', space: ' ',
        up: '{UP}', down: '{DOWN}', left: '{LEFT}', right: '{RIGHT}',
      }))
      // SendKeys modifier prefixes; it has no prefix for the Windows/Command key.
      const modPrefixes = new Map(Object.entries({ ctrl: '^', control: '^', alt: '%', shift: '+' }))
      const prefix = translateModifiers(modPrefixes).join('')

      let main
      if (named.has(mainName)) {
        main = named.get(mainName)
      } else if (isFunctionKey) {
        main = `{${mainName.toUpperCase()}}`
      } else if (isSingleChar) {
        // Brace-wrap characters SendKeys would otherwise treat as syntax.
        main = /[+^%~(){}[\]]/.test(mainName) ? `{${mainName}}` : mainName
      } else {
        throw new Error(`Unsupported key: ${mainKey}`)
      }
      // The sequence travels via the environment to avoid command-line quoting;
      // the Forms assembly is loaded first so the SendKeys type resolves.
      execFileSync(
        'powershell',
        ['-NoProfile', '-Command', 'Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait($env:OTORO_SENDKEYS)'],
        { timeout: 3000, env: { ...process.env, OTORO_SENDKEYS: prefix + main } }
      )
    } else {
      // X11 keysym names are case-sensitive, so common names are mapped explicitly.
      const keysyms = new Map(Object.entries({
        enter: 'Return', return: 'Return', tab: 'Tab', escape: 'Escape', esc: 'Escape',
        backspace: 'BackSpace', delete: 'Delete', space: 'space',
        up: 'Up', down: 'Down', left: 'Left', right: 'Right',
      }))
      const modSyms = new Map(Object.entries({
        cmd: 'super', command: 'super', super: 'super', win: 'super',
        ctrl: 'ctrl', control: 'ctrl', alt: 'alt', option: 'alt', shift: 'shift', meta: 'meta',
      }))
      const mods = translateModifiers(modSyms)

      let main
      if (keysyms.has(mainName)) main = keysyms.get(mainName)
      else if (isFunctionKey) main = mainName.toUpperCase()
      else main = mainKey // keep caller's casing, e.g. an explicit keysym like "Home"
      // Plain keysym names only: keeps option-like or malformed tokens out of xdotool.
      if (!/^[A-Za-z0-9_]+$/.test(main)) throw new Error(`Unsupported key: ${mainKey}`)

      execFileSync('xdotool', ['key', [...mods, main].join('+')], { timeout: 3000 })
    }
    return { success: true, key }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
|
|
121
|
+
|
|
122
|
+
// ─── Vision — Send screenshot to Qwen-VL for understanding ───────────────────
|
|
123
|
+
|
|
124
|
+
/**
 * Take a screenshot and ask the vision model to describe it.
 *
 * The PNG is sent base64-encoded as a data URL to
 * `${config.gpu_url}/v1/chat/completions` using the configured API key.
 * This function never rejects: every failure is reported in the result object.
 *
 * @param {string} [question] - Prompt sent alongside the image.
 * @returns {Promise<{success: true, description: string, screenshot: string}
 *   | {success: false, error: string}>}
 */
async function analyzeScreen(question = 'What is on the screen? Describe the UI elements, buttons, and text visible.') {
  const screenshot = takeScreenshot()
  if (!screenshot.success) return { success: false, error: screenshot.error }

  let url
  let headers
  let body
  try {
    const config = getConfig()
    const b64 = fs.readFileSync(screenshot.path).toString('base64')

    // Send to Qwen-VL vision model
    body = JSON.stringify({
      model: 'qwen-vl',
      messages: [
        { role: 'user', content: [
          { type: 'text', text: question },
          { type: 'image_url', image_url: { url: `data:image/png;base64,${b64}` } }
        ]}
      ],
      max_tokens: 1024,
    })
    // Throws on a missing or malformed gpu_url; reported below instead of rejecting.
    url = new URL(`${config.gpu_url}/v1/chat/completions`)
    headers = { 'Content-Type': 'application/json', 'Authorization': `Bearer ${config.api_key}` }
  } catch (e) {
    return { success: false, error: e.message }
  }

  // http.request throws on an https: URL, so choose the module by protocol.
  const transport = url.protocol === 'https:' ? require('https') : http

  return new Promise((resolve) => {
    const req = transport.request(url, {
      method: 'POST',
      headers,
      timeout: 30000,
    }, (res) => {
      let data = ''
      // Decode as a UTF-8 stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8')
      res.on('data', (chunk) => { data += chunk })
      res.on('end', () => {
        if (res.statusCode < 200 || res.statusCode >= 300) {
          resolve({ success: false, error: `Vision model request failed (HTTP ${res.statusCode})` })
          return
        }
        try {
          const result = JSON.parse(data)
          const description = result.choices?.[0]?.message?.content || ''
          resolve({ success: true, description, screenshot: screenshot.path })
        } catch { resolve({ success: false, error: 'Bad response from vision model' }) }
      })
    })
    // The `timeout` option only emits an event; the request must be torn down
    // explicitly, which then surfaces through the 'error' handler below.
    req.on('timeout', () => req.destroy(new Error('Vision model request timed out')))
    req.on('error', (e) => resolve({ success: false, error: e.message }))
    req.write(body)
    req.end()
  })
}
|
|
166
|
+
|
|
167
|
+
// ─── Live Screen Monitor ──────────────────────────────────────────────────────
|
|
168
|
+
|
|
169
|
+
/**
 * Polls the screen on a fixed interval and reports changes.
 *
 * Each tick asks the vision model for a brief description; the callback given
 * to `start` is invoked only when that description differs from the previous
 * one. The next tick is scheduled after the current one finishes, so ticks
 * never overlap.
 */
class ScreenMonitor {
  /**
   * @param {number} [intervalMs=3000] - Delay between the end of one tick and the start of the next.
   */
  constructor(intervalMs = 3000) {
    this.interval = intervalMs
    this.running = false
    this.timer = null
    this.lastDescription = ''
    this.onUpdate = null
  }

  /**
   * Begin polling. Calling `start` while already running only replaces the
   * callback; it does not create a second polling loop.
   *
   * @param {(description: string, screenshotPath: string) => void} callback
   */
  start(callback) {
    this.onUpdate = callback
    if (this.running) return
    this.running = true
    console.log(chalk.cyan(` 👁 Screen monitor started (every ${this.interval / 1000}s)`))
    this.tick()
  }

  /** Stop polling and cancel any pending tick. */
  stop() {
    this.running = false
    if (this.timer) {
      clearTimeout(this.timer)
      this.timer = null
    }
    console.log(chalk.gray(' Screen monitor stopped'))
  }

  /**
   * Run one poll and schedule the next. Does nothing when the monitor is stopped.
   * Failures are logged rather than thrown so one bad tick does not end monitoring.
   */
  async tick() {
    if (!this.running) return
    try {
      const result = await analyzeScreen('Briefly describe what is currently visible on screen. Note any dialogs, windows, error messages, or UI changes.')
      // stop() may have been called while the analysis was in flight.
      if (!this.running) return
      if (result.success && result.description !== this.lastDescription) {
        this.lastDescription = result.description
        if (this.onUpdate) this.onUpdate(result.description, result.screenshot)
      }
    } catch (e) {
      console.log(chalk.red(` Screen monitor error: ${e.message}`))
    }
    if (this.running) this.timer = setTimeout(() => this.tick(), this.interval)
  }
}
|
|
201
|
+
|
|
202
|
+
// ─── Computer Use Agent — AI sees screen and controls computer ────────────────
|
|
203
|
+
|
|
204
|
+
/**
 * Ask the model which actions to take next for a computer-use task.
 *
 * Sends the task together with a textual description of the current screen and
 * the list of supported `<action:.../>` tags to
 * `${config.gpu_url}/v1/chat/completions`.
 *
 * This function never rejects: any failure (bad config, network error,
 * timeout, non-2xx status, unparsable body) resolves to an empty string.
 *
 * @param {string} task - What the user wants done.
 * @param {string} screenDescription - Description of what is currently on screen.
 * @returns {Promise<string>} Raw model reply, or '' on failure.
 */
async function computerUseStep(task, screenDescription) {
  let url
  let headers
  let body
  try {
    const config = getConfig()
    body = JSON.stringify({
      model: 'qwen-coder',
      messages: [
        { role: 'system', content: `You are Otoro controlling a ${process.platform} computer. You can see the screen and control mouse/keyboard.

Available actions (use XML tags):
- <action:click x="123" y="456"/> — click at coordinates
- <action:rightclick x="123" y="456"/> — right-click
- <action:type text="hello world"/> — type text
- <action:key press="enter"/> — press a key (enter, tab, escape, ctrl+c, cmd+s, etc.)
- <action:move x="123" y="456"/> — move mouse
- <action:screenshot/> — take a new screenshot
- <action:wait ms="1000"/> — wait before next action
- <action:done/> — task is complete

Current screen: ${screenDescription}

Execute the task step by step. After each action, I'll show you the updated screen.` },
        { role: 'user', content: task }
      ],
      max_tokens: 2048,
    })
    // Throws on a missing or malformed gpu_url; treated like any other failure.
    url = new URL(`${config.gpu_url}/v1/chat/completions`)
    headers = { 'Content-Type': 'application/json', 'Authorization': `Bearer ${config.api_key}` }
  } catch {
    return ''
  }

  // http.request throws on an https: URL, so choose the module by protocol.
  const transport = url.protocol === 'https:' ? require('https') : http

  return new Promise((resolve) => {
    const req = transport.request(url, {
      method: 'POST',
      headers,
      timeout: 60000,
    }, (res) => {
      let data = ''
      // Decode as a UTF-8 stream so multi-byte characters split across chunks survive.
      res.setEncoding('utf8')
      res.on('data', (chunk) => { data += chunk })
      res.on('end', () => {
        if (res.statusCode < 200 || res.statusCode >= 300) {
          resolve('')
          return
        }
        try {
          const result = JSON.parse(data)
          resolve(result.choices?.[0]?.message?.content || '')
        } catch { resolve('') }
      })
    })
    // The `timeout` option only emits an event; destroy the request so it
    // ends through the 'error' handler instead of hanging.
    req.on('timeout', () => req.destroy(new Error('Model request timed out')))
    req.on('error', () => resolve(''))
    req.write(body)
    req.end()
  })
}
|
|
250
|
+
|
|
251
|
+
/**
 * Execute the `<action:.../>` tags found in a model reply.
 *
 * Actions run in the order they appear in the text, so a reply such as
 * "type, press enter, then click" is performed in exactly that sequence.
 * Tags with missing or malformed attributes are skipped. `<action:screenshot/>`
 * and unknown tags are ignored here.
 *
 * @param {string} response - Raw model reply; non-strings are treated as empty.
 * @returns {Promise<{results: object[], isDone: boolean}>}
 *   `results` holds one entry per executed action, in execution order;
 *   `isDone` is true when the reply contains a done tag.
 */
async function executeComputerActions(response) {
  const results = []
  let isDone = false
  const text = typeof response === 'string' ? response : ''

  // One pass over every self-closing action tag, in document order.
  const tagPattern = /<action:([a-z]+)((?:\s+[a-z]+="[^"]*")*)\s*\/>/g
  const isDigits = (value) => typeof value === 'string' && /^\d+$/.test(value)

  for (const [, name, rawAttrs] of text.matchAll(tagPattern)) {
    // A Map keeps attribute names supplied by the model away from object prototypes.
    const attrs = new Map()
    for (const [, attrName, attrValue] of rawAttrs.matchAll(/([a-z]+)="([^"]*)"/g)) {
      attrs.set(attrName, attrValue)
    }
    const hasPoint = isDigits(attrs.get('x')) && isDigits(attrs.get('y'))

    switch (name) {
      case 'click':
        if (hasPoint) results.push(mouseClick(parseInt(attrs.get('x'), 10), parseInt(attrs.get('y'), 10)))
        break
      case 'rightclick':
        if (hasPoint) results.push(mouseClick(parseInt(attrs.get('x'), 10), parseInt(attrs.get('y'), 10), 'right'))
        break
      case 'move':
        if (hasPoint) results.push(mouseMove(parseInt(attrs.get('x'), 10), parseInt(attrs.get('y'), 10)))
        break
      case 'type':
        if (attrs.get('text')) results.push(typeText(attrs.get('text')))
        break
      case 'key':
        if (attrs.get('press')) results.push(pressKey(attrs.get('press')))
        break
      case 'wait':
        if (isDigits(attrs.get('ms'))) {
          const ms = attrs.get('ms')
          await new Promise((resolve) => setTimeout(resolve, parseInt(ms, 10)))
          results.push({ success: true, waited: ms + 'ms' })
        }
        break
      case 'done':
        isDone = true
        break
      default:
        break
    }
  }

  return { results, isDone }
}
|
|
277
|
+
|
|
278
|
+
module.exports = {
|
|
279
|
+
takeScreenshot, mouseClick, mouseMove, typeText, pressKey,
|
|
280
|
+
analyzeScreen, ScreenMonitor, computerUseStep, executeComputerActions
|
|
281
|
+
}
|
package/lib/tools.js
CHANGED
|
@@ -102,4 +102,72 @@ function processToolCalls(response) {
|
|
|
102
102
|
return actions
|
|
103
103
|
}
|
|
104
104
|
|
|
105
|
-
|
|
105
|
+
// Platform-aware app launching
|
|
106
|
+
/**
 * Launch a desktop application by name.
 *
 * The name can come from model output or a remote payload, so it is kept out
 * of shell parsing: macOS and Linux start the program with an argument array,
 * and Windows (where `start` is a cmd.exe built-in) rejects names containing
 * characters that could break out of the quoted argument.
 *
 * @param {string} appName - Application name, e.g. "Blender" or "Discord".
 * @returns {{success: true, app: string, platform: string}
 *   | {success: false, error: string, platform: string}}
 */
function openApp(appName) {
  const platform = process.platform
  try {
    if (typeof appName !== 'string' || appName.trim() === '') {
      throw new Error('App name must be a non-empty string')
    }
    // Function-scope require: these start programs directly, with no shell.
    const { execFileSync, spawn } = require('child_process')

    if (platform === 'darwin') {
      execFileSync('open', ['-a', appName], { timeout: 5000, stdio: 'ignore' })
    } else if (platform === 'win32') {
      // A double quote would end the quoted argument, %...% would be expanded
      // by cmd.exe, and a newline would start a new command.
      if (/["%\r\n]/.test(appName)) {
        throw new Error('App name contains characters that are unsafe to pass to cmd.exe')
      }
      execSync(`start "" "${appName}"`, { timeout: 5000, stdio: 'ignore' })
    } else {
      // Linux — run the lower-cased name as a detached program. The name is
      // split on whitespace into program + arguments, without shell interpretation.
      const [bin, ...args] = appName.toLowerCase().trim().split(/\s+/)
      const child = spawn(bin, args, { detached: true, stdio: 'ignore' })
      // Launch failures (e.g. program not found) arrive asynchronously, after
      // this function has returned; the handler keeps them from crashing the
      // process as an unhandled 'error' event.
      child.on('error', () => {})
      child.unref()
    }
    return { success: true, app: appName, platform }
  } catch (e) {
    return { success: false, error: e.message, platform }
  }
}
|
|
124
|
+
|
|
125
|
+
/**
 * Open a URL with the platform's default handler (normally the browser).
 *
 * The URL can come from model output or a remote payload, so it is kept out of
 * shell parsing: macOS and Linux pass it as a single argument to `open` /
 * `xdg-open`, and Windows (where `start` is a cmd.exe built-in) rejects values
 * that could break out of the quoted argument.
 *
 * NOTE(review): on Windows, cmd.exe still expands `%NAME%` sequences inside the
 * URL; percent signs cannot be rejected because they are normal in URLs.
 *
 * @param {string} url - URL to open.
 * @returns {{success: true, url: string} | {success: false, error: string}}
 */
function openUrl(url) {
  const platform = process.platform
  try {
    if (typeof url !== 'string' || url.trim() === '') {
      throw new Error('URL must be a non-empty string')
    }
    // A leading "-" would be parsed as a command-line option by the opener.
    if (url.trim().startsWith('-')) {
      throw new Error('URL must not start with "-"')
    }

    if (platform === 'win32') {
      // A double quote would end the quoted argument; a newline would start a new command.
      if (/["\r\n]/.test(url)) {
        throw new Error('URL contains characters that are unsafe to pass to cmd.exe')
      }
      execSync(`start "" "${url}"`, { timeout: 5000, stdio: 'ignore' })
    } else {
      // Function-scope require: execFileSync starts the opener directly, with no shell.
      const { execFileSync } = require('child_process')
      const opener = platform === 'darwin' ? 'open' : 'xdg-open'
      execFileSync(opener, [url], { timeout: 5000, stdio: 'ignore' })
    }
    return { success: true, url }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
|
|
138
|
+
|
|
139
|
+
/**
 * Collect a snapshot of basic facts about the host machine and this process.
 *
 * @returns {{platform: string, arch: string, hostname: string, user: string,
 *   home: string, cwd: string, node: string, cpus: number, memory: string,
 *   freeMemory: string}} Memory figures are whole gigabytes with a "GB" suffix.
 */
function getSystemInfo() {
  const os = require('os')

  // Bytes → rounded whole gigabytes, rendered as e.g. "16GB".
  const asGigabytes = (bytes) => {
    const gb = bytes / 1024 / 1024 / 1024
    return Math.round(gb) + 'GB'
  }

  const info = {}
  info.platform = process.platform
  info.arch = process.arch
  info.hostname = os.hostname()
  info.user = os.userInfo().username
  info.home = os.homedir()
  info.cwd = process.cwd()
  info.node = process.version
  info.cpus = os.cpus().length
  info.memory = asGigabytes(os.totalmem())
  info.freeMemory = asGigabytes(os.freemem())
  return info
}
|
|
154
|
+
|
|
155
|
+
/**
 * Capture the screen into a timestamped PNG in the OS temp directory.
 *
 * Uses `screencapture` on macOS and a PowerShell/System.Drawing script on
 * Windows. Elsewhere it runs one shell line that tries ImageMagick's `import`,
 * then gnome-screenshot, then scrot.
 *
 * @returns {{success: true, path: string, size: number} | {success: false, error: string}}
 */
function takeScreenshot() {
  const shotPath = path.join(require('os').tmpdir(), `otoro-screenshot-${Date.now()}.png`)

  // Build the capture command line for the current platform.
  const captureCommand = () => {
    switch (process.platform) {
      case 'darwin':
        return `screencapture -x "${shotPath}"`
      case 'win32':
        return `powershell -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.Screen]::PrimaryScreen | ForEach-Object { $bitmap = New-Object System.Drawing.Bitmap($_.Bounds.Width, $_.Bounds.Height); $graphics = [System.Drawing.Graphics]::FromImage($bitmap); $graphics.CopyFromScreen($_.Bounds.Location, [System.Drawing.Point]::Empty, $_.Bounds.Size); $bitmap.Save('${shotPath}'); }"`
      default:
        return `import -window root "${shotPath}" 2>/dev/null || gnome-screenshot -f "${shotPath}" 2>/dev/null || scrot "${shotPath}" 2>/dev/null`
    }
  }

  try {
    execSync(captureCommand(), { timeout: 10000 })
    // A tool can exit cleanly without writing anything, so confirm the file exists.
    if (!fs.existsSync(shotPath)) {
      return { success: false, error: 'Screenshot file not created' }
    }
    const { size } = fs.statSync(shotPath)
    return { success: true, path: shotPath, size }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
|
|
172
|
+
|
|
173
|
+
module.exports = { readFile, writeFile, editFile, listFiles, runCommand, searchCode, processToolCalls, openApp, openUrl, getSystemInfo, takeScreenshot }
|
package/package.json
CHANGED