otoro-cli 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/otoro.js CHANGED
@@ -68,6 +68,35 @@ program
68
68
  await generateImage(prompt.join(' '))
69
69
  })
70
70
 
71
// `computer <task...>` (alias `use`): joins the task words into one string and
// passes it to runComputerTask from ../lib/computer.
program
  .command('computer <task...>')
  .alias('use')
  .description('Computer Use — Otoro sees your screen and controls mouse/keyboard')
  .action(async (words) => {
    // Auth check runs before the computer module is loaded.
    requireAuth()
    const computer = require('../lib/computer')
    const taskText = words.join(' ')
    await computer.runComputerTask(taskText)
  })
80
+
81
// `screen`: capture the screen, ask the vision model to describe it, and print
// the description.
program
  .command('screen')
  .description('Take a screenshot and describe what Otoro sees')
  .action(async () => {
    requireAuth()
    const { analyzeScreen } = require('../lib/screen')
    const chalk = require('chalk')
    const ora = require('ora')
    const spinner = ora({ text: chalk.gray('Looking at screen...'), color: 'cyan' }).start()
    try {
      const result = await analyzeScreen()
      if (!result.success) {
        spinner.fail(chalk.red(result.error))
        return
      }
      spinner.succeed(chalk.green('Screen analyzed'))
      console.log(chalk.cyan('\n What Otoro sees:\n'))
      // Guard against a missing description so the split below cannot throw.
      const description = result.description || ''
      console.log(' ' + description.split('\n').join('\n ') + '\n')
    } catch (err) {
      // Without this, a rejected analyzeScreen() would leave the spinner
      // animating and surface as an unhandled rejection.
      spinner.fail(chalk.red(err && err.message ? err.message : String(err)))
      process.exitCode = 1
    }
  })
99
+
71
100
  program
72
101
  .command('start')
73
102
  .description('Start Otoro agent daemon — connects to server for remote tasks')
package/lib/agent.js CHANGED
@@ -4,6 +4,8 @@ const path = require('path')
4
4
  const chalk = require('chalk')
5
5
  const { getConfig } = require('./config')
6
6
  const { readFile, writeFile, editFile, listFiles, runCommand, searchCode, openApp, openUrl, getSystemInfo, takeScreenshot } = require('./tools')
7
+ const { mouseClick, mouseMove, typeText, pressKey, analyzeScreen } = require('./screen')
8
+ const { runComputerTask } = require('./computer')
7
9
  const { chatCompletion } = require('./api')
8
10
 
9
11
  class OtoroAgent {
@@ -132,6 +134,25 @@ class OtoroAgent {
132
134
  case 'system_info':
133
135
  result = getSystemInfo()
134
136
  break
137
+ case 'mouse_click':
138
+ result = mouseClick(payload.x, payload.y, payload.button)
139
+ break
140
+ case 'mouse_move':
141
+ result = mouseMove(payload.x, payload.y)
142
+ break
143
+ case 'type_text':
144
+ result = typeText(payload.text)
145
+ break
146
+ case 'press_key':
147
+ result = pressKey(payload.key)
148
+ break
149
+ case 'analyze_screen':
150
+ result = await analyzeScreen(payload.question)
151
+ break
152
+ case 'computer_use':
153
+ console.log(chalk.cyan(` 🖥️ Computer Use: ${payload.task}`))
154
+ result = await runComputerTask(payload.task, payload.max_steps || 10)
155
+ break
135
156
  case 'list_files':
136
157
  result = listFiles(payload.dir || '.', payload.pattern || '')
137
158
  break
package/lib/computer.js ADDED
@@ -0,0 +1,74 @@
1
+ const chalk = require('chalk')
2
+ const ora = require('ora')
3
+ const { analyzeScreen, computerUseStep, executeComputerActions, takeScreenshot } = require('./screen')
4
+
5
/**
 * Run the see → decide → act loop for a natural-language task.
 *
 * Each step asks computerUseStep for the next actions given the latest screen
 * description, executes them with executeComputerActions, prints the outcome,
 * and then re-captures the screen. The loop ends when the model emits a done
 * action, when the model gives no response, when the screen can no longer be
 * captured, or after `maxSteps` steps.
 *
 * @param {string} task - The task text, sent to the model as the user message.
 * @param {number} [maxSteps=15] - Upper bound on loop iterations.
 * @returns {Promise<void>} Progress is reported on the console only.
 */
async function runComputerTask(task, maxSteps = 15) {
  console.log(chalk.cyan.bold('\n 🐙 Otoro Computer Use\n'))
  console.log(chalk.gray(` Task: ${task}`))
  console.log(chalk.gray(` Platform: ${process.platform}\n`))

  // Step 1: See the screen
  let spinner = ora({ text: chalk.gray('Looking at screen...'), color: 'cyan' }).start()
  let screen = await analyzeScreen()
  if (!screen.success) {
    spinner.fail(chalk.red(`Can't see screen: ${screen.error}`))
    console.log(chalk.yellow('\n Tips:'))
    if (process.platform === 'linux') console.log(chalk.gray(' • Install scrot or gnome-screenshot: sudo apt install scrot'))
    if (process.platform === 'darwin') console.log(chalk.gray(' • Grant Screen Recording permission: System Settings → Privacy → Screen Recording'))
    return
  }
  spinner.succeed(chalk.green('Screen captured'))
  console.log(chalk.gray(` I see: ${screen.description.slice(0, 150)}...\n`))

  // Step 2: Loop — AI sees screen, decides action, executes, repeat
  for (let step = 1; step <= maxSteps; step++) {
    console.log(chalk.cyan(` Step ${step}/${maxSteps}`))

    spinner = ora({ text: chalk.gray('Deciding action...'), color: 'cyan' }).start()
    const aiResponse = await computerUseStep(task, screen.description)
    spinner.stop()

    if (!aiResponse) {
      console.log(chalk.red(' No response from AI'))
      break
    }

    // Show what Otoro is thinking (strip action tags for display)
    const thinking = aiResponse.replace(/<action:[^>]*\/>/g, '').trim()
    if (thinking) console.log(chalk.gray(` ${thinking.split('\n')[0].slice(0, 100)}`))

    // Execute actions
    const { results, isDone } = await executeComputerActions(aiResponse)

    for (const r of results) {
      if (!r.success) {
        console.log(chalk.red(` ✗ ${r.error}`))
        continue
      }
      if (r.x !== undefined) {
        // Click results carry a `button` field; move results only carry x/y.
        const label = r.button !== undefined ? 'Click' : 'Move'
        console.log(chalk.green(` ✓ ${label} (${r.x}, ${r.y})`))
      } else if (r.typed) console.log(chalk.green(` ✓ Typed ${r.typed}`))
      else if (r.key) console.log(chalk.green(` ✓ Pressed ${r.key}`))
      else if (r.waited) console.log(chalk.green(` ✓ Waited ${r.waited}`))
    }

    if (isDone) {
      console.log(chalk.green.bold('\n ✓ Task complete!\n'))
      return
    }

    // Wait a moment then re-capture screen
    await new Promise(r => setTimeout(r, 1000))
    spinner = ora({ text: chalk.gray('Looking at updated screen...'), color: 'cyan' }).start()
    const refreshed = await analyzeScreen()
    spinner.stop()

    if (!refreshed.success) {
      // A failed capture has no description; continuing would send the text
      // "undefined" to the model and let it act without seeing the screen.
      console.log(chalk.red(` Can't see screen: ${refreshed.error}`))
      console.log(chalk.yellow('\n Stopping: the screen could not be re-captured. Task may be incomplete.\n'))
      return
    }
    screen = refreshed
    console.log(chalk.gray(` Screen: ${screen.description.slice(0, 100)}...`))
    console.log()
  }

  console.log(chalk.yellow(`\n Reached max steps (${maxSteps}). Task may be incomplete.\n`))
}
73
+
74
+ module.exports = { runComputerTask }
package/lib/screen.js ADDED
@@ -0,0 +1,281 @@
1
+ const { execSync, exec } = require('child_process')
2
+ const fs = require('fs')
3
+ const path = require('path')
4
+ const os = require('os')
5
+ const chalk = require('chalk')
6
+ const http = require('http')
7
+ const { getConfig } = require('./config')
8
+
9
+ const SCREENSHOT_DIR = path.join(os.tmpdir(), 'otoro-screenshots')
10
+ fs.mkdirSync(SCREENSHOT_DIR, { recursive: true })
11
+
12
+ // ─── Screenshots ──────────────────────────────────────────────────────────────
13
+
14
/**
 * Capture the screen into a timestamped PNG under SCREENSHOT_DIR using a
 * platform-specific command-line tool.
 *
 * macOS uses `screencapture`, Windows a PowerShell snippet, and every other
 * platform tries `gnome-screenshot`, then `scrot`, then `import` in that order.
 *
 * @returns {{success: true, path: string, size: number} | {success: false, error: string}}
 */
function takeScreenshot() {
  const file = path.join(SCREENSHOT_DIR, `screen-${Date.now()}.png`)
  try {
    switch (process.platform) {
      case 'darwin':
        execSync(`screencapture -x "${file}"`, { timeout: 5000 })
        break
      case 'win32': {
        // PowerShell screenshot
        const psPath = file.replace(/\\/g, '\\\\')
        execSync(`powershell -command "Add-Type -AssemblyName System.Windows.Forms; $screen = [System.Windows.Forms.Screen]::PrimaryScreen.Bounds; $bitmap = New-Object System.Drawing.Bitmap($screen.Width, $screen.Height); $graphics = [System.Drawing.Graphics]::FromImage($bitmap); $graphics.CopyFromScreen(0, 0, 0, 0, $screen.Size); $bitmap.Save('${psPath}'); $graphics.Dispose(); $bitmap.Dispose()"`, { timeout: 10000 })
        break
      }
      default: {
        // Linux — try multiple tools; the first one that exits cleanly wins.
        const candidates = [
          `gnome-screenshot -f "${file}" 2>/dev/null`,
          `scrot "${file}" 2>/dev/null`,
          `import -window root "${file}" 2>/dev/null`,
        ]
        let captured = false
        let lastError
        for (const command of candidates) {
          try {
            execSync(command, { timeout: 5000 })
            captured = true
            break
          } catch (err) {
            lastError = err
          }
        }
        // If every tool failed, surface the error from the last one tried.
        if (!captured) throw lastError
      }
    }
    if (!fs.existsSync(file)) {
      return { success: false, error: 'Screenshot not created' }
    }
    return { success: true, path: file, size: fs.statSync(file).size }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
37
+
38
+ // ─── Mouse Control ────────────────────────────────────────────────────────────
39
+
40
/**
 * Click at screen coordinates using a platform-specific command-line tool.
 *
 * Coordinates are validated and rounded to integers before being placed in the
 * command string, so a non-numeric value (for example from a remote payload)
 * can never inject shell syntax.
 *
 * Note: the macOS command does not reference `button`, so a right-click
 * request is issued as a plain click there.
 *
 * @param {number|string} x - Horizontal coordinate (number or numeric string).
 * @param {number|string} y - Vertical coordinate (number or numeric string).
 * @param {string} [button='left'] - 'right' selects the right button on Windows and Linux.
 * @returns {{success: true, x: number, y: number, button: string} | {success: false, error: string}}
 */
function mouseClick(x, y, button = 'left') {
  const toCoordinate = (value) => {
    const numeric = typeof value === 'number' || (typeof value === 'string' && value.trim() !== '')
    return numeric ? Math.round(Number(value)) : NaN
  }
  const px = toCoordinate(x)
  const py = toCoordinate(y)
  if (!Number.isSafeInteger(px) || !Number.isSafeInteger(py)) {
    return { success: false, error: 'x and y must be finite numbers' }
  }

  const isRight = button === 'right'
  const platform = process.platform
  try {
    if (platform === 'darwin') {
      execSync(`osascript -e 'tell application "System Events" to click at {${px}, ${py}}'`, { timeout: 3000 })
    } else if (platform === 'win32') {
      // mouse_event flags: 0x0002/0x0004 = left down/up, 0x0008/0x0010 = right down/up.
      execSync(`powershell -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.Cursor]::Position = New-Object System.Drawing.Point(${px}, ${py}); Add-Type -MemberDefinition '[DllImport(\\\"user32.dll\\\")] public static extern void mouse_event(int f,int x,int y,int d,int i);' -Name U -Namespace W; [W.U]::mouse_event(${isRight ? '0x0008' : '0x0002'},0,0,0,0); [W.U]::mouse_event(${isRight ? '0x0010' : '0x0004'},0,0,0,0)"`, { timeout: 5000 })
    } else {
      execSync(`xdotool mousemove ${px} ${py} click ${isRight ? '3' : '1'}`, { timeout: 3000 })
    }
    return { success: true, x: px, y: py, button }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
57
+
58
/**
 * Move the mouse pointer to screen coordinates using a platform-specific
 * command-line tool.
 *
 * Coordinates are validated and rounded to integers before being placed in the
 * command string, so non-numeric input cannot inject shell syntax.
 *
 * @param {number|string} x - Horizontal coordinate (number or numeric string).
 * @param {number|string} y - Vertical coordinate (number or numeric string).
 * @returns {{success: true, x: number, y: number} | {success: false, error: string}}
 */
function mouseMove(x, y) {
  const toCoordinate = (value) => {
    const numeric = typeof value === 'number' || (typeof value === 'string' && value.trim() !== '')
    return numeric ? Math.round(Number(value)) : NaN
  }
  const px = toCoordinate(x)
  const py = toCoordinate(y)
  if (!Number.isSafeInteger(px) || !Number.isSafeInteger(py)) {
    return { success: false, error: 'x and y must be finite numbers' }
  }

  const platform = process.platform
  try {
    if (platform === 'darwin') {
      execSync(`osascript -e 'tell application "System Events" to set position of cursor to {${px}, ${py}}'`, { timeout: 3000 })
    } else if (platform === 'win32') {
      // Load the Forms assembly first, as mouseClick does; a fresh PowerShell
      // session cannot resolve [System.Windows.Forms.Cursor] without it.
      execSync(`powershell -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.Cursor]::Position = New-Object System.Drawing.Point(${px}, ${py})"`, { timeout: 3000 })
    } else {
      execSync(`xdotool mousemove ${px} ${py}`, { timeout: 3000 })
    }
    return { success: true, x: px, y: py }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
73
+
74
+ // ─── Keyboard Control ─────────────────────────────────────────────────────────
75
+
76
/**
 * Type a string through the platform's keyboard automation tool.
 *
 * The command is spawned without a shell (execFileSync with an argument
 * array), so quotes, `$(...)`, backticks and similar characters in `text` are
 * delivered as literal keystrokes instead of being interpreted by a shell.
 *
 * @param {string} text - The text to type.
 * @returns {{success: true, typed: string} | {success: false, error: string}}
 *   On success `typed` is the character count, e.g. "11 chars".
 */
function typeText(text) {
  if (typeof text !== 'string') {
    return { success: false, error: 'text must be a string' }
  }
  // Required here rather than at file level so this function is self-contained.
  const { execFileSync } = require('child_process')
  const platform = process.platform
  try {
    if (platform === 'darwin') {
      // Escape special chars for the AppleScript string literal.
      const escaped = text.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
      execFileSync('osascript', ['-e', `tell application "System Events" to keystroke "${escaped}"`], { timeout: 5000 })
    } else if (platform === 'win32') {
      // SendKeys treats + ^ % ~ ( ) { } [ ] as control characters; wrapping
      // each in braces sends it literally.
      const keys = text.replace(/[+^%~(){}[\]]/g, '{$&}')
      // The text travels in an environment variable so it never has to be
      // quoted inside the PowerShell command line.
      execFileSync(
        'powershell',
        ['-Command', 'Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait($env:OTORO_SENDKEYS)'],
        { timeout: 5000, env: { ...process.env, OTORO_SENDKEYS: keys } }
      )
    } else {
      // "--" ends option parsing so text beginning with "-" is typed, not parsed.
      execFileSync('xdotool', ['type', '--clearmodifiers', '--', text], { timeout: 5000 })
    }
    return { success: true, typed: text.length + ' chars' }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
94
+
95
/**
 * Press a single key or a modifier combination.
 *
 * `key` is a name such as "enter", "tab", "escape", "backspace", "space", a
 * single character, or a "+"-joined combination whose last part is the key and
 * whose earlier parts are modifiers ("ctrl+c", "cmd+shift+s").
 *
 * The command is spawned without a shell (execFileSync with an argument
 * array), so `key` is never interpreted as shell syntax.
 *
 * Platform notes:
 * - macOS: named keys are sent as key codes and single characters via
 *   `keystroke`; other multi-character names are rejected rather than being
 *   typed out as literal text.
 * - Windows: modifiers map to the SendKeys prefixes ^ (ctrl), % (alt) and
 *   + (shift); other multi-character names are sent as `{NAME}`.
 * - Other platforms: names are translated to the X keysym spellings that
 *   xdotool expects and "cmd" becomes "super".
 *
 * @param {string} key - Key name or combination.
 * @returns {{success: true, key: string} | {success: false, error: string}}
 */
function pressKey(key) {
  if (typeof key !== 'string' || key === '') {
    return { success: false, error: 'key must be a non-empty string' }
  }
  // A single character (including "+" or " ") is the key itself, not a combination.
  const parts = key.length === 1 ? [key] : key.split('+').map((part) => part.trim())
  if (parts.some((part) => part === '')) {
    return { success: false, error: `Unsupported key combination: ${key}` }
  }
  const name = parts[parts.length - 1]
  const lower = name.toLowerCase()
  const mods = parts.slice(0, -1).map((mod) => mod.toLowerCase())

  // Required here rather than at file level so this function is self-contained.
  const { execFileSync } = require('child_process')
  const platform = process.platform
  try {
    if (platform === 'darwin') {
      const modNames = new Map([
        ['ctrl', 'control'], ['control', 'control'],
        ['cmd', 'command'], ['command', 'command'],
        ['alt', 'option'], ['option', 'option'],
        ['shift', 'shift'],
      ])
      // macOS virtual key codes for the named keys.
      const keyCodes = new Map([
        ['enter', 36], ['return', 36], ['tab', 48], ['space', 49],
        ['backspace', 51], ['escape', 53], ['esc', 53],
      ])
      const held = mods.map((mod) => {
        if (!modNames.has(mod)) throw new Error(`Unsupported modifier: ${mod}`)
        return `${modNames.get(mod)} down`
      })
      let action
      if (keyCodes.has(lower)) {
        action = `key code ${keyCodes.get(lower)}`
      } else if (name.length === 1) {
        // Lowercase inside a combination so "cmd+S" does not imply an extra shift.
        const ch = (mods.length > 0 ? lower : name).replace(/\\/g, '\\\\').replace(/"/g, '\\"')
        action = `keystroke "${ch}"`
      } else {
        throw new Error(`Unsupported key: ${name}`)
      }
      const using = held.length > 0 ? ` using {${held.join(', ')}}` : ''
      execFileSync('osascript', ['-e', `tell application "System Events" to ${action}${using}`], { timeout: 3000 })
    } else if (platform === 'win32') {
      const modPrefixes = new Map([['ctrl', '^'], ['control', '^'], ['alt', '%'], ['shift', '+']])
      const named = new Map([
        ['enter', '{ENTER}'], ['return', '{ENTER}'], ['tab', '{TAB}'],
        ['escape', '{ESC}'], ['esc', '{ESC}'], ['backspace', '{BS}'], ['space', ' '],
      ])
      const prefix = mods.map((mod) => {
        if (!modPrefixes.has(mod)) throw new Error(`Unsupported modifier: ${mod}`)
        return modPrefixes.get(mod)
      }).join('')
      let token
      if (named.has(lower)) {
        token = named.get(lower)
      } else if (name.length === 1) {
        // Brace-escape SendKeys control characters so they are sent literally.
        token = (mods.length > 0 ? lower : name).replace(/[+^%~(){}[\]]/, '{$&}')
      } else {
        token = `{${name.toUpperCase()}}`
      }
      // The key sequence travels in an environment variable so it never has
      // to be quoted inside the PowerShell command line.
      execFileSync(
        'powershell',
        ['-Command', 'Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait($env:OTORO_SENDKEYS)'],
        { timeout: 3000, env: { ...process.env, OTORO_SENDKEYS: prefix + token } }
      )
    } else {
      // X keysym names are case-sensitive ("Tab", "Escape", "BackSpace").
      const keysyms = new Map([
        ['enter', 'Return'], ['return', 'Return'], ['tab', 'Tab'],
        ['escape', 'Escape'], ['esc', 'Escape'], ['backspace', 'BackSpace'],
        ['space', 'space'], [' ', 'space'], ['+', 'plus'],
        ['cmd', 'super'], ['command', 'super'],
      ])
      const combo = [
        ...mods.map((mod) => keysyms.get(mod) || mod),
        keysyms.get(lower) || name,
      ].join('+')
      execFileSync('xdotool', ['key', combo], { timeout: 3000 })
    }
    return { success: true, key }
  } catch (e) {
    return { success: false, error: e.message }
  }
}
121
+
122
+ // ─── Vision — Send screenshot to Qwen-VL for understanding ───────────────────
123
+
124
/**
 * Take a screenshot and ask the vision model to describe it.
 *
 * The PNG is sent base64-encoded as an `image_url` part of a chat-completions
 * request to `${config.gpu_url}/v1/chat/completions` (model "qwen-vl"). The
 * screenshot file is left on disk and its path is returned to the caller.
 *
 * This function resolves with a failure object instead of rejecting: capture
 * errors, an unreadable file, a malformed URL, network errors, timeouts, HTTP
 * error statuses and unparsable bodies are all reported through `error`.
 *
 * @param {string} [question] - Prompt sent alongside the image.
 * @returns {Promise<{success: true, description: string, screenshot: string} | {success: false, error: string}>}
 */
async function analyzeScreen(question = 'What is on the screen? Describe the UI elements, buttons, and text visible.') {
  const screenshot = takeScreenshot()
  if (!screenshot.success) return { success: false, error: screenshot.error }

  let config
  let body
  let url
  try {
    config = getConfig()
    const b64 = fs.readFileSync(screenshot.path).toString('base64')

    // Send to Qwen-VL vision model
    body = JSON.stringify({
      model: 'qwen-vl',
      messages: [
        { role: 'user', content: [
          { type: 'text', text: question },
          { type: 'image_url', image_url: { url: `data:image/png;base64,${b64}` } }
        ]}
      ],
      max_tokens: 1024,
    })
    url = new URL(`${config.gpu_url}/v1/chat/completions`)
  } catch (e) {
    return { success: false, error: e.message }
  }

  // http.request throws on an https: URL, so pick the matching client.
  const transport = url.protocol === 'https:' ? require('https') : http

  return new Promise((resolve) => {
    let req
    try {
      req = transport.request(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${config.api_key}` },
        timeout: 30000,
      }, (res) => {
        // Decode as UTF-8 across chunk boundaries so multi-byte characters survive.
        res.setEncoding('utf8')
        let data = ''
        res.on('data', (chunk) => { data += chunk })
        res.on('error', (e) => resolve({ success: false, error: e.message }))
        res.on('end', () => {
          if (res.statusCode < 200 || res.statusCode >= 300) {
            resolve({ success: false, error: `Vision model returned HTTP ${res.statusCode}` })
            return
          }
          try {
            const result = JSON.parse(data)
            const description = result.choices?.[0]?.message?.content || ''
            resolve({ success: true, description, screenshot: screenshot.path })
          } catch {
            resolve({ success: false, error: 'Bad response from vision model' })
          }
        })
      })
    } catch (e) {
      // Synchronous failures, e.g. an unsupported URL protocol.
      resolve({ success: false, error: e.message })
      return
    }
    // The `timeout` option only emits an event; the request has to be
    // destroyed explicitly, which then surfaces through the 'error' handler.
    req.on('timeout', () => req.destroy(new Error('Vision model request timed out')))
    req.on('error', (e) => resolve({ success: false, error: e.message }))
    req.write(body)
    req.end()
  })
}
166
+
167
+ // ─── Live Screen Monitor ──────────────────────────────────────────────────────
168
+
169
/**
 * Polls the screen on a fixed interval and reports description changes.
 *
 * Each tick calls analyzeScreen with a short "what changed" prompt; when the
 * returned description differs from the previous one, the callback passed to
 * start() is invoked with (description, screenshotPath).
 */
class ScreenMonitor {
  /**
   * @param {number} [intervalMs=3000] - Delay between the end of one analysis and the start of the next.
   */
  constructor(intervalMs = 3000) {
    this.interval = intervalMs
    this.running = false
    this.timer = null
    this.lastDescription = ''
    this.onUpdate = null
  }

  /**
   * Begin polling. Calling start() while already running only replaces the
   * callback; it does not spawn a second polling loop.
   *
   * @param {(description: string, screenshot: string) => void} callback
   */
  start(callback) {
    this.onUpdate = callback
    if (this.running) return
    this.running = true
    console.log(chalk.cyan(` 👁 Screen monitor started (every ${this.interval / 1000}s)`))
    this.tick()
  }

  /** Stop polling and cancel any pending tick. */
  stop() {
    this.running = false
    if (this.timer) {
      clearTimeout(this.timer)
      this.timer = null
    }
    console.log(chalk.gray(' Screen monitor stopped'))
  }

  /** Run one analysis, notify on change, and schedule the next tick. */
  async tick() {
    if (!this.running) return
    try {
      const result = await analyzeScreen('Briefly describe what is currently visible on screen. Note any dialogs, windows, error messages, or UI changes.')
      // stop() may have been called while the analysis was in flight.
      if (this.running && result.success && result.description !== this.lastDescription) {
        this.lastDescription = result.description
        if (this.onUpdate) this.onUpdate(result.description, result.screenshot)
      }
    } catch (e) {
      // One failed analysis or a throwing callback must not end the monitor.
      console.log(chalk.red(` Screen monitor error: ${e && e.message ? e.message : e}`))
    }
    if (!this.running) return
    // Clear first so overlapping ticks can never leave two timers pending.
    if (this.timer) clearTimeout(this.timer)
    this.timer = setTimeout(() => this.tick(), this.interval)
  }
}
201
+
202
+ // ─── Computer Use Agent — AI sees screen and controls computer ────────────────
203
+
204
/**
 * Ask the model for the next computer-use actions.
 *
 * Sends a chat-completions request to `${config.gpu_url}/v1/chat/completions`
 * (model "qwen-coder") whose system prompt lists the available action tags and
 * embeds the current screen description; the task is the user message.
 *
 * @param {string} task - What the user wants done.
 * @param {string} screenDescription - Text description of the current screen.
 * @returns {Promise<string>} The model's reply, or '' on any failure (bad URL,
 *   network error, timeout, unparsable body, or a reply with no content).
 */
async function computerUseStep(task, screenDescription) {
  const config = getConfig()
  const body = JSON.stringify({
    model: 'qwen-coder',
    messages: [
      { role: 'system', content: `You are Otoro controlling a ${process.platform} computer. You can see the screen and control mouse/keyboard.

Available actions (use XML tags):
- <action:click x="123" y="456"/> — click at coordinates
- <action:rightclick x="123" y="456"/> — right-click
- <action:type text="hello world"/> — type text
- <action:key press="enter"/> — press a key (enter, tab, escape, ctrl+c, cmd+s, etc.)
- <action:move x="123" y="456"/> — move mouse
- <action:screenshot/> — take a new screenshot
- <action:wait ms="1000"/> — wait before next action
- <action:done/> — task is complete

Current screen: ${screenDescription}

Execute the task step by step. After each action, I'll show you the updated screen.` },
      { role: 'user', content: task }
    ],
    max_tokens: 2048,
  })

  return new Promise((resolve) => {
    let req
    try {
      const url = new URL(`${config.gpu_url}/v1/chat/completions`)
      // http.request throws on an https: URL, so pick the matching client.
      const transport = url.protocol === 'https:' ? require('https') : http
      req = transport.request(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${config.api_key}` },
        timeout: 60000,
      }, (res) => {
        // Decode as UTF-8 across chunk boundaries so multi-byte characters survive.
        res.setEncoding('utf8')
        let data = ''
        res.on('data', (chunk) => { data += chunk })
        res.on('error', () => resolve(''))
        res.on('end', () => {
          try {
            const result = JSON.parse(data)
            resolve(result.choices?.[0]?.message?.content || '')
          } catch {
            resolve('')
          }
        })
      })
    } catch {
      // Malformed URL or unsupported protocol: report "no response".
      resolve('')
      return
    }
    // The `timeout` option only emits an event; destroy the request so the
    // promise settles through the 'error' handler instead of hanging.
    req.on('timeout', () => req.destroy(new Error('request timed out')))
    req.on('error', () => resolve(''))
    req.write(body)
    req.end()
  })
}
250
+
251
/**
 * Execute the action tags found in a model response.
 *
 * Recognised tags are click, rightclick, move, type, key and wait. They are
 * run in the order they appear in the response, so "type, then press enter,
 * then click" happens in that sequence. A done tag (with or without a space
 * before the slash) sets `isDone`; other tags and plain text are ignored.
 *
 * @param {string} response - Raw model output containing <action:.../> tags.
 * @returns {Promise<{results: object[], isDone: boolean}>} One result object
 *   per executed action, in execution order.
 */
async function executeComputerActions(response) {
  const text = typeof response === 'string' ? response : ''
  const results = []

  // One alternation per action shape, scanned left to right in a single pass.
  // Groups: 1 = click|rightclick|move, 2 = x, 3 = y, 4 = text, 5 = key, 6 = ms.
  const actionPattern = /<action:(?:(click|rightclick|move)\s+x="(\d+)"\s+y="(\d+)"|type\s+text="([^"]+)"|key\s+press="([^"]+)"|wait\s+ms="(\d+)")\s*\/>/g

  for (const match of text.matchAll(actionPattern)) {
    const [, pointer, x, y, typed, keyName, ms] = match
    if (pointer !== undefined) {
      const px = parseInt(x, 10)
      const py = parseInt(y, 10)
      if (pointer === 'click') results.push(mouseClick(px, py))
      else if (pointer === 'rightclick') results.push(mouseClick(px, py, 'right'))
      else results.push(mouseMove(px, py))
    } else if (typed !== undefined) {
      results.push(typeText(typed))
    } else if (keyName !== undefined) {
      results.push(pressKey(keyName))
    } else if (ms !== undefined) {
      await new Promise(r => setTimeout(r, parseInt(ms, 10)))
      results.push({ success: true, waited: ms + 'ms' })
    }
  }

  const isDone = /<action:done\s*\/>/.test(text)
  return { results, isDone }
}
277
+
278
+ module.exports = {
279
+ takeScreenshot, mouseClick, mouseMove, typeText, pressKey,
280
+ analyzeScreen, ScreenMonitor, computerUseStep, executeComputerActions
281
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "otoro-cli",
3
- "version": "1.3.0",
3
+ "version": "1.4.0",
4
4
  "description": "Otoro AGI — AI coding assistant for your terminal. Code, generate images, execute tasks, and control your projects remotely.",
5
5
  "main": "index.js",
6
6
  "bin": {