@simular-ai/simulang-js 8.0.0 → 9.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/CLAUDE.md +18 -8
- package/index.d.ts +173 -17
- package/index.js +67 -56
- package/package.json +21 -15
- package/sai.d.ts +868 -0
- package/sai.js +749 -0
package/sai.js
ADDED
|
@@ -0,0 +1,749 @@
|
|
|
1
|
+
// Sai / Unified UI compatibility syntax over @simular-ai/simulang-js.
|
|
2
|
+
//
|
|
3
|
+
// Implementations mirror simular-pro-unified-ui's primitive bodies as closely
|
|
4
|
+
// as possible:
|
|
5
|
+
// - free desktop primitives -> app/src/main/primitives/primitives-core/index.ts
|
|
6
|
+
// - ref-based launch/getApp -> app/src/main/primitives/primitives-desktop/index.ts
|
|
7
|
+
//
|
|
8
|
+
// Where unified-ui branches on a capability that has no standalone simulang-js
|
|
9
|
+
// equivalent (cloud endpoints, the LLM accessibility element resolver, the Sai
|
|
10
|
+
// product runtime), the unimplementable branch is left as a comment marked
|
|
11
|
+
// MISSING / NOTE so the gap is visible, and the implementable branch is kept.
|
|
12
|
+
|
|
13
|
+
let nativeBinding = globalThis.__SIMULANG_JS_SAI_NATIVE__ ?? null
|
|
14
|
+
|
|
15
|
+
/**
 * Returns the native simulang-js binding, loading it on first use.
 *
 * If the module-level `nativeBinding` is already set it is returned as-is;
 * otherwise `./wrapped.js` is required, cached in `nativeBinding`, and returned.
 *
 * @returns {object} The cached native binding module.
 */
function binding() {
  if (nativeBinding) return nativeBinding
  nativeBinding = require('./wrapped.js')
  return nativeBinding
}
|
|
19
|
+
|
|
20
|
+
// unified-ui constructs one controller per process at module load. We do the
|
|
21
|
+
// same but lazily, so importing this module never forces the native binding to
|
|
22
|
+
// load before a supported primitive is actually called.
|
|
23
|
+
let mouseController = null
|
|
24
|
+
let keyboardController = null
|
|
25
|
+
let clipboardController = null
|
|
26
|
+
/**
 * Returns the shared MouseController, constructing it on the first call.
 *
 * The native binding is resolved on every call; the controller itself is
 * cached in the module-level `mouseController`.
 */
function mouse() {
  const MouseControllerCtor = binding().MouseController
  if (mouseController == null) {
    mouseController = new MouseControllerCtor()
  }
  return mouseController
}
|
|
30
|
+
/**
 * Returns the shared KeyboardController, constructing it on the first call.
 *
 * The native binding is resolved on every call; the controller itself is
 * cached in the module-level `keyboardController`.
 */
function keyboard() {
  const KeyboardControllerCtor = binding().KeyboardController
  if (keyboardController == null) {
    keyboardController = new KeyboardControllerCtor()
  }
  return keyboardController
}
|
|
34
|
+
/**
 * Returns the shared Clipboard wrapper, constructing it on the first call.
 *
 * The native binding is resolved on every call; the instance itself is cached
 * in the module-level `clipboardController`.
 */
function clipboard() {
  const ClipboardCtor = binding().Clipboard
  if (clipboardController == null) {
    clipboardController = new ClipboardCtor()
  }
  return clipboardController
}
|
|
38
|
+
|
|
39
|
+
// unified-ui defaults grounding to 'vision' (BaseSimularPrimitives.defaultGroundingMode).
|
|
40
|
+
const DEFAULT_GROUNDING_MODE = 'vision'
|
|
41
|
+
|
|
42
|
+
/**
 * Error thrown when a Sai primitive has no standalone simulang-js implementation.
 *
 * Besides the formatted message, the instance exposes `primitive`, `reason`,
 * `closestNativeApi` and `category` so callers can inspect the gap
 * programmatically.
 */
class UnsupportedSaiPrimitiveError extends Error {
  /**
   * @param {string} primitive - Name of the unsupported primitive.
   * @param {{reason?: string, closestNativeApi?: string, category?: string}} [details]
   *   Optional explanation; `reason` falls back to a generic sentence.
   */
  constructor(primitive, details = {}) {
    const { closestNativeApi, category } = details
    const reason = details.reason ?? 'This Sai primitive is not available in standalone simulang-js.'

    // Optional suffixes are appended only when the corresponding detail is truthy.
    const messageParts = [`${primitive} is unsupported. ${reason}`]
    if (closestNativeApi) messageParts.push(` Closest native API: ${closestNativeApi}.`)
    if (category) messageParts.push(` Category: ${category}.`)

    super(messageParts.join(''))
    this.name = 'UnsupportedSaiPrimitiveError'
    Object.assign(this, { primitive, reason, closestNativeApi, category })
  }
}
|
|
55
|
+
|
|
56
|
+
/**
 * Always throws an UnsupportedSaiPrimitiveError for `primitive`.
 *
 * @param {string} primitive - Name of the unsupported primitive.
 * @param {object} [details] - Forwarded to the error constructor.
 * @throws {UnsupportedSaiPrimitiveError} Unconditionally.
 */
function unsupported(primitive, details) {
  const error = new UnsupportedSaiPrimitiveError(primitive, details)
  throw error
}
|
|
59
|
+
|
|
60
|
+
/**
 * Builds a synchronous stub that throws UnsupportedSaiPrimitiveError when called.
 *
 * @param {string} primitive - Name reported in the error.
 * @param {object} [details] - Forwarded to `unsupported`.
 * @returns {Function} A function that throws synchronously on every call.
 */
function unsupportedFunction(primitive, details) {
  function unsupportedSaiPrimitive() {
    unsupported(primitive, details)
  }
  return unsupportedSaiPrimitive
}
|
|
65
|
+
|
|
66
|
+
/**
 * Builds an async stub whose returned promise rejects with
 * UnsupportedSaiPrimitiveError (the throw happens inside an async function).
 *
 * @param {string} primitive - Name reported in the error.
 * @param {object} [details] - Forwarded to `unsupported`.
 * @returns {Function} An async function that always rejects.
 */
function unsupportedAsyncFunction(primitive, details) {
  async function unsupportedSaiPrimitive() {
    unsupported(primitive, details)
  }
  return unsupportedSaiPrimitive
}
|
|
71
|
+
|
|
72
|
+
/**
 * Builds a placeholder namespace whose every use reports an unsupported primitive.
 *
 * - Reading any property (other than `Symbol.toStringTag`) throws for
 *   `<primitive>.<prop>`.
 * - Calling the namespace itself throws for `<primitive>`.
 *
 * The proxy target is a function on purpose: a Proxy `apply` trap only fires
 * when the target is callable, so an object-literal target would surface a
 * plain "is not a function" TypeError instead of the intended error.
 *
 * @param {string} primitive - Namespace name reported in the error.
 * @param {object} [details] - Forwarded to `unsupported`.
 * @returns {Function} A callable proxy that throws on property access and on call.
 */
function unsupportedNamespace(primitive, details) {
  // Arrow function: callable, but carries no own `prototype` property.
  const callableTarget = () => {}
  return new Proxy(callableTarget, {
    get(_target, prop) {
      if (prop === Symbol.toStringTag) return 'UnsupportedSaiNamespace'
      unsupported(`${primitive}.${String(prop)}`, details)
    },
    apply() {
      unsupported(primitive, details)
    },
  })
}
|
|
86
|
+
|
|
87
|
+
/**
 * Removes a leading `data:image/<subtype>;base64,` prefix from a data URL.
 *
 * The subtype match is case-insensitive and accepts letters, digits, `.`, `+`
 * and `-`, so `image/svg+xml` or `image/x-icon` are stripped as well as
 * `image/png`. Input without such a prefix is returned unchanged.
 *
 * @param {string} dataUrl - Data URL or raw base64 string.
 * @returns {string} The payload without the data-URL prefix.
 */
function stripDataUrl(dataUrl) {
  return dataUrl.replace(/^data:image\/[a-z0-9.+-]+;base64,/i, '')
}
|
|
90
|
+
|
|
91
|
+
/**
 * Blocks the current thread for roughly `ms` milliseconds.
 *
 * Implemented with `Atomics.wait` on a throwaway shared buffer, so it is a
 * synchronous sleep. The duration is rounded to an integer and negative values
 * are clamped to 0.
 *
 * @param {number} ms - Duration in milliseconds.
 * @throws {TypeError} If `ms` is not a number (NaN after rounding). Atomics.wait
 *   treats a NaN timeout as "wait forever", so this is rejected up front instead
 *   of silently hanging the process.
 */
function sleep(ms) {
  const rounded = Math.round(ms)
  if (Number.isNaN(rounded)) {
    throw new TypeError(`sleep() requires a numeric duration in milliseconds, received ${String(ms)}`)
  }
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, Math.max(0, rounded))
}
|
|
94
|
+
|
|
95
|
+
/**
 * Reports whether the current process runs on Windows.
 *
 * @returns {boolean} True when `process.platform` is `'win32'`.
 */
function isWindows() {
  const { platform } = process
  return platform === 'win32'
}
|
|
98
|
+
|
|
99
|
+
// ── Timing ──
|
|
100
|
+
|
|
101
|
+
/**
 * Sai `wait` primitive: blocks for the requested amount of time.
 *
 * @param {{waitTime: number, unit?: string}} params - `waitTime` is taken as
 *   milliseconds when `unit` is exactly `'ms'`; any other `unit` (default
 *   `'s'`) is treated as seconds.
 */
function wait(params) {
  const unit = params.unit === undefined ? 's' : params.unit
  const waitTime = params.waitTime
  const milliseconds = unit === 'ms' ? waitTime : waitTime * 1000
  sleep(milliseconds)
}
|
|
105
|
+
|
|
106
|
+
// ── Clipboard ──
|
|
107
|
+
|
|
108
|
+
/**
 * Sai `copyToClipboard` primitive: stores `params.text` in the clipboard.
 *
 * @param {{text: string}} params - Text handed to the clipboard's `setString`.
 */
function copyToClipboard(params) {
  const board = clipboard()
  board.setString(params.text)
}
|
|
111
|
+
|
|
112
|
+
/**
 * Sai `getFromClipboard` primitive: reads the clipboard as a string.
 *
 * @returns {string} The clipboard's `getString()` result, or `''` when that
 *   result is null or undefined.
 */
function getFromClipboard() {
  const contents = clipboard().getString()
  return contents ?? ''
}
|
|
115
|
+
|
|
116
|
+
// ── Keyboard ──
|
|
117
|
+
|
|
118
|
+
/**
 * Sai `press` primitive: clicks one key, optionally with modifiers held.
 *
 * Modifiers are pressed in the order cmd (Meta), shift, option/alt (Alt), ctrl
 * (Control), the key is clicked, then the modifiers are released in the same
 * order. Release happens in a `finally` block and only for modifiers that were
 * actually pressed, so a failure part-way through never leaves a modifier
 * stuck down on the user's machine.
 *
 * @param {{key: string, cmd?: boolean, shift?: boolean, option?: boolean,
 *   alt?: boolean, ctrl?: boolean}} params
 * @throws {Error} If `key` is missing/empty, or whatever `keyFromString` /
 *   the keyboard controller throws.
 */
function press(params) {
  const { key, cmd = false, shift = false, option = false, alt = false, ctrl = false } = params
  if (!key) {
    throw new Error('No key argument was provided for press()')
  }
  const { keyFromString, Key, Direction } = binding()
  // keyFromString throws for invalid values (matches unified-ui).
  const keyToPress = keyFromString(key)
  const kb = keyboard()

  const requestedModifiers = []
  if (cmd) requestedModifiers.push(Key.Meta)
  if (shift) requestedModifiers.push(Key.Shift)
  if (option || alt) requestedModifiers.push(Key.Alt)
  if (ctrl) requestedModifiers.push(Key.Control)

  // Track what is really held so cleanup mirrors exactly what was pressed.
  const heldModifiers = []
  try {
    for (const modifier of requestedModifiers) {
      kb.key(modifier, Direction.Press)
      heldModifiers.push(modifier)
    }
    kb.key(keyToPress, Direction.Click)
  } finally {
    for (const modifier of heldModifiers) {
      kb.key(modifier, Direction.Release)
    }
  }
}
|
|
140
|
+
|
|
141
|
+
/**
 * Sai `shortCut` primitive: `press` followed by an optional pause.
 *
 * @param {object} params - Same fields as `press`, plus optional `waitTime`
 *   (passed to `wait` without a unit, i.e. seconds). The pause only happens
 *   when `waitTime` is greater than 0.
 */
function shortCut(params) {
  // Everything except waitTime is forwarded to press().
  const pressParams = { ...params }
  delete pressParams.waitTime
  const waitTime = params.waitTime === undefined ? 0 : params.waitTime

  press(pressParams)
  if (!(waitTime > 0)) return
  wait({ waitTime })
}
|
|
146
|
+
|
|
147
|
+
/**
 * Sai `type` primitive: enters text, optionally followed by Return.
 *
 * Text longer than 40 characters is pasted through the clipboard (much faster
 * than char-by-char); shorter text is typed on the keyboard so paste-blocking
 * fields such as passwords still work. A 0.5 s pause follows either path.
 *
 * @param {{text: string, withReturn?: boolean}} params
 */
function type(params) {
  const { text, withReturn } = params
  const { Key, Direction } = binding()

  const pasteViaClipboard = text.length > 40
  if (pasteViaClipboard) {
    clipboard().pasteText(text)
  } else {
    keyboard().text(text)
  }

  wait({ waitTime: 0.5 })
  if (!withReturn) return
  keyboard().key(Key.Return, Direction.Click)
}
|
|
162
|
+
|
|
163
|
+
// ── Grounding (concept -> coordinates) ──
|
|
164
|
+
|
|
165
|
+
// Vision grounding: screenshot + VLM -> global desktop coordinates.
|
|
166
|
+
// Mirrors unified-ui getLocationByConceptFromVisualGrounding, which branches on
|
|
167
|
+
// whether a crop region is supplied. The native Screenshot.ground performs the
|
|
168
|
+
// VLM call and coordinate mapping that unified-ui does by hand against the cloud.
|
|
169
|
+
/**
 * Grounds `concept` to a point by running the default grounding model on a screenshot.
 *
 * When `cropBounds` describes a region with positive width and height, a
 * cropped screenshot of that region is used; otherwise (no bounds, or an empty
 * / inverted rectangle) a full screenshot of the screen under the mouse is used.
 *
 * Model selection differs from unified-ui: it picks a specific cloud-configured
 * production model, whereas standalone simulang-js uses whatever the local
 * provider config advertises as the default.
 *
 * @param {string} concept - Description of the thing to locate.
 * @param {{left: number, top: number, right: number, bottom: number}} [cropBounds]
 * @returns {{x: number, y: number}} The point returned by `screenshot.ground`.
 * @throws {Error} Always the same generic message, so callers don't branch on
 *   provider-specific text; the underlying failure is preserved as `error.cause`.
 */
function getLocationByConceptFromVisualGrounding(concept, cropBounds) {
  try {
    const { GroundingModel, Screen, screenshotCropped, screenshotFull } = binding()
    const model = GroundingModel.default()

    const width = cropBounds ? cropBounds.right - cropBounds.left : 0
    const height = cropBounds ? cropBounds.bottom - cropBounds.top : 0
    const hasUsableCrop = width > 0 && height > 0

    const screenshot = hasUsableCrop
      ? screenshotCropped(cropBounds.left, cropBounds.top, width, height, true)
      : screenshotFull(false, Screen.fromCurrentMouseLocation())

    const [x, y] = screenshot.ground(model, concept)
    return { x, y }
  } catch (error) {
    throw new Error('Failed to get location by concept from visual grounding', { cause: error })
  }
}
|
|
199
|
+
|
|
200
|
+
// Mirrors unified-ui getElementByConcept across platforms. Returns an
|
|
201
|
+
// AutomationElement (same shape unified-ui produces) or null when nothing resolves.
|
|
202
|
+
// - Windows resolves via an exact `overallDescription` match in the active
|
|
203
|
+
// window's AX tree (uia.findElementByOverallDescription, a deterministic
|
|
204
|
+
// string-equality walk — NOT an LLM). The cross-platform native equivalent
|
|
205
|
+
// is AccessibilityTree.findByDescription(concept).
|
|
206
|
+
// - macOS/Linux in unified-ui resolve via vision grounding wrapped in a
|
|
207
|
+
// synthetic element; we keep that behavior even though standalone
|
|
208
|
+
// simulang-js can also read accessibility trees on those platforms.
|
|
209
|
+
/**
 * Resolves `concept` to an AutomationElement-shaped object, or null.
 *
 * - On Windows: takes the first result of
 *   `AccessibilityTree.fromForeground().findByDescription(concept)` and returns
 *   it spread into a new object with `processId: 0`, `boundingRect` copied from
 *   the node's `boundingBox`, and an empty `children` list.
 * - Elsewhere: grounds the concept visually and wraps the point in a synthetic
 *   10x10 element centred on it.
 *
 * Any error in either path is swallowed and reported as null.
 *
 * @param {string} concept - Description of the element to find.
 * @returns {object|null} The element, or null when nothing resolves.
 */
function getElementByConcept(concept) {
  if (!isWindows()) {
    try {
      const point = getLocationByConceptFromVisualGrounding(concept)
      // Synthetic 10x10 element around the grounded point — mirrors unified-ui macOS/Linux.
      const halfSize = 5
      const boundingRect = {
        left: point.x - halfSize,
        top: point.y - halfSize,
        right: point.x + halfSize,
        bottom: point.y + halfSize,
      }
      return {
        name: concept,
        className: '',
        controlType: 0,
        localizedControlType: 'visual-grounding',
        description: `Visual grounding result for: ${concept}`,
        overallDescription: concept,
        helpText: '',
        value: '',
        automationId: '',
        processId: 0,
        isEnabled: true,
        boundingRect,
        children: [],
      }
    } catch {
      return null
    }
  }

  try {
    const { AccessibilityTree } = binding()
    const [firstMatch] = AccessibilityTree.fromForeground().findByDescription(concept)
    if (!firstMatch) return null
    return { ...firstMatch, processId: 0, boundingRect: firstMatch.boundingBox, children: [] }
  } catch {
    return null
  }
}
|
|
251
|
+
|
|
252
|
+
// Mirrors unified-ui getLocationByConcept(concept, mode).
|
|
253
|
+
/**
 * Resolves `concept` to a screen point.
 *
 * With `mode === 'vision'` the accessibility lookup is skipped and the concept
 * is grounded visually. Otherwise `getElementByConcept` is tried first and the
 * centre of its `boundingRect` is returned; if no element resolves, visual
 * grounding is used as the final fallback.
 *
 * @param {string} concept - Description of the thing to locate.
 * @param {string} [mode] - `'vision'` to force visual grounding.
 * @returns {{x: number, y: number}}
 */
function getLocationByConcept(concept, mode) {
  const element = mode === 'vision' ? null : getElementByConcept(concept)
  if (!element) {
    return getLocationByConceptFromVisualGrounding(concept)
  }
  const { left, top, right, bottom } = element.boundingRect
  const centerX = (left + right) / 2
  const centerY = (top + bottom) / 2
  return { x: centerX, y: centerY }
}
|
|
270
|
+
|
|
271
|
+
// Sai ref-based ground({ concept, app? }). Mirrors unified-ui ground(concept, appBounds):
|
|
272
|
+
// when an app is supplied, crop to the window bounds for less noise / better accuracy.
|
|
273
|
+
/**
 * Sai ref-based `ground({ concept, app? })`.
 *
 * When `app` exposes a `getWindowBounds` function, its result is used as the
 * crop region for visual grounding; otherwise the whole screen is grounded.
 *
 * @param {{concept: string, app?: object}} params
 * @returns {Promise<{x: number, y: number}>}
 */
async function ground(params) {
  const { app } = params
  let cropBounds
  if (app && typeof app.getWindowBounds === 'function') {
    cropBounds = app.getWindowBounds()
  }
  return getLocationByConceptFromVisualGrounding(params.concept, cropBounds)
}
|
|
278
|
+
|
|
279
|
+
// ── Mouse ──
|
|
280
|
+
|
|
281
|
+
/**
 * Sai `click` primitive.
 *
 * Two overloads, selected by whether `params` has a `concept` key:
 * - Concept click (legacy Sai): optionally holds Meta (Cmd), moves to the
 *   concept via `move(params)`, performs a left / right / double click
 *   according to `clickType` (other values click nothing), releases Meta and
 *   pauses 0.5 s.
 * - Coordinate click: moves to `(x, y)` and left-clicks; it carries no
 *   clickType or modifiers.
 *
 * Meta is released in a `finally` block so a grounding or click failure never
 * leaves the Cmd key held down.
 *
 * @param {{concept: string, clickType?: string, withCommand?: boolean, mode?: string}
 *   | {x: number, y: number}} params
 */
function click(params) {
  if ('concept' in params) {
    const { clickType = 'left', withCommand = false } = params
    const { Button, Direction, Key } = binding()
    const kb = keyboard()

    if (withCommand) kb.key(Key.Meta, Direction.Press) // Meta == Cmd
    try {
      move(params) // concept move to the location
      const mc = mouse()
      if (clickType === 'left') {
        mc.button(Button.Left, Direction.Click)
      } else if (clickType === 'right') {
        mc.button(Button.Right, Direction.Click)
      } else if (clickType === 'doubleClick') {
        mc.button(Button.Left, Direction.Click)
        mc.button(Button.Left, Direction.Click)
      }
    } finally {
      if (withCommand) kb.key(Key.Meta, Direction.Release)
    }
    wait({ waitTime: 0.5 })
    return
  }

  // Coordinate click. Mirrors unified-ui clickAt(): move + left click only.
  const { Button, Coordinate, Direction } = binding()
  const mc = mouse()
  mc.moveMouse(params.x, params.y, Coordinate.Abs)
  mc.button(Button.Left, Direction.Click)
}
|
|
311
|
+
|
|
312
|
+
/**
 * Sai `move` primitive.
 *
 * - Concept move (when `params` has a `concept` key): resolves the concept via
 *   `getLocationByConcept` using `params.mode` (default
 *   `DEFAULT_GROUNDING_MODE`), moves there and pauses 0.5 s.
 * - Coordinate move: moves straight to `(params.x, params.y)` with no pause.
 *
 * @param {{concept: string, mode?: string} | {x: number, y: number}} params
 */
function move(params) {
  const isConceptMove = 'concept' in params

  let target = params
  if (isConceptMove) {
    const { concept, mode = DEFAULT_GROUNDING_MODE } = params
    target = getLocationByConcept(concept, mode)
  }

  const { Coordinate } = binding()
  mouse().moveMouse(target.x, target.y, Coordinate.Abs)

  if (isConceptMove) {
    wait({ waitTime: 0.5 })
  }
}
|
|
327
|
+
|
|
328
|
+
// Smooth, interpolated drag — ported from unified-ui smoothDrag so apps detect
|
|
329
|
+
// continuous movement (drop zones need real intermediate moves + dwell).
|
|
330
|
+
/**
 * Performs a smooth, interpolated left-button drag from one point to another.
 *
 * Sequence: move to the start, press, a small 5px jiggle to exceed the Windows
 * drag threshold (SM_CXDRAG = 4px), then linear interpolation in
 * `max(5, round(distance / 20))` steps, a 500 ms hover so the drop zone
 * activates, and release.
 *
 * The release is in a `finally` block: once the button is down it is always
 * let go, even if an intermediate move throws, so a failed drag cannot leave
 * the user's mouse button stuck.
 *
 * @param {number} fromX - Start X (absolute).
 * @param {number} fromY - Start Y (absolute).
 * @param {number} toX - End X (absolute).
 * @param {number} toY - End Y (absolute).
 */
function smoothDrag(fromX, fromY, toX, toY) {
  const { Button, Coordinate, Direction } = binding()
  const mc = mouse()

  mc.moveMouse(fromX, fromY, Coordinate.Abs)
  sleep(200)
  mc.button(Button.Left, Direction.Press)
  try {
    sleep(100)
    // Initial jiggle — exceed the Windows drag threshold (SM_CXDRAG = 4px).
    mc.moveMouse(fromX + 5, fromY + 5, Coordinate.Abs)
    sleep(50)

    const dx = toX - fromX
    const dy = toY - fromY
    const distance = Math.sqrt(dx * dx + dy * dy)
    const steps = Math.max(5, Math.round(distance / 20))
    for (let i = 1; i <= steps; i++) {
      const t = i / steps
      mc.moveMouse(Math.round(fromX + dx * t), Math.round(fromY + dy * t), Coordinate.Abs)
      sleep(15)
    }

    sleep(500) // hover over target so the drop zone activates
  } finally {
    mc.button(Button.Left, Direction.Release)
  }
}
|
|
355
|
+
|
|
356
|
+
/**
 * Sai `drag` primitive: smooth drag between two absolute points.
 *
 * @param {{fromX: number, fromY: number, toX: number, toY: number}} params
 */
function drag(params) {
  const { fromX, fromY, toX, toY } = params
  smoothDrag(fromX, fromY, toX, toY)
}
|
|
359
|
+
|
|
360
|
+
/**
 * Sai `scroll` primitive: scrolls in one of four directions, then pauses 0.15 s.
 *
 * `direction` is matched case-insensitively. `up`/`down` pass `-distance` /
 * `distance` as the second argument of the mouse controller's `scroll`;
 * `left`/`right` pass them as the first.
 *
 * @param {{direction?: string, distance?: number}} [params] - Defaults to
 *   `'down'` by 200.
 * @throws {Error} For any direction other than up/down/left/right.
 */
function scroll(params = {}) {
  const { direction = 'down', distance = 200 } = params
  const mc = mouse()

  // Lazily-evaluated handlers keyed by the normalized direction.
  const handlers = new Map([
    ['up', () => mc.scroll(0, -distance)],
    ['down', () => mc.scroll(0, distance)],
    ['left', () => mc.scroll(-distance, 0)],
    ['right', () => mc.scroll(distance, 0)],
  ])

  const handler = handlers.get(direction.toLowerCase())
  if (!handler) {
    throw new Error(`Invalid scroll direction: ${direction}`)
  }
  handler()
  wait({ waitTime: 0.15 })
}
|
|
381
|
+
|
|
382
|
+
// ── App / window ──
|
|
383
|
+
|
|
384
|
+
/**
 * Sai `open` primitive: opens an app and/or URL through the native `legacyOpen`.
 *
 * Always uses `FocusPolicy.Steal` and `Visibility.Show`.
 * TODO (unified-ui parity): allow caller to specify focus policy and visibility.
 *
 * @param {{app?: string, url?: string}} params
 */
function open(params) {
  const { app, url } = params
  const native = binding()
  const { legacyOpen } = native
  legacyOpen(app, url, native.FocusPolicy.Steal, native.Visibility.Show)
}
|
|
390
|
+
|
|
391
|
+
/**
 * Serializes an accessibility node and its descendants into indented text lines.
 *
 * Each node becomes `<indent>- <role>` followed by ` "<name> <value>"` when
 * either is truthy. Numeric roles are converted with the native
 * `ariaRoleToString`; anything else goes through `String()`.
 *
 * @param {object} node - Node with `role`, optional `name`, `value`, `children`.
 * @param {number} [depth=0] - Nesting level; controls the indent repeat count.
 * @param {string[]} [lines=[]] - Accumulator, mutated and returned.
 * @returns {string[]} The same `lines` array with this subtree appended.
 */
function serializeNode(node, depth = 0, lines = []) {
  const { ariaRoleToString } = binding()
  const indent = ' '.repeat(depth)
  const roleText = typeof node.role !== 'number' ? String(node.role) : ariaRoleToString(node.role)

  const labelParts = []
  if (node.name) labelParts.push(node.name)
  if (node.value) labelParts.push(node.value)
  const label = labelParts.join(' ')

  let line = `${indent}- ${roleText}`
  if (label) line += ` "${label}"`
  lines.push(line)

  const children = node.children ?? []
  for (const childNode of children) {
    serializeNode(childNode, depth + 1, lines)
  }
  return lines
}
|
|
402
|
+
|
|
403
|
+
// unified-ui pageContent is abstract and platform-specific; this is the
|
|
404
|
+
// cross-platform approximation: foreground AX text + a full-screen screenshot.
|
|
405
|
+
/**
 * Sai `pageContent` primitive: best-effort capture of the current screen.
 *
 * Cross-platform approximation of unified-ui's platform-specific pageContent:
 * the foreground accessibility tree serialized as text, plus a full screenshot
 * of the screen under the mouse as base64. Each half is captured independently
 * and falls back to `''` if it throws.
 *
 * @returns {{text: string, imageBase64: string}}
 */
function pageContent() {
  const captureText = () => {
    try {
      const { AccessibilityTree } = binding()
      return serializeNode(AccessibilityTree.fromForeground().snapshot()).join('\n')
    } catch {
      return ''
    }
  }

  const captureImage = () => {
    try {
      const { Screen, screenshotFull } = binding()
      return stripDataUrl(screenshotFull(false, Screen.fromCurrentMouseLocation()).base64())
    } catch {
      return ''
    }
  }

  // Text first, then the screenshot.
  const text = captureText()
  const imageBase64 = captureImage()
  return { text, imageBase64 }
}
|
|
422
|
+
|
|
423
|
+
/**
|
|
424
|
+
* Proxy for a desktop application window.
|
|
425
|
+
*
|
|
426
|
+
* NOTE: unified-ui's AppProxy (primitives-desktop) is backed by an
|
|
427
|
+
* AccessibilityTree + DesktopRefManager that dispatches UIA/AX patterns
|
|
428
|
+
* (Invoke/Toggle/Selection/ExpandCollapse/Value) for ref-based actions, plus an
|
|
429
|
+
* LLM ElementResolver for find(). That ref subsystem is not ported here, so the
|
|
430
|
+
* ref-driven action methods below throw; observation (snapshot/screenshot),
|
|
431
|
+
* focus, title and window-scoped grounding are implemented.
|
|
432
|
+
*/
|
|
433
|
+
class SaiApp {
  /**
   * @param {object} window - Native window handle this proxy wraps.
   * @param {object|null} [instance=null] - Launched app instance, when known.
   */
  constructor(window, instance = null) {
    this._window = window
    this._instance = instance
  }

  /**
   * Returns the window's snapshot with an empty `refs` map and the window title.
   * Window.snapshot() yields a plain aria-snapshot string (no refs); unified-ui's
   * navigable SnapshotValue with DesktopRefManager refs is not ported.
   */
  async snapshot() {
    const win = this._window
    const snapshot = win.snapshot()
    const title = win.title
    return { snapshot, refs: {}, title }
  }

  /** Captures the window (`screenshot(true)`) and returns its base64 without the data-URL prefix. */
  async screenshot() {
    const image = this._window.screenshot(true)
    return stripDataUrl(image.base64())
  }

  /** Not ported: always rejects with UnsupportedSaiPrimitiveError. */
  async click(_params) {
    const details = {
      reason: 'Ref-based clicks need the DesktopRefManager (UIA/AX pattern dispatch); not ported.',
      closestNativeApi: 'Window.click(x, y, button, direction)',
      category: 'ref-runtime-only',
    }
    unsupported('App.click', details)
  }

  /** Not ported: always rejects with UnsupportedSaiPrimitiveError. */
  async type(_params) {
    const details = {
      reason: 'Ref-based typing needs the DesktopRefManager (ValuePattern / focus); not ported.',
      closestNativeApi: 'KeyboardController.text(text)',
      category: 'ref-runtime-only',
    }
    unsupported('App.type', details)
  }

  /** Not ported: always rejects with UnsupportedSaiPrimitiveError. */
  async check() {
    const details = {
      reason: 'Needs DesktopRefManager TogglePattern; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.check', details)
  }

  /** Not ported: always rejects with UnsupportedSaiPrimitiveError. */
  async select() {
    const details = {
      reason: 'Needs DesktopRefManager SelectionItemPattern; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.select', details)
  }

  /** Not ported: always rejects with UnsupportedSaiPrimitiveError. */
  async scroll() {
    const details = {
      reason: 'Needs DesktopRefManager scrollIntoView; not ported.',
      closestNativeApi: 'Window.scroll(deltaX, deltaY)',
      category: 'ref-runtime-only',
    }
    unsupported('App.scroll', details)
  }

  /** Brings the wrapped window to the foreground via its `focus()`. */
  async focus() {
    this._window.focus()
  }

  /** Delegates to the module-level `press` primitive (not window-scoped). */
  async press(params) {
    press(params)
  }

  /** Not ported: throws UnsupportedSaiPrimitiveError synchronously. */
  selector() {
    const details = {
      reason: 'Needs DesktopRefManager ref metadata; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.selector', details)
  }

  /** Not ported: always rejects with UnsupportedSaiPrimitiveError. */
  async find() {
    const details = {
      reason: 'Needs DesktopRefManager + LLM ElementResolver; not ported.',
      closestNativeApi: 'Window.scoredSearch(...) or AccessibilityTree.find(...)',
      category: 'ref-runtime-only',
    }
    unsupported('App.find', details)
  }

  /** Not ported: always rejects with UnsupportedSaiPrimitiveError. */
  async waitFor() {
    const details = {
      reason: 'Polls find(), which needs the ref subsystem; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.waitFor', details)
  }

  /** Not ported: always rejects with UnsupportedSaiPrimitiveError. */
  async drag() {
    const details = { reason: 'Needs DesktopRefManager ref bounds; not ported.', category: 'ref-runtime-only' }
    unsupported('App.drag', details)
  }

  /** Returns the wrapped window's `title` property. */
  title() {
    const { title } = this._window
    return title
  }

  /**
   * Returns the wrapped window's `pid` property.
   * NOTE(review): the value comes from `pid`, so presumably this is a process
   * id rather than a per-window id — confirm against the native Window API.
   */
  windowId() {
    const { pid } = this._window
    return pid
  }

  /** Returns the wrapped window's `boundingBox()` result. */
  getWindowBounds() {
    const bounds = this._window.boundingBox()
    return bounds
  }

  /** Window-scoped grounding: forwards `(model, concept)` to the window's `ground`. */
  ground(model, concept) {
    const location = this._window.ground(model, concept)
    return location
  }
}
|
|
541
|
+
|
|
542
|
+
/**
 * Sai ref-based `launch`: opens an app by fuzzy name and wraps its first window.
 *
 * NOTE: unified-ui binds an AccessibilityTree.fromForeground() into an AppProxy
 * here. This binds the launched Instance's first window instead, because the
 * ref subsystem the AX-tree-backed AppProxy needs is not ported (see SaiApp).
 *
 * @param {string} appName - Name handed to `System.fuzzySearch`.
 * @returns {Promise<SaiApp>} Proxy bound to the first window of the new instance.
 * @throws {Error} If the launched instance reports no windows.
 */
async function launch(appName) {
  const native = binding()
  const match = native.System.fuzzySearch(appName)
  const instance = match.open(null, native.FocusPolicy.Steal, native.Visibility.Show, true)

  const [firstWindow] = instance.windows()
  if (firstWindow) {
    return new SaiApp(firstWindow, instance)
  }
  throw new Error(`No visible windows found after launching ${appName}`)
}
|
|
553
|
+
|
|
554
|
+
/**
 * Finds an open window by pid or by title and wraps it in a SaiApp.
 *
 * - A number is compared for equality against each window's `pid`.
 * - A string is matched case-insensitively as a substring of each window's
 *   `title`; windows whose title is null/undefined are treated as untitled
 *   rather than crashing the search.
 *
 * The first matching window (in `Window.all()` order) wins.
 *
 * @param {number|string} windowPidOrTitle - Pid, or part of the window title.
 * @returns {SaiApp} Proxy for the matched window.
 * @throws {TypeError} If the argument is neither a number nor a string.
 * @throws {Error} If no window matches.
 */
function getAppWindow(windowPidOrTitle) {
  const byPid = typeof windowPidOrTitle === 'number'
  if (!byPid && typeof windowPidOrTitle !== 'string') {
    throw new TypeError(`getAppWindow() expects a pid (number) or a title (string), received ${typeof windowPidOrTitle}`)
  }

  const { Window } = binding()
  const windows = Window.all()

  let window
  if (byPid) {
    window = windows.find((w) => w.pid === windowPidOrTitle)
  } else {
    // Lowercase the needle once instead of once per window.
    const needle = windowPidOrTitle.toLowerCase()
    window = windows.find((w) => String(w.title ?? '').toLowerCase().includes(needle))
  }

  if (!window) {
    throw new Error(`No window found matching "${windowPidOrTitle}". Use listAppWindows() to see available windows.`)
  }
  return new SaiApp(window)
}
|
|
566
|
+
|
|
567
|
+
/**
 * Lists all windows reported by the native `Window.all()`.
 *
 * @returns {Array<{id: number, title: string, pid: number}>} One entry per
 *   window; `id` and `pid` both carry the window's `pid`.
 */
function listAppWindows() {
  const { Window } = binding()
  const describe = (win) => ({ id: win.pid, title: win.title, pid: win.pid })
  return Window.all().map((win) => describe(win))
}
|
|
575
|
+
|
|
576
|
+
/**
 * Lists the applications reported by the native `System.listApps()`.
 *
 * @returns {Array<{name: string, target: string}>} `name` comes from
 *   `canonicalName` and `target` from `launchTarget`; each falls back to `''`
 *   when null or undefined.
 */
function listApps() {
  const { System } = binding()
  const describe = (entry) => {
    const name = entry.canonicalName ?? ''
    const target = entry.launchTarget ?? ''
    return { name, target }
  }
  return System.listApps().map((entry) => describe(entry))
}
|
|
583
|
+
|
|
584
|
+
// ── Files ──
|
|
585
|
+
|
|
586
|
+
/**
 * Sai `readFile` primitive: reads a file through the native binding.
 *
 * @param {{path: string}} params
 * @returns {*} Whatever the native `readFile(path)` returns.
 */
function readFile(params) {
  const native = binding()
  return native.readFile(params.path)
}
|
|
589
|
+
|
|
590
|
+
/**
 * Sai `writeToFile` primitive: writes text through the native binding.
 *
 * The third argument handed to the native `writeFile` is the append flag,
 * which is the logical negation of `overwrite` — so by default text is appended.
 *
 * @param {{text: string, path?: string, overwrite?: boolean}} params - `path`
 *   defaults to `'SimularActionResult.txt'`, `overwrite` to `false`.
 * @returns {*} Whatever the native `writeFile` returns.
 */
function writeToFile(params) {
  const { text, path = 'SimularActionResult.txt', overwrite = false } = params
  const append = !overwrite
  const native = binding()
  return native.writeFile(path, text, append)
}
|
|
595
|
+
|
|
596
|
+
// ── VLM-backed primitives (built on the native AskModel / grounding) ──
|
|
597
|
+
|
|
598
|
+
/**
 * Sai `ask` primitive: sends a persona-shaped prompt to the default AskModel.
 *
 * The prompt mirrors unified-ui's shaping: persona header (with the current
 * ISO timestamp), optional page content, then the task. An optional context
 * image is decoded with `Image.fromBase64` after its data-URL prefix is
 * removed by the shared `stripDataUrl` helper, so `ask` accepts the same image
 * types as the rest of this module instead of only png/jpeg.
 *
 * NOTE: unified-ui POSTs to the Simular cloud `/v1/chat/completions/ask`
 * endpoint (billing + server-managed model). Standalone simulang-js routes the
 * same shaped prompt through the locally configured AskModel provider instead.
 *
 * @param {{prompt: string, context?: {text?: string, imageBase64?: string}}} params
 * @returns {string} The model's answer, trimmed.
 */
function ask(params) {
  const { prompt, context } = params
  const { AskModel, Image } = binding()
  const currentDate = new Date().toISOString()

  let inputPrompt = `I am Sai, a computer use agent. Now is ${currentDate}. My birthday is November 1, 2024. User asks me questions and I need you to provide a response pretending you were me, without prefacing or meta-commentary.`

  const images = []
  if (context?.text) {
    inputPrompt += `\nPage content:\n${context.text}`
  }
  if (context?.imageBase64) {
    images.push(Image.fromBase64(stripDataUrl(context.imageBase64)))
  }
  inputPrompt += `\nTask: ${prompt}`

  // The model receives null rather than an empty list when there is no image.
  return AskModel.default()
    .ask(inputPrompt, null, images.length ? images : null)
    .trim()
}
|
|
622
|
+
|
|
623
|
+
/**
 * Sai `stateSatisfies` primitive: asks the perception model whether the
 * current page satisfies `condition`.
 *
 * Mirrors unified-ui: captures the current page (a11y text + screenshot) via
 * `pageContent()` and evaluates the condition with the default
 * StateSatisfiesModel.
 *
 * @param {{condition: string}} params
 * @returns {*} Whatever `StateSatisfiesModel.default().stateSatisfies` returns.
 * @throws {Error} `Failed to check state condition: <error>`; the original
 *   failure is preserved as `error.cause` so its stack is not lost.
 */
function stateSatisfies(params) {
  const { condition } = params
  try {
    const { StateSatisfiesModel } = binding()
    const { text, imageBase64 } = pageContent()
    return StateSatisfiesModel.default().stateSatisfies(condition, text, imageBase64)
  } catch (error) {
    throw new Error(`Failed to check state condition: ${error}`, { cause: error })
  }
}
|
|
636
|
+
|
|
637
|
+
/**
 * Sai `ConceptsExist` primitive: checks that every concept resolves to an element.
 *
 * Concepts are resolved one at a time via `getElementByConcept`; the check
 * stops at the first concept that fails to resolve.
 *
 * @param {{concepts: Iterable<string>}} params
 * @returns {boolean} True when all concepts resolve (including an empty list).
 */
function ConceptsExist(params) {
  const pending = params.concepts
  for (const name of pending) {
    const resolved = getElementByConcept(name)
    if (!resolved) {
      return false
    }
  }
  return true
}
|
|
647
|
+
|
|
648
|
+
// ── Google Sheets helpers (pure keyboard + clipboard, ported from unified-ui) ──
|
|
649
|
+
|
|
650
|
+
/**
 * Moves Google Sheets focus to `cell` using the "Go to range" dialog.
 *
 * Sends `j` with the cmd modifier to open the dialog, types the cell reference
 * followed by Return, and waits 60 ms after each step.
 *
 * @param {string} cell - Cell reference typed into the dialog.
 */
function setFocusToCell(cell) {
  const settle = () => wait({ waitTime: 60, unit: 'ms' })

  // Cmd/Ctrl+J opens the Google Sheets "Go to range" dialog.
  press({ key: 'j', cmd: true })
  settle()
  type({ text: cell, withReturn: true })
  settle()
}
|
|
657
|
+
|
|
658
|
+
/**
 * Reads a Google Sheets cell by focusing it and copying it through the clipboard.
 *
 * The user's previous clipboard content is saved before the copy and written
 * back in a `finally` block, so it is restored even when the copy or the
 * escape key press fails.
 *
 * @param {{cell: string}} params - Cell reference to read.
 * @returns {string} The copied cell text with trailing whitespace removed
 *   (Google Sheets appends newlines for empty cells).
 */
function getGoogleSheetCellValue(params) {
  setFocusToCell(params.cell)
  const previousClipboardValue = getFromClipboard()
  try {
    wait({ waitTime: 30, unit: 'ms' })
    press({ key: 'c', cmd: true })
    wait({ waitTime: 60, unit: 'ms' })
    // Google Sheets appends newlines for empty cells; trim trailing whitespace.
    const cellValue = getFromClipboard().replace(/\s+$/g, '')
    press({ key: 'escape' })
    return cellValue
  } finally {
    copyToClipboard({ text: previousClipboardValue })
  }
}
|
|
670
|
+
|
|
671
|
+
/**
 * Writes a value into a Google Sheets cell using only the keyboard.
 *
 * Focuses the cell, presses `delete`, waits 60 ms, then types the new value
 * followed by Return.
 *
 * @param {{cell: string, value: string}} params
 */
function setGoogleSheetCellValue(params) {
  const targetCell = params.cell
  setFocusToCell(targetCell)

  press({ key: 'delete' })
  wait({ waitTime: 60, unit: 'ms' })

  const newValue = params.value
  type({ text: newValue, withReturn: true })
}
|
|
677
|
+
|
|
678
|
+
// ── Unsupported Sai runtime/product primitives (no standalone equivalent) ──
|
|
679
|
+
|
|
680
|
+
// Shared `details` payloads for primitives that cannot exist in standalone simulang-js.
// Product-runtime primitives need the Sai application itself.
const unsupportedProduct = { category: 'product-runtime-only' }
// Cloud primitives need a Simular cloud endpoint.
const unsupportedCloud = { category: 'cloud-dependent' }
// Browser primitives share one reason and inherit the product-runtime category.
const unsupportedBrowser = Object.assign(
  { reason: 'Sai browser automation is backed by the Sai runtime Playwright/CDP layer, not simulang-js.' },
  unsupportedProduct,
)
|
|
686
|
+
|
|
687
|
+
// Every browser.* method is an async stub that rejects with
// UnsupportedSaiPrimitiveError named `browser.<method>`.
const browser = Object.fromEntries(
  ['newtab', 'getTab', 'listTabs', 'closeTab', 'disconnect', 'close'].map((method) => [
    method,
    unsupportedAsyncFunction(`browser.${method}`, unsupportedBrowser),
  ]),
)
|
|
695
|
+
|
|
696
|
+
module.exports = {
|
|
697
|
+
SaiApp,
|
|
698
|
+
UnsupportedSaiPrimitiveError,
|
|
699
|
+
ask,
|
|
700
|
+
// unified-ui browser.* is the Sai runtime Playwright/CDP layer, not simulang-js.
|
|
701
|
+
browser,
|
|
702
|
+
click,
|
|
703
|
+
ConceptsExist,
|
|
704
|
+
copyToClipboard,
|
|
705
|
+
drag,
|
|
706
|
+
// unified-ui exec runs through the exec security manager + approval UI.
|
|
707
|
+
exec: unsupportedAsyncFunction('exec', {
|
|
708
|
+
reason: 'Needs the Sai exec security manager and approval UI.',
|
|
709
|
+
...unsupportedProduct,
|
|
710
|
+
}),
|
|
711
|
+
// unified-ui generateImage POSTs to the cloud `/v1/image-gen` service; AskModel is chat-only.
|
|
712
|
+
generateImage: unsupportedAsyncFunction('generateImage', {
|
|
713
|
+
reason: 'simulang-js has no image-generation provider (AskModel is chat-completions only).',
|
|
714
|
+
...unsupportedCloud,
|
|
715
|
+
}),
|
|
716
|
+
getAppWindow,
|
|
717
|
+
getFromClipboard,
|
|
718
|
+
getGoogleSheetCellValue,
|
|
719
|
+
github: unsupportedNamespace('github', unsupportedProduct),
|
|
720
|
+
google: unsupportedNamespace('google', unsupportedProduct),
|
|
721
|
+
ground,
|
|
722
|
+
launch,
|
|
723
|
+
listApps,
|
|
724
|
+
listAppWindows,
|
|
725
|
+
move,
|
|
726
|
+
open,
|
|
727
|
+
pageContent,
|
|
728
|
+
press,
|
|
729
|
+
readFile,
|
|
730
|
+
// unified-ui requestApproval pauses execution for an approval/user-input UI.
|
|
731
|
+
requestApproval: unsupportedAsyncFunction('requestApproval', {
|
|
732
|
+
reason: 'Needs the Sai approval / user-input UI and execution pause.',
|
|
733
|
+
...unsupportedProduct,
|
|
734
|
+
}),
|
|
735
|
+
// unified-ui respond shows a message (and optional confirm) in the Sai UI.
|
|
736
|
+
respond: unsupportedFunction('respond', {
|
|
737
|
+
reason: 'Needs the Sai message / confirmation UI (showMessageWithChoices).',
|
|
738
|
+
...unsupportedProduct,
|
|
739
|
+
}),
|
|
740
|
+
sai: unsupportedNamespace('sai', unsupportedProduct),
|
|
741
|
+
scroll,
|
|
742
|
+
setGoogleSheetCellValue,
|
|
743
|
+
shortCut,
|
|
744
|
+
slack: unsupportedNamespace('slack', unsupportedProduct),
|
|
745
|
+
stateSatisfies,
|
|
746
|
+
type,
|
|
747
|
+
wait,
|
|
748
|
+
writeToFile,
|
|
749
|
+
}
|