@simular-ai/simulang-js 8.0.0 → 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/sai.js ADDED
@@ -0,0 +1,749 @@
1
+ // Sai / Unified UI compatibility syntax over @simular-ai/simulang-js.
2
+ //
3
+ // Implementations mirror simular-pro-unified-ui's primitive bodies as closely
4
+ // as possible:
5
+ // - free desktop primitives -> app/src/main/primitives/primitives-core/index.ts
6
+ // - ref-based launch/getApp -> app/src/main/primitives/primitives-desktop/index.ts
7
+ //
8
+ // Where unified-ui branches on a capability that has no standalone simulang-js
9
+ // equivalent (cloud endpoints, the LLM accessibility element resolver, the Sai
10
+ // product runtime), the unimplementable branch is left as a comment marked
11
+ // MISSING / NOTE so the gap is visible, and the implementable branch is kept.
12
+
13
// Cached native binding. Starts from globalThis.__SIMULANG_JS_SAI_NATIVE__ when
// that global is set, otherwise stays null until binding() loads it.
let nativeBinding = globalThis.__SIMULANG_JS_SAI_NATIVE__ ?? null

/**
 * Returns the native binding, requiring './wrapped.js' the first time it is
 * needed if nothing was cached yet.
 */
function binding() {
  if (nativeBinding) return nativeBinding
  nativeBinding = require('./wrapped.js')
  return nativeBinding
}
19
+
20
// unified-ui constructs one controller per process at module load. We do the
// same but lazily, so importing this module never forces the native binding to
// load before a supported primitive is actually called.
let mouseController = null
let keyboardController = null
let clipboardController = null

/** Returns the shared MouseController, creating it on first use. */
function mouse() {
  const native = binding()
  if (mouseController == null) {
    mouseController = new native.MouseController()
  }
  return mouseController
}

/** Returns the shared KeyboardController, creating it on first use. */
function keyboard() {
  const native = binding()
  if (keyboardController == null) {
    keyboardController = new native.KeyboardController()
  }
  return keyboardController
}

/** Returns the shared Clipboard, creating it on first use. */
function clipboard() {
  const native = binding()
  if (clipboardController == null) {
    clipboardController = new native.Clipboard()
  }
  return clipboardController
}
38
+
39
// Grounding mode used when a concept-based primitive is called without `mode`.
// unified-ui defaults grounding to 'vision' (BaseSimularPrimitives.defaultGroundingMode).
const DEFAULT_GROUNDING_MODE = 'vision'
41
+
42
/**
 * Error raised when a Sai primitive has no standalone simulang-js implementation.
 *
 * The message is assembled from the primitive name, a reason (a default one is
 * used when none is given) and, when provided, the closest native API and a
 * category. The same pieces are exposed as properties on the instance.
 */
class UnsupportedSaiPrimitiveError extends Error {
  /**
   * @param {string} primitive - name of the unsupported primitive
   * @param {{reason?: string, closestNativeApi?: string, category?: string}} [details]
   */
  constructor(primitive, details = {}) {
    const { closestNativeApi, category } = details
    const reason = details.reason ?? 'This Sai primitive is not available in standalone simulang-js.'

    const parts = [`${primitive} is unsupported. ${reason}`]
    if (closestNativeApi) parts.push(` Closest native API: ${closestNativeApi}.`)
    if (category) parts.push(` Category: ${category}.`)
    super(parts.join(''))

    this.name = 'UnsupportedSaiPrimitiveError'
    this.primitive = primitive
    this.reason = reason
    this.closestNativeApi = closestNativeApi
    this.category = category
  }
}
55
+
56
/**
 * Always throws an UnsupportedSaiPrimitiveError for `primitive`.
 * `details` is forwarded unchanged to the error constructor.
 */
function unsupported(primitive, details) {
  const error = new UnsupportedSaiPrimitiveError(primitive, details)
  throw error
}
59
+
60
/**
 * Builds a synchronous stand-in for an unsupported primitive: the returned
 * function ignores its arguments and throws via unsupported().
 */
function unsupportedFunction(primitive, details) {
  const stub = function unsupportedSaiPrimitive() {
    unsupported(primitive, details)
  }
  return stub
}
65
+
66
/**
 * Builds an async stand-in for an unsupported primitive: calling the returned
 * function yields a promise that rejects with the error thrown by unsupported().
 */
function unsupportedAsyncFunction(primitive, details) {
  const stub = async function unsupportedSaiPrimitive() {
    unsupported(primitive, details)
  }
  return stub
}
71
+
72
/**
 * Builds a placeholder for an unsupported namespace (e.g. `github`, `slack`).
 *
 * Reading any property, calling the namespace, or constructing it throws via
 * unsupported(). The only readable property is Symbol.toStringTag, which
 * reports 'UnsupportedSaiNamespace'.
 *
 * @param {string} primitive - namespace name used in the error message
 * @param {object} [details] - forwarded to unsupported()
 */
function unsupportedNamespace(primitive, details) {
  // The proxy target must itself be a function: the `apply` and `construct`
  // traps are only consulted for callable/constructible targets. With a plain
  // object target, calling the namespace raised a generic "is not a function"
  // TypeError and the apply trap was never reached.
  const target = function unsupportedSaiNamespace() {}
  return new Proxy(target, {
    get(_target, prop) {
      if (prop === Symbol.toStringTag) return 'UnsupportedSaiNamespace'
      return unsupported(`${primitive}.${String(prop)}`, details)
    },
    apply() {
      return unsupported(primitive, details)
    },
    construct() {
      return unsupported(primitive, details)
    },
  })
}
86
+
87
/**
 * Removes a leading `data:image/<subtype>;base64,` prefix and returns the raw
 * base64 payload. Strings without such a prefix are returned unchanged.
 *
 * The subtype match accepts letters, digits, '.', '+', '-' and '_' and is
 * case-insensitive, so subtypes such as `svg+xml` or `x-icon` are handled as
 * well as `png` / `jpeg`.
 *
 * @param {string} dataUrl
 * @returns {string}
 */
function stripDataUrl(dataUrl) {
  return dataUrl.replace(/^data:image\/[\w.+-]+;base64,/i, '')
}
90
+
91
/**
 * Blocks the current thread for roughly `ms` milliseconds using Atomics.wait
 * on a throwaway shared buffer. Negative durations are clamped to 0 and
 * fractional ones are rounded.
 *
 * @param {number} ms - duration in milliseconds
 * @throws {TypeError} when `ms` does not convert to a finite number
 */
function sleep(ms) {
  const duration = Number(ms)
  // Atomics.wait treats a NaN timeout as Infinity, so an undefined or NaN
  // duration would block this thread forever. Fail loudly instead.
  if (!Number.isFinite(duration)) {
    throw new TypeError(`sleep() requires a finite duration in milliseconds, got ${String(ms)}`)
  }
  const cell = new Int32Array(new SharedArrayBuffer(4))
  Atomics.wait(cell, 0, 0, Math.max(0, Math.round(duration)))
}
94
+
95
/** True when process.platform reports 'win32'. */
function isWindows() {
  const { platform } = process
  return platform === 'win32'
}
98
+
99
+ // ── Timing ──
100
+
101
/**
 * Blocks for the requested amount of time.
 *
 * @param {object} params
 * @param {number} params.waitTime - amount to wait
 * @param {string} [params.unit='s'] - 'ms' means milliseconds; any other value
 *   is treated as seconds
 * @throws {TypeError} when waitTime is missing or not a finite number
 */
function wait(params = {}) {
  const { unit = 's', waitTime } = params
  const amount = Number(waitTime)
  // A missing waitTime used to become NaN, which sleep() hands to Atomics.wait
  // as an infinite timeout. Reject it up front instead of hanging.
  if (!Number.isFinite(amount)) {
    throw new TypeError(`wait() requires a finite waitTime, got ${String(waitTime)}`)
  }
  sleep(unit === 'ms' ? amount : amount * 1000)
}
105
+
106
+ // ── Clipboard ──
107
+
108
/** Stores `params.text` on the shared clipboard via setString. */
function copyToClipboard(params) {
  const board = clipboard()
  board.setString(params.text)
}
111
+
112
/** Reads the clipboard string; a null/undefined result is reported as ''. */
function getFromClipboard() {
  const contents = clipboard().getString()
  return contents ?? ''
}
115
+
116
+ // ── Keyboard ──
117
+
118
/**
 * Clicks a single key, optionally while holding modifier keys.
 *
 * Modifiers are pressed in the order Meta, Shift, Alt, Control and released in
 * reverse order. Release happens in a `finally`, so a failure while clicking
 * the key (or while pressing a later modifier) does not leave earlier
 * modifiers held down.
 *
 * @param {object} params
 * @param {string} params.key - key name resolved through the native keyFromString
 * @param {boolean} [params.cmd=false] - hold Key.Meta
 * @param {boolean} [params.shift=false] - hold Key.Shift
 * @param {boolean} [params.option=false] - hold Key.Alt
 * @param {boolean} [params.alt=false] - hold Key.Alt (same as option)
 * @param {boolean} [params.ctrl=false] - hold Key.Control
 * @throws {Error} when no key is provided
 */
function press(params) {
  const { key, cmd = false, shift = false, option = false, alt = false, ctrl = false } = params
  if (!key) {
    throw new Error('No key argument was provided for press()')
  }
  const { keyFromString, Key, Direction } = binding()
  // keyFromString throws for invalid values (matches unified-ui). It runs
  // before any modifier is touched, so a bad key never leaves one pressed.
  const keyToPress = keyFromString(key)
  const kb = keyboard()

  const requested = [
    [cmd, Key.Meta],
    [shift, Key.Shift],
    [option || alt, Key.Alt],
    [ctrl, Key.Control],
  ]
  // Only modifiers that were actually pressed are released again.
  const held = []
  try {
    for (const [enabled, modifier] of requested) {
      if (!enabled) continue
      kb.key(modifier, Direction.Press)
      held.push(modifier)
    }
    kb.key(keyToPress, Direction.Click)
  } finally {
    while (held.length > 0) {
      kb.key(held.pop(), Direction.Release)
    }
  }
}
140
+
141
/**
 * Presses a key combination and then optionally pauses.
 * Everything in `params` except `waitTime` is handed to press(); wait() is
 * only called when `waitTime` is greater than 0.
 */
function shortCut(params) {
  const { waitTime = 0, ...keyCombo } = params
  press(keyCombo)
  if (!(waitTime > 0)) return
  wait({ waitTime })
}
146
+
147
/**
 * Types text into whatever currently has keyboard focus.
 *
 * Non-string text (for example a numeric spreadsheet value) is converted with
 * String() so the length check and the native calls always receive a string.
 *
 * @param {object} params
 * @param {string|number|boolean} params.text - text to enter
 * @param {boolean} [params.withReturn] - click Key.Return afterwards
 * @throws {TypeError} when params.text is null or undefined
 */
function type(params) {
  const { withReturn } = params
  if (params.text == null) {
    throw new TypeError('No text argument was provided for type()')
  }
  const text = typeof params.text === 'string' ? params.text : String(params.text)
  const { Key, Direction } = binding()
  // Typing long text via clipboard is much faster than char-by-char; short text
  // goes through the keyboard so paste-blocking fields (passwords) still work.
  if (text.length > 40) {
    clipboard().pasteText(text)
  } else {
    keyboard().text(text)
  }
  wait({ waitTime: 0.5 })
  if (withReturn) {
    keyboard().key(Key.Return, Direction.Click)
  }
}
162
+
163
+ // ── Grounding (concept -> coordinates) ──
164
+
165
// Vision grounding: screenshot + VLM -> global desktop coordinates.
// Mirrors unified-ui getLocationByConceptFromVisualGrounding, which branches on
// whether a crop region is supplied. The native Screenshot.ground performs the
// VLM call and coordinate mapping that unified-ui does by hand against the cloud.
/**
 * @param {string} concept - description of the thing to locate
 * @param {{left: number, top: number, right: number, bottom: number}} [cropBounds]
 *   region to crop the screenshot to; ignored unless it has positive width and height
 * @returns {{x: number, y: number}} the first two values returned by screenshot.ground()
 * @throws {Error} a generic error whose `cause` is the underlying failure
 */
function getLocationByConceptFromVisualGrounding(concept, cropBounds) {
  try {
    const { GroundingModel, Screen, screenshotCropped, screenshotFull } = binding()
    // Model selection differs from unified-ui: it picks a specific cloud-configured
    // production model via resolveProductionGroundingModel(this.groundingModel),
    // whereas standalone simulang-js uses whatever the local provider config
    // advertises as the default.
    const model = GroundingModel.default()

    // Missing bounds count as a zero-sized region, which (like any degenerate
    // or inverted rectangle) falls back to a full screenshot.
    const width = cropBounds ? cropBounds.right - cropBounds.left : 0
    const height = cropBounds ? cropBounds.bottom - cropBounds.top : 0
    const screenshot =
      width > 0 && height > 0
        ? screenshotCropped(cropBounds.left, cropBounds.top, width, height, true)
        : screenshotFull(false, Screen.fromCurrentMouseLocation())

    const [x, y] = screenshot.ground(model, concept)
    return { x, y }
  } catch (error) {
    // unified-ui logs the provider details internally and throws this generic
    // primitive-level error, so callers don't branch on provider-specific text.
    // Nothing is logged here, so the original failure is kept on `cause`
    // rather than being discarded.
    throw new Error('Failed to get location by concept from visual grounding', { cause: error })
  }
}
199
+
200
// Mirrors unified-ui getElementByConcept across platforms. Returns an
// AutomationElement (same shape unified-ui produces) or null when nothing resolves.
// - Windows resolves via an exact `overallDescription` match in the active
//   window's AX tree (uia.findElementByOverallDescription, a deterministic
//   string-equality walk — NOT an LLM). The cross-platform native equivalent
//   is AccessibilityTree.findByDescription(concept).
// - macOS/Linux in unified-ui resolve via vision grounding wrapped in a
//   synthetic element; we keep that behavior even though standalone
//   simulang-js can also read accessibility trees on those platforms.
// Any error on either path is swallowed and reported as null.
function getElementByConcept(concept) {
  if (isWindows()) {
    try {
      const { AccessibilityTree } = binding()
      const tree = AccessibilityTree.fromForeground()
      const [firstMatch] = tree.findByDescription(concept)
      if (!firstMatch) return null
      return { ...firstMatch, processId: 0, boundingRect: firstMatch.boundingBox, children: [] }
    } catch {
      return null
    }
  }

  try {
    const { x, y } = getLocationByConceptFromVisualGrounding(concept)
    // Synthetic 10x10 element around the grounded point — mirrors unified-ui macOS/Linux.
    const halfSize = 5
    const boundingRect = {
      left: x - halfSize,
      top: y - halfSize,
      right: x + halfSize,
      bottom: y + halfSize,
    }
    const synthetic = {
      name: concept,
      className: '',
      controlType: 0,
      localizedControlType: 'visual-grounding',
      description: `Visual grounding result for: ${concept}`,
      overallDescription: concept,
      helpText: '',
      value: '',
      automationId: '',
      processId: 0,
      isEnabled: true,
      boundingRect,
      children: [],
    }
    return synthetic
  } catch {
    return null
  }
}
251
+
252
// Mirrors unified-ui getLocationByConcept(concept, mode).
//
// mode === 'vision' grounds directly and skips accessibility. Any other mode is
// accessibility-first: exact AX overallDescription match via
// AccessibilityTree.findByDescription (the standalone equivalent of Windows'
// findElementByOverallDescription); on macOS/Linux this naturally falls back to
// vision inside getElementByConcept. When no element resolves, the final
// vision fallback matches unified-ui. A resolved element yields the centre of
// its boundingRect.
function getLocationByConcept(concept, mode) {
  const viaVision = () => getLocationByConceptFromVisualGrounding(concept)

  if (mode === 'vision') return viaVision()

  const element = getElementByConcept(concept)
  if (!element) return viaVision()

  const rect = element.boundingRect
  return {
    x: (rect.left + rect.right) / 2,
    y: (rect.top + rect.bottom) / 2,
  }
}
270
+
271
// Sai ref-based ground({ concept, app? }). Mirrors unified-ui ground(concept, appBounds):
// when an app is supplied, crop to the window bounds for less noise / better accuracy.
// The bounds are only requested when `app` exposes a getWindowBounds function;
// otherwise grounding runs without a crop region.
async function ground(params) {
  const { app } = params
  let cropBounds
  if (app && typeof app.getWindowBounds === 'function') {
    cropBounds = app.getWindowBounds()
  }
  return getLocationByConceptFromVisualGrounding(params.concept, cropBounds)
}
278
+
279
+ // ── Mouse ──
280
+
281
/**
 * Clicks either a grounded concept or an absolute coordinate.
 *
 * Concept form (`'concept' in params`): optionally holds Meta, moves to the
 * concept via move(params), performs the click selected by `clickType`
 * ('left', 'right' or 'doubleClick'; other values move without clicking),
 * releases Meta and waits 0.5s. Meta is released in a `finally`, so a failed
 * move or click does not leave the key held down.
 *
 * Coordinate form: moves to (params.x, params.y) and left-clicks.
 *
 * @param {object} params
 */
function click(params) {
  // Concept-based click (legacy Sai). Mirrors BaseSimularPrimitives.click.
  if ('concept' in params) {
    const { clickType = 'left', withCommand = false } = params
    const { Button, Direction, Key } = binding()
    const kb = keyboard()

    if (withCommand) kb.key(Key.Meta, Direction.Press) // Meta == Cmd
    try {
      move(params) // concept move to the location
      const mc = mouse()
      if (clickType === 'left') {
        mc.button(Button.Left, Direction.Click)
      } else if (clickType === 'right') {
        mc.button(Button.Right, Direction.Click)
      } else if (clickType === 'doubleClick') {
        mc.button(Button.Left, Direction.Click)
        mc.button(Button.Left, Direction.Click)
      }
    } finally {
      // move() throws when grounding fails; without this the modifier would
      // stay pressed for everything the user does afterwards.
      if (withCommand) kb.key(Key.Meta, Direction.Release)
    }
    wait({ waitTime: 0.5 })
    return
  }

  // Coordinate click. Mirrors unified-ui clickAt(): move + left click only
  // (the Sai coordinate overload carries no clickType / modifiers).
  const { Button, Coordinate, Direction } = binding()
  const mc = mouse()
  mc.moveMouse(params.x, params.y, Coordinate.Abs)
  mc.button(Button.Left, Direction.Click)
}
311
+
312
/**
 * Moves the pointer to an absolute coordinate or to a grounded concept.
 *
 * Coordinate form mirrors unified-ui moveTo(): a single absolute move to
 * (params.x, params.y). Concept form (`'concept' in params`, legacy Sai,
 * mirrors BaseSimularPrimitives.move) resolves the location through
 * getLocationByConcept using `params.mode` (DEFAULT_GROUNDING_MODE when the
 * mode is undefined), moves there and then waits 0.5s.
 */
function move(params) {
  if (!('concept' in params)) {
    const native = binding()
    mouse().moveMouse(params.x, params.y, native.Coordinate.Abs)
    return
  }

  const mode = params.mode === undefined ? DEFAULT_GROUNDING_MODE : params.mode
  const { x, y } = getLocationByConcept(params.concept, mode)
  const native = binding()
  mouse().moveMouse(x, y, native.Coordinate.Abs)
  wait({ waitTime: 0.5 })
}
327
+
328
// Smooth, interpolated drag — ported from unified-ui smoothDrag so apps detect
// continuous movement (drop zones need real intermediate moves + dwell).
/**
 * Drags with the left button from (fromX, fromY) to (toX, toY).
 *
 * Once the button is down it is always released again, even if one of the
 * intermediate moves throws, so a failed drag does not leave the button held.
 *
 * @param {number} fromX
 * @param {number} fromY
 * @param {number} toX
 * @param {number} toY
 */
function smoothDrag(fromX, fromY, toX, toY) {
  const { Button, Coordinate, Direction } = binding()
  const mc = mouse()

  mc.moveMouse(fromX, fromY, Coordinate.Abs)
  sleep(200)
  mc.button(Button.Left, Direction.Press)
  try {
    sleep(100)
    // Initial jiggle — exceed the Windows drag threshold (SM_CXDRAG = 4px).
    mc.moveMouse(fromX + 5, fromY + 5, Coordinate.Abs)
    sleep(50)

    const dx = toX - fromX
    const dy = toY - fromY
    const distance = Math.sqrt(dx * dx + dy * dy)
    // One step per 20 units of distance, but never fewer than 5.
    const steps = Math.max(5, Math.round(distance / 20))
    for (let i = 1; i <= steps; i++) {
      const t = i / steps
      mc.moveMouse(Math.round(fromX + dx * t), Math.round(fromY + dy * t), Coordinate.Abs)
      sleep(15)
    }

    sleep(500) // hover over target so the drop zone activates
  } finally {
    mc.button(Button.Left, Direction.Release)
  }
}
355
+
356
/** Drags from (fromX, fromY) to (toX, toY) by delegating to smoothDrag. */
function drag(params) {
  const startX = params.fromX
  const startY = params.fromY
  const endX = params.toX
  const endY = params.toY
  smoothDrag(startX, startY, endX, endY)
}
359
+
360
/**
 * Scrolls in one of four directions and then waits 0.15s.
 *
 * @param {object} [params]
 * @param {string} [params.direction='down'] - 'up', 'down', 'left' or 'right' (case-insensitive)
 * @param {number} [params.distance=200] - amount handed to the mouse controller's scroll()
 * @throws {Error} for any other direction
 */
function scroll(params = {}) {
  const { direction = 'down', distance = 200 } = params
  const mc = mouse()

  // Direction name -> (first, second) arguments for mc.scroll.
  const deltaFor = new Map([
    ['up', (amount) => [0, -amount]],
    ['down', (amount) => [0, amount]],
    ['left', (amount) => [-amount, 0]],
    ['right', (amount) => [amount, 0]],
  ])

  const toDelta = deltaFor.get(direction.toLowerCase())
  if (!toDelta) {
    throw new Error(`Invalid scroll direction: ${direction}`)
  }
  const [first, second] = toDelta(distance)
  mc.scroll(first, second)
  wait({ waitTime: 0.15 })
}
381
+
382
+ // ── App / window ──
383
+
384
/**
 * Opens `params.app` (optionally with `params.url`) through the native
 * legacyOpen, always using FocusPolicy.Steal and Visibility.Show.
 */
function open(params) {
  const appName = params.app
  const link = params.url
  const { legacyOpen, FocusPolicy: focusPolicy, Visibility: visibility } = binding()
  // TODO (unified-ui parity): allow caller to specify focus policy and visibility.
  legacyOpen(appName, link, focusPolicy.Steal, visibility.Show)
}
390
+
391
/**
 * Flattens an accessibility node and its descendants into text lines, one per
 * node in depth-first order: `<indent>- <role>` followed by a quoted label
 * built from the node's name and value when either is truthy.
 *
 * @param {object} node - node with `role`, and optional `name`, `value`, `children`
 * @param {number} [depth=0] - nesting level, used for the indent
 * @param {string[]} [lines=[]] - accumulator; appended to and returned
 * @returns {string[]}
 */
function serializeNode(node, depth = 0, lines = []) {
  const { ariaRoleToString } = binding()
  const pad = ' '.repeat(depth)

  // Numeric roles go through the native lookup; anything else is stringified.
  let roleName
  if (typeof node.role === 'number') {
    roleName = ariaRoleToString(node.role)
  } else {
    roleName = String(node.role)
  }

  const label = [node.name, node.value].filter(Boolean).join(' ')
  const suffix = label ? ` "${label}"` : ''
  lines.push(`${pad}- ${roleName}${suffix}`)

  const kids = node.children ?? []
  for (const kid of kids) {
    serializeNode(kid, depth + 1, lines)
  }
  return lines
}
402
+
403
// unified-ui pageContent is abstract and platform-specific; this is the
// cross-platform approximation: foreground AX text + a full-screen screenshot.
// Each half is best-effort: if it throws, that field is returned as ''.
function pageContent() {
  const content = { text: '', imageBase64: '' }

  try {
    const { AccessibilityTree } = binding()
    const root = AccessibilityTree.fromForeground().snapshot()
    content.text = serializeNode(root).join('\n')
  } catch {
    content.text = ''
  }

  try {
    const { Screen, screenshotFull } = binding()
    const capture = screenshotFull(false, Screen.fromCurrentMouseLocation())
    content.imageBase64 = stripDataUrl(capture.base64())
  } catch {
    content.imageBase64 = ''
  }

  return content
}
422
+
423
/**
 * Proxy for a desktop application window.
 *
 * NOTE: unified-ui's AppProxy (primitives-desktop) is backed by an
 * AccessibilityTree + DesktopRefManager that dispatches UIA/AX patterns
 * (Invoke/Toggle/Selection/ExpandCollapse/Value) for ref-based actions, plus an
 * LLM ElementResolver for find(). That ref subsystem is not ported here, so the
 * ref-driven action methods below throw; observation (snapshot/screenshot),
 * focus, title and window-scoped grounding are implemented.
 */
class SaiApp {
  /**
   * @param window - native window object this proxy delegates to
   * @param instance - optional application instance kept alongside the window
   */
  constructor(window, instance = null) {
    this._window = window
    this._instance = instance
  }

  /** Resolves to `{ snapshot, refs, title }`; `refs` is always empty. */
  async snapshot() {
    // Window.snapshot() returns a plain aria-snapshot string (no refs). unified-ui
    // returns a navigable SnapshotValue with refs from DesktopRefManager — not ported.
    const win = this._window
    const snapshot = win.snapshot()
    return { snapshot, refs: {}, title: win.title }
  }

  /** Resolves to the window screenshot as base64 with any data-URL prefix removed. */
  async screenshot() {
    const capture = this._window.screenshot(true)
    return stripDataUrl(capture.base64())
  }

  /** Not ported: always rejects. */
  async click(_params) {
    const details = {
      reason: 'Ref-based clicks need the DesktopRefManager (UIA/AX pattern dispatch); not ported.',
      closestNativeApi: 'Window.click(x, y, button, direction)',
      category: 'ref-runtime-only',
    }
    unsupported('App.click', details)
  }

  /** Not ported: always rejects. */
  async type(_params) {
    const details = {
      reason: 'Ref-based typing needs the DesktopRefManager (ValuePattern / focus); not ported.',
      closestNativeApi: 'KeyboardController.text(text)',
      category: 'ref-runtime-only',
    }
    unsupported('App.type', details)
  }

  /** Not ported: always rejects. */
  async check() {
    const details = {
      reason: 'Needs DesktopRefManager TogglePattern; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.check', details)
  }

  /** Not ported: always rejects. */
  async select() {
    const details = {
      reason: 'Needs DesktopRefManager SelectionItemPattern; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.select', details)
  }

  /** Not ported: always rejects. */
  async scroll() {
    const details = {
      reason: 'Needs DesktopRefManager scrollIntoView; not ported.',
      closestNativeApi: 'Window.scroll(deltaX, deltaY)',
      category: 'ref-runtime-only',
    }
    unsupported('App.scroll', details)
  }

  /** Focuses the underlying window. */
  async focus() {
    const win = this._window
    win.focus()
  }

  /** Forwards to the module-level press(); the key press is not scoped to this window here. */
  async press(params) {
    press(params)
  }

  /** Not ported: always throws (synchronously). */
  selector() {
    const details = {
      reason: 'Needs DesktopRefManager ref metadata; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.selector', details)
  }

  /** Not ported: always rejects. */
  async find() {
    const details = {
      reason: 'Needs DesktopRefManager + LLM ElementResolver; not ported.',
      closestNativeApi: 'Window.scoredSearch(...) or AccessibilityTree.find(...)',
      category: 'ref-runtime-only',
    }
    unsupported('App.find', details)
  }

  /** Not ported: always rejects. */
  async waitFor() {
    const details = {
      reason: 'Polls find(), which needs the ref subsystem; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.waitFor', details)
  }

  /** Not ported: always rejects. */
  async drag() {
    const details = {
      reason: 'Needs DesktopRefManager ref bounds; not ported.',
      category: 'ref-runtime-only',
    }
    unsupported('App.drag', details)
  }

  /** Returns the window's `title` property. */
  title() {
    const { title } = this._window
    return title
  }

  /** Returns the window's `pid` property, which this class uses as the window id. */
  windowId() {
    const { pid } = this._window
    return pid
  }

  /** Returns whatever the window's boundingBox() reports. */
  getWindowBounds() {
    const win = this._window
    return win.boundingBox()
  }

  /** Window-scoped grounding: delegates to the window's ground(model, concept). */
  ground(model, concept) {
    const win = this._window
    return win.ground(model, concept)
  }
}
541
+
542
/**
 * Launches an application by name and wraps its first window in a SaiApp.
 *
 * @param {string} appName - name handed to System.fuzzySearch
 * @returns {Promise<SaiApp>}
 * @throws {Error} when no application matches, or when the launched instance
 *   reports no windows
 */
async function launch(appName) {
  const { FocusPolicy, System, Visibility } = binding()
  const app = System.fuzzySearch(appName)
  // Guard against a search that yields nothing, so the caller gets a clear
  // message instead of a TypeError from calling open() on a missing result.
  if (!app) throw new Error(`No application found matching "${appName}"`)
  // NOTE: unified-ui binds an AccessibilityTree.fromForeground() into an AppProxy
  // here. We bind the launched Instance's first window instead (the ref subsystem
  // that the AX-tree-backed AppProxy needs is not ported — see SaiApp).
  const instance = app.open(null, FocusPolicy.Steal, Visibility.Show, true)
  const [window] = instance.windows()
  if (!window) throw new Error(`No visible windows found after launching ${appName}`)
  return new SaiApp(window, instance)
}
553
+
554
/**
 * Finds an open window and wraps it in a SaiApp.
 *
 * A number is matched against each window's `pid`; a string is matched as a
 * case-insensitive substring of each window's title. The first match wins.
 *
 * @param {number|string} windowPidOrTitle
 * @returns {SaiApp}
 * @throws {TypeError} when the argument is neither a number nor a string
 * @throws {Error} when no window matches
 */
function getAppWindow(windowPidOrTitle) {
  const byPid = typeof windowPidOrTitle === 'number'
  if (!byPid && typeof windowPidOrTitle !== 'string') {
    throw new TypeError('getAppWindow() expects a numeric pid or a window title string')
  }

  const { Window } = binding()
  const windows = Window.all()

  let window
  if (byPid) {
    window = windows.find((w) => w.pid === windowPidOrTitle)
  } else {
    // Lowercase the query once rather than once per window; windows without a
    // title are treated as having an empty one instead of crashing the search.
    const needle = windowPidOrTitle.toLowerCase()
    window = windows.find((w) => String(w.title ?? '').toLowerCase().includes(needle))
  }

  if (!window) {
    throw new Error(`No window found matching "${windowPidOrTitle}". Use listAppWindows() to see available windows.`)
  }
  return new SaiApp(window)
}
566
+
567
/**
 * Lists every window reported by Window.all() as `{ id, title, pid }`.
 * `id` and `pid` both carry the window's pid.
 */
function listAppWindows() {
  const { Window } = binding()
  const summaries = []
  for (const win of Window.all()) {
    summaries.push({ id: win.pid, title: win.title, pid: win.pid })
  }
  return summaries
}
575
+
576
/**
 * Lists the applications reported by System.listApps() as `{ name, target }`,
 * taken from each entry's canonicalName and launchTarget ('' when missing).
 */
function listApps() {
  const { System } = binding()
  const describe = (app) => {
    const name = app.canonicalName ?? ''
    const target = app.launchTarget ?? ''
    return { name, target }
  }
  return System.listApps().map(describe)
}
583
+
584
+ // ── Files ──
585
+
586
/** Returns the result of the native readFile for `params.path`. */
function readFile(params) {
  const native = binding()
  return native.readFile(params.path)
}
589
+
590
/**
 * Writes text through the native writeFile and returns its result.
 *
 * @param {object} params
 * @param {string} params.text - content to write
 * @param {string} [params.path='SimularActionResult.txt']
 * @param {boolean} [params.overwrite=false] - when falsy, the append flag
 *   passed to writeFile is true
 */
function writeToFile(params) {
  const { text, path = 'SimularActionResult.txt', overwrite = false } = params
  const native = binding()
  return native.writeFile(path, text, !overwrite)
}
595
+
596
+ // ── VLM-backed primitives (built on the native AskModel / grounding) ──
597
+
598
/**
 * Asks the default AskModel a question, optionally with page context.
 *
 * @param {object} params
 * @param {string} params.prompt - the task/question appended to the prompt
 * @param {{text?: string, imageBase64?: string}} [params.context] - optional
 *   page text and screenshot; the image may be raw base64 or an image data URL
 * @returns {string} the model's answer with surrounding whitespace trimmed
 */
function ask(params) {
  const { prompt, context } = params
  const { AskModel, Image } = binding()
  const currentDate = new Date().toISOString()

  // Mirror unified-ui prompt shaping (persona header + page content + task).
  let inputPrompt = `I am Sai, a computer use agent. Now is ${currentDate}. My birthday is November 1, 2024. User asks me questions and I need you to provide a response pretending you were me, without prefacing or meta-commentary.`

  const images = []
  if (context?.text) {
    inputPrompt += `\nPage content:\n${context.text}`
  }
  if (context?.imageBase64) {
    // Use the shared helper rather than a separate png/jpeg-only pattern, so
    // every image data URL the helper recognises is reduced to raw base64.
    images.push(Image.fromBase64(stripDataUrl(context.imageBase64)))
  }
  inputPrompt += `\nTask: ${prompt}`

  // NOTE: unified-ui POSTs to the Simular cloud `/v1/chat/completions/ask`
  // endpoint (billing + server-managed model). Standalone simulang-js routes the
  // same shaped prompt through the locally configured AskModel provider instead.
  return AskModel.default()
    .ask(inputPrompt, null, images.length ? images : null)
    .trim()
}
622
+
623
/**
 * Evaluates `params.condition` against the current page content.
 *
 * @param {object} params
 * @param {string} params.condition
 * @returns the result of StateSatisfiesModel.default().stateSatisfies(...)
 * @throws {Error} wrapping any failure; the original error is kept on `cause`
 */
function stateSatisfies(params) {
  const { condition } = params
  try {
    // Mirrors unified-ui: capture the current page (a11y text + screenshot) and evaluate
    // the condition against the dedicated `v1/perception/state_satisfies` perception
    // model
    const { StateSatisfiesModel } = binding()
    const { text, imageBase64 } = pageContent()
    return StateSatisfiesModel.default().stateSatisfies(condition, text, imageBase64)
  } catch (error) {
    // Keep the message unchanged, and attach the original error so its stack
    // and type are not flattened into a string.
    throw new Error(`Failed to check state condition: ${error}`, { cause: error })
  }
}
636
+
637
/**
 * Reports whether every concept in `params.concepts` resolves to an element.
 *
 * Mirrors unified-ui: resolve each concept via getElementByConcept (exact AX
 * overallDescription match, with macOS/Linux vision fallback); absent on the
 * first concept that fails to resolve. An empty list yields true.
 */
function ConceptsExist(params) {
  const { concepts } = params
  for (const concept of concepts) {
    const element = getElementByConcept(concept)
    if (element) continue
    return false
  }
  return true
}
647
+
648
+ // ── Google Sheets helpers (pure keyboard + clipboard, ported from unified-ui) ──
649
+
650
/**
 * Moves the Google Sheets selection to `cell` by opening the "Go to range"
 * dialog, typing the cell reference and confirming with Return, pausing 60ms
 * after each step.
 */
function setFocusToCell(cell) {
  const settle = () => wait({ waitTime: 60, unit: 'ms' })

  // Cmd/Ctrl+J opens the Google Sheets "Go to range" dialog.
  // NOTE(review): only `cmd` is passed here; confirm the non-macOS behaviour.
  press({ key: 'j', cmd: true })
  settle()
  type({ text: cell, withReturn: true })
  settle()
}
657
+
658
/**
 * Reads a Google Sheets cell by focusing it, copying it, and reading the
 * clipboard.
 *
 * The clipboard content from before the copy is put back afterwards. The
 * restore runs in a `finally`, so the user's clipboard is not left overwritten
 * (or lost) when the copy or escape key press throws.
 *
 * @param {object} params
 * @param {string} params.cell - cell reference, e.g. 'B2'
 * @returns {string} the copied value with trailing whitespace removed
 */
function getGoogleSheetCellValue(params) {
  setFocusToCell(params.cell)
  const previousClipboardValue = getFromClipboard()
  try {
    wait({ waitTime: 30, unit: 'ms' })
    press({ key: 'c', cmd: true })
    wait({ waitTime: 60, unit: 'ms' })
    // Google Sheets appends newlines for empty cells; trim trailing whitespace.
    const cellValue = getFromClipboard().replace(/\s+$/g, '')
    press({ key: 'escape' })
    return cellValue
  } finally {
    copyToClipboard({ text: previousClipboardValue })
  }
}
670
+
671
/**
 * Replaces the content of a Google Sheets cell: focuses `params.cell`, presses
 * delete, pauses 60ms, then types `params.value` followed by Return.
 */
function setGoogleSheetCellValue(params) {
  const targetCell = params.cell
  setFocusToCell(targetCell)

  const clearKey = { key: 'delete' }
  press(clearKey)
  wait({ waitTime: 60, unit: 'ms' })

  const entry = { text: params.value, withReturn: true }
  type(entry)
}
677
+
678
+ // ── Unsupported Sai runtime/product primitives (no standalone equivalent) ──
679
+
680
// Shared `details` payloads for the unsupported stubs.
const unsupportedProduct = { category: 'product-runtime-only' }
const unsupportedCloud = { category: 'cloud-dependent' }
const unsupportedBrowser = Object.assign(
  { reason: 'Sai browser automation is backed by the Sai runtime Playwright/CDP layer, not simulang-js.' },
  unsupportedProduct,
)

// Every browser.* method is an async stub that rejects with the shared
// browser details; the error names the method as `browser.<method>`.
const browser = Object.fromEntries(
  ['newtab', 'getTab', 'listTabs', 'closeTab', 'disconnect', 'close'].map((method) => [
    method,
    unsupportedAsyncFunction(`browser.${method}`, unsupportedBrowser),
  ]),
)
695
+
696
+ module.exports = {
697
+ SaiApp,
698
+ UnsupportedSaiPrimitiveError,
699
+ ask,
700
+ // unified-ui browser.* is the Sai runtime Playwright/CDP layer, not simulang-js.
701
+ browser,
702
+ click,
703
+ ConceptsExist,
704
+ copyToClipboard,
705
+ drag,
706
+ // unified-ui exec runs through the exec security manager + approval UI.
707
+ exec: unsupportedAsyncFunction('exec', {
708
+ reason: 'Needs the Sai exec security manager and approval UI.',
709
+ ...unsupportedProduct,
710
+ }),
711
+ // unified-ui generateImage POSTs to the cloud `/v1/image-gen` service; AskModel is chat-only.
712
+ generateImage: unsupportedAsyncFunction('generateImage', {
713
+ reason: 'simulang-js has no image-generation provider (AskModel is chat-completions only).',
714
+ ...unsupportedCloud,
715
+ }),
716
+ getAppWindow,
717
+ getFromClipboard,
718
+ getGoogleSheetCellValue,
719
+ github: unsupportedNamespace('github', unsupportedProduct),
720
+ google: unsupportedNamespace('google', unsupportedProduct),
721
+ ground,
722
+ launch,
723
+ listApps,
724
+ listAppWindows,
725
+ move,
726
+ open,
727
+ pageContent,
728
+ press,
729
+ readFile,
730
+ // unified-ui requestApproval pauses execution for an approval/user-input UI.
731
+ requestApproval: unsupportedAsyncFunction('requestApproval', {
732
+ reason: 'Needs the Sai approval / user-input UI and execution pause.',
733
+ ...unsupportedProduct,
734
+ }),
735
+ // unified-ui respond shows a message (and optional confirm) in the Sai UI.
736
+ respond: unsupportedFunction('respond', {
737
+ reason: 'Needs the Sai message / confirmation UI (showMessageWithChoices).',
738
+ ...unsupportedProduct,
739
+ }),
740
+ sai: unsupportedNamespace('sai', unsupportedProduct),
741
+ scroll,
742
+ setGoogleSheetCellValue,
743
+ shortCut,
744
+ slack: unsupportedNamespace('slack', unsupportedProduct),
745
+ stateSatisfies,
746
+ type,
747
+ wait,
748
+ writeToFile,
749
+ }