stelo 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/LICENSE +184 -0
  2. package/README.md +853 -0
  3. package/dist/accessibility.d.ts +227 -0
  4. package/dist/accessibility.d.ts.map +1 -0
  5. package/dist/accessibility.js +602 -0
  6. package/dist/accessibility.js.map +1 -0
  7. package/dist/agent.d.ts +870 -0
  8. package/dist/agent.d.ts.map +1 -0
  9. package/dist/agent.js +1107 -0
  10. package/dist/agent.js.map +1 -0
  11. package/dist/audio-stream.d.ts +114 -0
  12. package/dist/audio-stream.d.ts.map +1 -0
  13. package/dist/audio-stream.js +167 -0
  14. package/dist/audio-stream.js.map +1 -0
  15. package/dist/clipboard.d.ts +99 -0
  16. package/dist/clipboard.d.ts.map +1 -0
  17. package/dist/clipboard.js +352 -0
  18. package/dist/clipboard.js.map +1 -0
  19. package/dist/config.d.ts +183 -0
  20. package/dist/config.d.ts.map +1 -0
  21. package/dist/config.js +477 -0
  22. package/dist/config.js.map +1 -0
  23. package/dist/context.d.ts +213 -0
  24. package/dist/context.d.ts.map +1 -0
  25. package/dist/context.js +387 -0
  26. package/dist/context.js.map +1 -0
  27. package/dist/cortex.d.ts +548 -0
  28. package/dist/cortex.d.ts.map +1 -0
  29. package/dist/cortex.js +1479 -0
  30. package/dist/cortex.js.map +1 -0
  31. package/dist/errors.d.ts +133 -0
  32. package/dist/errors.d.ts.map +1 -0
  33. package/dist/errors.js +278 -0
  34. package/dist/errors.js.map +1 -0
  35. package/dist/events.d.ts +227 -0
  36. package/dist/events.d.ts.map +1 -0
  37. package/dist/events.js +429 -0
  38. package/dist/events.js.map +1 -0
  39. package/dist/executor.d.ts +212 -0
  40. package/dist/executor.d.ts.map +1 -0
  41. package/dist/executor.js +545 -0
  42. package/dist/executor.js.map +1 -0
  43. package/dist/index.d.ts +69 -0
  44. package/dist/index.d.ts.map +1 -0
  45. package/dist/index.js +167 -0
  46. package/dist/index.js.map +1 -0
  47. package/dist/integration.d.ts +159 -0
  48. package/dist/integration.d.ts.map +1 -0
  49. package/dist/integration.js +533 -0
  50. package/dist/integration.js.map +1 -0
  51. package/dist/keyboard.d.ts +276 -0
  52. package/dist/keyboard.d.ts.map +1 -0
  53. package/dist/keyboard.js +404 -0
  54. package/dist/keyboard.js.map +1 -0
  55. package/dist/logger.d.ts +198 -0
  56. package/dist/logger.d.ts.map +1 -0
  57. package/dist/logger.js +516 -0
  58. package/dist/logger.js.map +1 -0
  59. package/dist/middleware.d.ts +183 -0
  60. package/dist/middleware.d.ts.map +1 -0
  61. package/dist/middleware.js +493 -0
  62. package/dist/middleware.js.map +1 -0
  63. package/dist/monitor.d.ts +136 -0
  64. package/dist/monitor.d.ts.map +1 -0
  65. package/dist/monitor.js +341 -0
  66. package/dist/monitor.js.map +1 -0
  67. package/dist/mouse.d.ts +290 -0
  68. package/dist/mouse.d.ts.map +1 -0
  69. package/dist/mouse.js +466 -0
  70. package/dist/mouse.js.map +1 -0
  71. package/dist/plugin.d.ts +157 -0
  72. package/dist/plugin.d.ts.map +1 -0
  73. package/dist/plugin.js +409 -0
  74. package/dist/plugin.js.map +1 -0
  75. package/dist/process.d.ts +106 -0
  76. package/dist/process.d.ts.map +1 -0
  77. package/dist/process.js +326 -0
  78. package/dist/process.js.map +1 -0
  79. package/dist/recorder.d.ts +100 -0
  80. package/dist/recorder.d.ts.map +1 -0
  81. package/dist/recorder.js +258 -0
  82. package/dist/recorder.js.map +1 -0
  83. package/dist/safety.d.ts +59 -0
  84. package/dist/safety.d.ts.map +1 -0
  85. package/dist/safety.js +98 -0
  86. package/dist/safety.js.map +1 -0
  87. package/dist/scheduler.d.ts +152 -0
  88. package/dist/scheduler.d.ts.map +1 -0
  89. package/dist/scheduler.js +615 -0
  90. package/dist/scheduler.js.map +1 -0
  91. package/dist/screen.d.ts +96 -0
  92. package/dist/screen.d.ts.map +1 -0
  93. package/dist/screen.js +154 -0
  94. package/dist/screen.js.map +1 -0
  95. package/dist/session.d.ts +209 -0
  96. package/dist/session.d.ts.map +1 -0
  97. package/dist/session.js +479 -0
  98. package/dist/session.js.map +1 -0
  99. package/dist/stream.d.ts +168 -0
  100. package/dist/stream.d.ts.map +1 -0
  101. package/dist/stream.js +298 -0
  102. package/dist/stream.js.map +1 -0
  103. package/dist/telemetry.d.ts +223 -0
  104. package/dist/telemetry.d.ts.map +1 -0
  105. package/dist/telemetry.js +433 -0
  106. package/dist/telemetry.js.map +1 -0
  107. package/dist/types.d.ts +165 -0
  108. package/dist/types.d.ts.map +1 -0
  109. package/dist/types.js +8 -0
  110. package/dist/types.js.map +1 -0
  111. package/dist/utils/bezier.d.ts +51 -0
  112. package/dist/utils/bezier.d.ts.map +1 -0
  113. package/dist/utils/bezier.js +117 -0
  114. package/dist/utils/bezier.js.map +1 -0
  115. package/dist/utils/helpers.d.ts +90 -0
  116. package/dist/utils/helpers.d.ts.map +1 -0
  117. package/dist/utils/helpers.js +143 -0
  118. package/dist/utils/helpers.js.map +1 -0
  119. package/dist/utils/index.d.ts +4 -0
  120. package/dist/utils/index.d.ts.map +1 -0
  121. package/dist/utils/index.js +18 -0
  122. package/dist/utils/index.js.map +1 -0
  123. package/dist/validation.d.ts +254 -0
  124. package/dist/validation.d.ts.map +1 -0
  125. package/dist/validation.js +478 -0
  126. package/dist/validation.js.map +1 -0
  127. package/dist/vision.d.ts +719 -0
  128. package/dist/vision.d.ts.map +1 -0
  129. package/dist/vision.js +1197 -0
  130. package/dist/vision.js.map +1 -0
  131. package/dist/window.d.ts +80 -0
  132. package/dist/window.d.ts.map +1 -0
  133. package/dist/window.js +170 -0
  134. package/dist/window.js.map +1 -0
  135. package/dist/workflow.d.ts +224 -0
  136. package/dist/workflow.d.ts.map +1 -0
  137. package/dist/workflow.js +578 -0
  138. package/dist/workflow.js.map +1 -0
  139. package/index.d.ts +840 -0
  140. package/index.js +495 -0
  141. package/package.json +91 -0
package/dist/vision.js ADDED
@@ -0,0 +1,1197 @@
1
+ "use strict";
2
+ // ============================================================================
3
+ // Stelo — Vision & Change Detection Module
4
+ // ============================================================================
5
+ // Advanced screen analysis for desktop automation. Provides visual grounding,
6
+ // change detection, action verification, and state tracking primitives.
7
+ // ============================================================================
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.AgentVision = exports.vision = void 0;
10
+ exports.createAgentVision = createAgentVision;
11
+ const native = require('../index.js');
12
+ // ── Vision Module ───────────────────────────────────────────────────────────
13
+ /**
14
+ * Vision and change detection utilities for automation workflows.
15
+ *
16
+ * These primitives enable visual grounding, action verification, and
17
+ * state tracking - essential building blocks for robust automation flows.
18
+ *
19
+ * @example
20
+ * ```typescript
21
+ * import { vision, screen } from 'stelo';
22
+ *
23
+ * // Take a reference screenshot
24
+ * const before = vision.captureReference();
25
+ *
26
+ * // Perform some action
27
+ * await mouse.click();
28
+ *
29
+ * // Verify the screen changed
30
+ * const diff = vision.diff(before);
31
+ * console.log(`${diff.changePercentage}% of screen changed`);
32
+ *
33
+ * // Wait for UI to stabilize after action
34
+ * await vision.waitForStable({ stabilityThreshold: 0.5 });
35
+ *
36
+ * // Analyze screen as a grid for vision model
37
+ * const grid = vision.analyzeGrid(16, 9);
38
+ * const textCells = grid.cells.filter(c => c.likelyText);
39
+ * ```
40
+ */
41
+ exports.vision = {
42
+ /**
43
+ * Capture a reference screenshot for later comparison.
44
+ * Use this before triggering an action to verify it had an effect.
45
+ *
46
+ * @param region - Optional region to capture (full screen if not specified)
47
+ * @returns Reference object to pass to diff()
48
+ *
49
+ * @example
50
+ * ```typescript
51
+ * const before = vision.captureReference();
52
+ * await mouse.click();
53
+ * const diff = vision.diff(before);
54
+ * ```
55
+ */
56
+ captureReference(region) {
57
+ const cap = region
58
+ ? native.screenCapture(region.x, region.y, region.width, region.height)
59
+ : native.screenCapture();
60
+ return {
61
+ data: cap.data,
62
+ width: cap.width,
63
+ height: cap.height,
64
+ x: region?.x ?? 0,
65
+ y: region?.y ?? 0,
66
+ };
67
+ },
68
+ /**
69
+ * Compare current screen to a reference capture.
70
+ * Returns detailed diff statistics including change percentage and bounds.
71
+ *
72
+ * @param reference - Reference from captureReference()
73
+ * @param options - Diff options (tolerance, sample rate)
74
+ * @returns Diff result with change statistics
75
+ *
76
+ * @example
77
+ * ```typescript
78
+ * const diff = vision.diff(before, { tolerance: 15 });
79
+ * if (diff.changePercentage > 1) {
80
+ * console.log('Screen changed!', diff.changedBounds);
81
+ * }
82
+ * ```
83
+ */
84
+ diff(reference, options) {
85
+ const result = native.visionDiff(reference.data, reference.width, reference.height, reference.x, reference.y, reference.width, reference.height, options?.tolerance, options?.sampleRate);
86
+ return {
87
+ changePercentage: result.changePercentage ?? result.change_percentage,
88
+ changedPixelCount: result.changedPixelCount ?? result.changed_pixel_count,
89
+ totalPixelCount: result.totalPixelCount ?? result.total_pixel_count,
90
+ changedBounds: (result.hasChanges ?? result.has_changes)
91
+ ? {
92
+ x: result.changedX ?? result.changed_x,
93
+ y: result.changedY ?? result.changed_y,
94
+ width: result.changedWidth ?? result.changed_width,
95
+ height: result.changedHeight ?? result.changed_height,
96
+ }
97
+ : undefined,
98
+ averageDiff: result.averageDiff ?? result.average_diff,
99
+ maxDiff: result.maxDiff ?? result.max_diff,
100
+ hasChanges: result.hasChanges ?? result.has_changes,
101
+ };
102
+ },
103
+ /**
104
+ * Analyze screen as a grid of cells.
105
+ * Each cell includes statistics useful for vision model region selection.
106
+ *
107
+ * This enables efficient visual grounding - instead of sending the entire
108
+ * screen to a vision model, you can identify regions of interest first.
109
+ *
110
+ * @param cols - Number of columns in grid
111
+ * @param rows - Number of rows in grid
112
+ * @param region - Optional region to analyze (full screen if not specified)
113
+ * @returns Grid analysis with cell statistics
114
+ *
115
+ * @example
116
+ * ```typescript
117
+ * // Analyze screen as 16x9 grid
118
+ * const grid = vision.analyzeGrid(16, 9);
119
+ *
120
+ * // Find cells likely containing text
121
+ * const textCells = grid.cells.filter(c => c.likelyText);
122
+ *
123
+ * // Get center of cell [3, 2]
124
+ * const center = vision.gridCellCenter(grid, 3, 2);
125
+ * await mouse.click(center.x, center.y);
126
+ * ```
127
+ */
128
+ analyzeGrid(cols, rows, region) {
129
+ const result = native.visionAnalyzeGrid(cols, rows, region?.x, region?.y, region?.width, region?.height);
130
+ const cells = result.cells.map((c) => ({
131
+ gridX: c.gridX ?? c.grid_x,
132
+ gridY: c.gridY ?? c.grid_y,
133
+ screenX: c.screenX ?? c.screen_x,
134
+ screenY: c.screenY ?? c.screen_y,
135
+ width: c.width,
136
+ height: c.height,
137
+ avgColor: { r: c.avgR ?? c.avg_r, g: c.avgG ?? c.avg_g, b: c.avgB ?? c.avg_b },
138
+ variance: c.variance,
139
+ likelyText: c.likelyText ?? c.likely_text,
140
+ likelyUI: c.likelyUi ?? c.likely_ui,
141
+ }));
142
+ const activeCells = [];
143
+ const activeCellsX = result.activeCellsX ?? result.active_cells_x;
144
+ const activeCellsY = result.activeCellsY ?? result.active_cells_y;
145
+ for (let i = 0; i < activeCellsX.length; i++) {
146
+ activeCells.push({
147
+ gridX: activeCellsX[i],
148
+ gridY: activeCellsY[i],
149
+ });
150
+ }
151
+ return {
152
+ cols: result.cols,
153
+ rows: result.rows,
154
+ cellWidth: result.cellWidth ?? result.cell_width,
155
+ cellHeight: result.cellHeight ?? result.cell_height,
156
+ cells,
157
+ activeCells,
158
+ };
159
+ },
160
+ /**
161
+ * Get the screen center point of a grid cell.
162
+ *
163
+ * @param grid - Grid analysis from analyzeGrid()
164
+ * @param gridX - Cell column index
165
+ * @param gridY - Cell row index
166
+ * @returns Screen coordinates of cell center, or undefined if out of bounds
167
+ */
168
+ gridCellCenter(grid, gridX, gridY) {
169
+ const cell = grid.cells.find((c) => c.gridX === gridX && c.gridY === gridY);
170
+ if (!cell)
171
+ return undefined;
172
+ return {
173
+ x: cell.screenX + Math.floor(cell.width / 2),
174
+ y: cell.screenY + Math.floor(cell.height / 2),
175
+ };
176
+ },
177
+ /**
178
+ * Wait until the screen changes beyond a threshold.
179
+ * Useful for detecting when an action triggers a visual response.
180
+ *
181
+ * @param thresholdPercent - Minimum change percentage to trigger (0-100)
182
+ * @param timeoutMs - Maximum time to wait
183
+ * @param options - Wait options (region, poll interval)
184
+ * @returns true if change detected, false if timed out
185
+ *
186
+ * @example
187
+ * ```typescript
188
+ * // Click and wait for visual feedback
189
+ * mouse.click();
190
+ * const changed = await vision.waitForChange(0.5, 3000);
191
+ * if (!changed) console.log('Button may not have responded');
192
+ * ```
193
+ */
194
+ async waitForChange(thresholdPercent, timeoutMs, options) {
195
+ return native.visionWaitForChange(thresholdPercent, timeoutMs, options?.pollIntervalMs, options?.region?.x, options?.region?.y, options?.region?.width, options?.region?.height);
196
+ },
197
+ /**
198
+ * Wait until the screen stabilizes (stops changing).
199
+ * Essential for waiting for animations, loading spinners, or transitions.
200
+ *
201
+ * @param stabilityThreshold - Maximum change % to consider "stable" (0-100)
202
+ * @param stableDurationMs - How long screen must remain stable
203
+ * @param timeoutMs - Maximum time to wait
204
+ * @param options - Wait options (region, poll interval)
205
+ * @returns true if stabilized, false if timed out
206
+ *
207
+ * @example
208
+ * ```typescript
209
+ * // Click a button and wait for animation to complete
210
+ * mouse.click();
211
+ * await vision.waitForStable(0.1, 200, 5000);
212
+ * // Screen is now stable - safe to read or continue
213
+ * ```
214
+ */
215
+ async waitForStable(stabilityThreshold, stableDurationMs, timeoutMs, options) {
216
+ return native.visionWaitForStable(stabilityThreshold, stableDurationMs, timeoutMs, options?.pollIntervalMs, options?.region?.x, options?.region?.y, options?.region?.width, options?.region?.height);
217
+ },
218
+ /**
219
+ * Compute a perceptual hash of a screen region.
220
+ * Two visually similar images will have hashes with low Hamming distance.
221
+ *
222
+ * Use this for fast "has the screen changed significantly?" checks
223
+ * without doing full pixel comparison.
224
+ *
225
+ * @param region - Optional region to hash (full screen if not specified)
226
+ * @returns 64-bit perceptual hash
227
+ *
228
+ * @example
229
+ * ```typescript
230
+ * const hash1 = vision.perceptualHash();
231
+ * await performSomeAction();
232
+ * const hash2 = vision.perceptualHash();
233
+ * const distance = vision.hashDistance(hash1, hash2);
234
+ * if (distance < 5) console.log('Screen looks similar');
235
+ * ```
236
+ */
237
+ perceptualHash(region) {
238
+ return native.visionPerceptualHash(region?.x, region?.y, region?.width, region?.height);
239
+ },
240
+ /**
241
+ * Compute Hamming distance between two perceptual hashes.
242
+ * Lower distance = more visually similar. 0 = identical.
243
+ *
244
+ * Rules of thumb:
245
+ * - 0-5: Very similar (minor changes)
246
+ * - 5-10: Moderately similar
247
+ * - 10-20: Significant differences
248
+ * - 20+: Completely different
249
+ *
250
+ * @param hash1 - First perceptual hash
251
+ * @param hash2 - Second perceptual hash
252
+ * @returns Hamming distance (0-64)
253
+ */
254
+ hashDistance(hash1, hash2) {
255
+ return native.visionHashDistance(hash1, hash2);
256
+ },
257
+ /**
258
+ * Find all pixels matching a color within a region.
259
+ * Returns all matching points up to a maximum count.
260
+ *
261
+ * @param color - Color to search for (RGB)
262
+ * @param tolerance - Color distance tolerance (0-441)
263
+ * @param options - Search options
264
+ * @returns Array of matching screen coordinates
265
+ *
266
+ * @example
267
+ * ```typescript
268
+ * // Find all red pixels
269
+ * const redPixels = vision.findAllColors(
270
+ * { r: 255, g: 0, b: 0 },
271
+ * 30,
272
+ * { maxResults: 100 }
273
+ * );
274
+ * ```
275
+ */
276
+ findAllColors(color, tolerance, options) {
277
+ const result = native.visionFindAllColors(color.r, color.g, color.b, tolerance, options?.maxResults, options?.region?.x, options?.region?.y, options?.region?.width, options?.region?.height);
278
+ return result.map((p) => ({ x: p.x, y: p.y }));
279
+ },
280
+ /**
281
+ * Find clusters of similar colors (potential UI elements).
282
+ * Clusters are groups of nearby pixels with similar colors.
283
+ *
284
+ * @param color - Color to search for (RGB)
285
+ * @param tolerance - Color distance tolerance (0-441)
286
+ * @param minClusterSize - Minimum pixels to form a cluster
287
+ * @param region - Optional region to search (full screen if not specified)
288
+ * @returns Array of bounding rectangles for each cluster
289
+ *
290
+ * @example
291
+ * ```typescript
292
+ * // Find blue button-like regions
293
+ * const clusters = vision.findColorClusters(
294
+ * { r: 0, g: 120, b: 215 }, // Windows blue
295
+ * 40,
296
+ * 50 // At least 50 pixels
297
+ * );
298
+ * if (clusters.length > 0) {
299
+ * // Click center of first cluster
300
+ * const btn = clusters[0];
301
+ * mouse.click(btn.x + btn.width / 2, btn.y + btn.height / 2);
302
+ * }
303
+ * ```
304
+ */
305
+ findColorClusters(color, tolerance, minClusterSize, region) {
306
+ const result = native.visionFindColorClusters(color.r, color.g, color.b, tolerance, minClusterSize, region?.x, region?.y, region?.width, region?.height);
307
+ return result.map((r) => ({
308
+ x: r.x,
309
+ y: r.y,
310
+ width: r.width,
311
+ height: r.height,
312
+ }));
313
+ },
314
+ /**
315
+ * Verify an action caused a visual change.
316
+ * High-level primitive that captures before/after and compares.
317
+ *
318
+ * @param action - Async action to execute and verify
319
+ * @param minChangePercent - Minimum change to consider verified
320
+ * @param timeoutMs - Maximum time to wait for change
321
+ * @param region - Optional region to monitor
322
+ * @returns Verification result with diff statistics
323
+ *
324
+ * @example
325
+ * ```typescript
326
+ * const result = await vision.verifyAction(
327
+ * async () => { mouse.click(); },
328
+ * 0.5, // At least 0.5% change
329
+ * 2000
330
+ * );
331
+ * if (!result.verified) {
332
+ * // Click didn't cause visual change - might need to retry
333
+ * }
334
+ * ```
335
+ */
336
+ async verifyAction(action, minChangePercent, timeoutMs, region) {
337
+ const before = this.captureReference(region);
338
+ const startTime = Date.now();
339
+ await action();
340
+ // Poll for change
341
+ const pollInterval = 50;
342
+ const deadline = startTime + timeoutMs;
343
+ while (Date.now() < deadline) {
344
+ const diff = this.diff(before);
345
+ if (diff.changePercentage >= minChangePercent) {
346
+ return {
347
+ verified: true,
348
+ diff,
349
+ durationMs: Date.now() - startTime,
350
+ };
351
+ }
352
+ await new Promise((resolve) => setTimeout(resolve, pollInterval));
353
+ }
354
+ // Final check
355
+ const finalDiff = this.diff(before);
356
+ return {
357
+ verified: finalDiff.changePercentage >= minChangePercent,
358
+ diff: finalDiff,
359
+ durationMs: Date.now() - startTime,
360
+ };
361
+ },
362
+ /**
363
+ * Take a reference, perform action, wait for stability.
364
+ * Combines action execution with waiting for the UI to settle.
365
+ *
366
+ * @param action - Action to execute
367
+ * @param options - Wait and verification options
368
+ * @returns true if action completed and screen stabilized
369
+ *
370
+ * @example
371
+ * ```typescript
372
+ * // Click and wait for any animation to complete
373
+ * await vision.doAndWaitStable(async () => {
374
+ * await mouse.click(100, 200);
375
+ * });
376
+ * // Screen is now stable
377
+ * ```
378
+ */
379
+ async doAndWaitStable(action, options) {
380
+ const { stabilityThreshold = 0.1, stableDurationMs = 150, timeoutMs = 5000, region } = options ?? {};
381
+ await action();
382
+ return this.waitForStable(stabilityThreshold, stableDurationMs, timeoutMs, { region });
383
+ },
384
+ };
385
+ // ─── Core Fingerprinting Functions ──────────────────────────────────────────
386
// Build a compact visual descriptor of one RGBA region: brightness and hue
// histograms, edge density, contrast, variance, aspect ratio, and a 16-bit
// mini perceptual hash. Pixels are subsampled on a 2px lattice for speed.
function computeFingerprint(data, width, height, startX, startY, cellW, cellH) {
    const SAMPLE_STEP = 2;
    const brightnessHist = new Float64Array(16);
    const hueHist = new Float64Array(8);
    let total = 0;
    let totalSq = 0;
    let edgeCount = 0;
    let lo = 255;
    let hi = 0;
    let samples = 0;
    const yEnd = Math.min(startY + cellH, height);
    const xEnd = Math.min(startX + cellW, width);
    for (let py = startY; py < yEnd; py += SAMPLE_STEP) {
        for (let px = startX; px < xEnd; px += SAMPLE_STEP) {
            const o = (py * width + px) * 4;
            const r = data[o];
            const g = data[o + 1];
            const b = data[o + 2];
            // Rec. 601-style integer luma approximation.
            const luma = (r * 299 + g * 587 + b * 114) / 1000;
            total += luma;
            totalSq += luma * luma;
            samples++;
            lo = Math.min(lo, luma);
            hi = Math.max(hi, luma);
            brightnessHist[Math.min(15, Math.floor(luma / 16))]++;
            const maxC = Math.max(r, g, b);
            const minC = Math.min(r, g, b);
            const chroma = maxC - minC;
            // Only chromatic pixels contribute to the hue histogram.
            if (chroma > 20) {
                let hue;
                if (maxC === r) {
                    hue = ((g - b) / chroma) % 6;
                }
                else if (maxC === g) {
                    hue = (b - r) / chroma + 2;
                }
                else {
                    hue = (r - g) / chroma + 4;
                }
                if (hue < 0) {
                    hue += 6;
                }
                hueHist[Math.min(7, Math.floor((hue / 6) * 8))]++;
            }
            // Horizontal gradient check against the next sampled pixel.
            const nx = px + SAMPLE_STEP;
            if (nx < startX + cellW && nx < width) {
                const no = (py * width + nx) * 4;
                const nLuma = (data[no] * 299 + data[no + 1] * 587 + data[no + 2] * 114) / 1000;
                if (Math.abs(luma - nLuma) > 25) {
                    edgeCount++;
                }
            }
        }
    }
    // Normalize histograms so they are comparable across region sizes.
    if (samples > 0) {
        for (let i = 0; i < 16; i++) {
            brightnessHist[i] /= samples;
        }
        let hueMass = 0;
        for (let i = 0; i < 8; i++) {
            hueMass += hueHist[i];
        }
        if (hueMass > 0) {
            for (let i = 0; i < 8; i++) {
                hueHist[i] /= hueMass;
            }
        }
    }
    // Guard against division by zero for degenerate (empty) regions.
    const n = samples === 0 ? 1 : samples;
    const mean = total / n;
    const variance = Math.max(0, totalSq / n - mean * mean);
    // 4x4 mini-grid perceptual hash: one bit per cell, set when the cell's
    // top-left sample is brighter than the region mean.
    const miniW = Math.floor(cellW / 4) || 1;
    const miniH = Math.floor(cellH / 4) || 1;
    let phash = 0;
    for (let bit = 0; bit < 16; bit++) {
        const sx = startX + (bit % 4) * miniW;
        const sy = startY + Math.floor(bit / 4) * miniH;
        if (sx >= width || sy >= height) {
            continue;
        }
        const o = (sy * width + sx) * 4;
        const sample = (data[o] * 299 + data[o + 1] * 587 + data[o + 2] * 114) / 1000;
        if (sample > mean) {
            phash |= 1 << bit;
        }
    }
    return {
        brightnessHist,
        hueHist,
        edgeDensity: edgeCount / Math.max(1, n),
        avgBrightness: mean,
        contrast: hi - lo,
        variance,
        aspect: cellW / Math.max(1, cellH),
        phash,
    };
}
473
// Combined similarity score (0-1) between two fingerprints: a weighted mix
// of histogram intersections, edge/contrast/brightness closeness, and the
// normalized Hamming similarity of the 16-bit perceptual hashes.
function fingerprintSimilarity(a, b) {
    // Histogram intersection — both brightness histograms sum to 1.
    let brightOverlap = 0;
    for (let i = 0; i < 16; i++) {
        brightOverlap += Math.min(a.brightnessHist[i], b.brightnessHist[i]);
    }
    let hueOverlap = 0;
    let massA = 0;
    let massB = 0;
    for (let i = 0; i < 8; i++) {
        hueOverlap += Math.min(a.hueHist[i], b.hueHist[i]);
        massA += a.hueHist[i];
        massB += b.hueHist[i];
    }
    // If both regions are essentially colorless, hue is irrelevant —
    // treat them as hue-identical.
    if (massA < 0.001 && massB < 0.001) {
        hueOverlap = 1;
    }
    const edgeScore = 1 - Math.abs(a.edgeDensity - b.edgeDensity);
    const contrastScore = 1 - Math.abs(a.contrast - b.contrast) / 255;
    const brightScore = 1 - Math.abs(a.avgBrightness - b.avgBrightness) / 255;
    // Popcount of the XOR gives the perceptual-hash Hamming distance.
    let bits = a.phash ^ b.phash;
    let distance = 0;
    while (bits) {
        distance += bits & 1;
        bits >>= 1;
    }
    const phashScore = 1 - distance / 16;
    return (brightOverlap * 0.25 +
        hueOverlap * 0.10 +
        edgeScore * 0.15 +
        contrastScore * 0.15 +
        brightScore * 0.10 +
        phashScore * 0.25);
}
504
+ // ─── Helper to create LocatedElement ────────────────────────────────────────
505
// Wrap a matched screen rectangle in an actionable element object that
// exposes click/type/move helpers targeting the rectangle's center point,
// plus a re-check against the stored fingerprint.
function makeLocatedElement(x, y, w, h, confidence, similarity, fp, label, supportCount = 1) {
    const centerX = () => x + Math.floor(w / 2);
    const centerY = () => y + Math.floor(h / 2);
    return {
        x, y, w, h, confidence, similarity, label, supportCount, fingerprint: fp,
        // Left-click (or the given button) at the element center.
        click(button) {
            native.mouseClickAt(centerX(), centerY(), button ?? 'left', false);
        },
        doubleClick() {
            native.mouseMove(centerX(), centerY());
            native.mouseDoubleClick('left');
        },
        rightClick() {
            native.mouseClickAt(centerX(), centerY(), 'right', false);
        },
        // Move the cursor to the center; optionally animate the movement.
        moveTo(options) {
            if (options?.smooth) {
                native.mouseMoveSmooth(centerX(), centerY(), options.duration ?? 300, 'easeInOut');
            }
            else {
                native.mouseMove(centerX(), centerY());
            }
        },
        // Click to focus, then type the given text.
        type(text) {
            native.mouseClickAt(centerX(), centerY(), 'left', false);
            native.keyboardType(text);
        },
        // Re-fingerprint the same rectangle and compare against the stored
        // fingerprint; true when similarity meets the threshold (default 0.7).
        isStillPresent(threshold) {
            const cap = native.screenCapture();
            const current = computeFingerprint(cap.data, cap.width, cap.height, x, y, w, h);
            return fingerprintSimilarity(fp, current) >= (threshold ?? 0.7);
        },
    };
}
543
+ // ─── AgentVision Class ──────────────────────────────────────────────────────
544
+ class AgentVision {
545
    // Grid columns used when fingerprinting the screen for find()/findAll().
    cols;
    // Grid rows used when fingerprinting the screen for find()/findAll().
    rows;
    // Upper bound on stored element memories (enforced via evictOldMemories).
    maxMemories;
    // Fallback region size (px) when remember() is given no explicit size.
    defaultRegionSize;
    // Minimum fingerprint similarity (0-1) for a grid cell to count as a match.
    matchThreshold;
    // Pixel radius — presumably the near-position search radius for locate(); confirm.
    searchRadius;
    // Optional app identifier; when set, memories are scoped to this app.
    appContext;
    memories = [];
    sequenceCounter = 0;
    recentActions = [];
    temporalPatterns = new Map();
    // Grid cache
    gridFingerprints = [];
    gridDirty = true;
    lastGridTime = 0;
    lastCapture = null;
561
    constructor(config = {}) {
        // Every knob is optional; defaults give a 32x18 grid (16:9 aspect).
        this.cols = config.cols ?? 32;
        this.rows = config.rows ?? 18;
        this.maxMemories = config.maxMemories ?? 2000;
        this.defaultRegionSize = config.defaultRegionSize ?? { w: 100, h: 60 };
        this.matchThreshold = config.matchThreshold ?? 0.65;
        this.searchRadius = config.searchRadius ?? 400;
        // Undefined when not provided — memories then match any app context.
        this.appContext = config.appContext;
    }
570
+ // ═══ FINGERPRINT — Capture visual identity of any region ══════════════
571
+ /**
572
+ * Fingerprint a screen region. Returns a compact visual descriptor
573
+ * that can be compared against other fingerprints for similarity.
574
+ *
575
+ * @example
576
+ * ```typescript
577
+ * const fp = agentVision.fingerprint(100, 200, 150, 40);
578
+ * // Later: check if the same element is somewhere else
579
+ * const match = agentVision.locate(fp);
580
+ * ```
581
+ */
582
    fingerprint(x, y, w, h) {
        // Full-screen capture; only the requested sub-rectangle is fingerprinted.
        const cap = native.screenCapture();
        return computeFingerprint(cap.data, cap.width, cap.height, x, y, w, h);
    }
586
+ /**
587
+ * Compute similarity between two fingerprints (0-1, 1 = identical).
588
+ */
589
    similarity(a, b) {
        // Delegates to the module-level weighted-feature similarity metric.
        return fingerprintSimilarity(a, b);
    }
592
+ // ═══ REMEMBER — Teach the agent about screen elements ═════════════════
593
+ /**
594
+ * Remember an element at a position with a label.
595
+ * The agent fingerprints the region and stores it for later recognition.
596
+ *
597
+ * @example
598
+ * ```typescript
599
+ * // Agent clicked Save, now remembers what it looks like
600
+ * agentVision.remember('save-button', 350, 15, { w: 80, h: 30 });
601
+ *
602
+ * // Later, find it again even if it moved
603
+ * const saveBtn = agentVision.find('save-button');
604
+ * if (saveBtn) saveBtn.click();
605
+ * ```
606
+ */
607
    remember(label, x, y, regionSize, action = 'click') {
        const size = regionSize ?? this.defaultRegionSize;
        // Fingerprint region is centered on (x, y), clamped so it never
        // starts off-screen at the top/left edges.
        const rx = Math.max(0, x - Math.floor(size.w / 2));
        const ry = Math.max(0, y - Math.floor(size.h / 2));
        const cap = native.screenCapture();
        const fp = computeFingerprint(cap.data, cap.width, cap.height, rx, ry, size.w, size.h);
        const memory = {
            fingerprint: fp,
            action,
            label,
            position: { x, y },
            regionSize: size,
            timestamp: Date.now(),
            appContext: this.appContext,
            sequenceIndex: this.sequenceCounter++,
        };
        this.memories.push(memory);
        this.evictOldMemories();
        // Memories changed — cached grid fingerprints may now be stale.
        this.gridDirty = true;
        // Track temporal patterns
        this.trackAction(action, label, x, y);
        return memory;
    }
630
+ /**
631
+ * Remember after performing a click — records the action AND verifies it.
632
+ * Captures before/after fingerprints and records whether the click did anything.
633
+ *
634
+ * @example
635
+ * ```typescript
636
+ * const result = await agentVision.rememberClick('file-menu', 44, 12);
637
+ * console.log(`Click ${result.success ? 'worked' : 'had no effect'}`);
638
+ * ```
639
+ */
640
+ async rememberClick(label, x, y, regionSize) {
641
+ const size = regionSize ?? this.defaultRegionSize;
642
+ const rx = Math.max(0, x - Math.floor(size.w / 2));
643
+ const ry = Math.max(0, y - Math.floor(size.h / 2));
644
+ // Capture before
645
+ const capBefore = native.screenCapture();
646
+ const fp = computeFingerprint(capBefore.data, capBefore.width, capBefore.height, rx, ry, size.w, size.h);
647
+ // Perform click
648
+ native.mouseClickAt(x, y, 'left', false);
649
+ // Wait a bit for UI response
650
+ await new Promise(r => setTimeout(r, 400));
651
+ // Capture after and measure change
652
+ const capAfter = native.screenCapture();
653
+ const diff = native.visionDiff(capBefore.data, capBefore.width, capBefore.height, undefined, undefined, undefined, undefined, 10, 2);
654
+ const changePercent = diff.changePercentage ?? diff.change_percentage ?? 0;
655
+ const success = changePercent > 0.3;
656
+ const memory = {
657
+ fingerprint: fp,
658
+ action: 'click',
659
+ label,
660
+ position: { x, y },
661
+ regionSize: size,
662
+ timestamp: Date.now(),
663
+ outcome: { screenChangePercent: changePercent, responseTimeMs: 400 },
664
+ success,
665
+ appContext: this.appContext,
666
+ sequenceIndex: this.sequenceCounter++,
667
+ };
668
+ this.memories.push(memory);
669
+ this.evictOldMemories();
670
+ this.gridDirty = true;
671
+ this.trackAction('click', label, x, y);
672
+ return { ...memory, success };
673
+ }
674
+ // ═══ FIND — Locate remembered elements on screen ══════════════════════
675
+ /**
676
+ * Find a previously remembered element on the current screen.
677
+ * Uses visual fingerprint matching — works even if the element moved.
678
+ *
679
+ * @returns The best match, or null if nothing above threshold
680
+ *
681
+ * @example
682
+ * ```typescript
683
+ * const saveBtn = agentVision.find('save-button');
684
+ * if (saveBtn) {
685
+ * saveBtn.click();
686
+ * } else {
687
+ * console.log('Save button not visible');
688
+ * }
689
+ * ```
690
+ */
691
+ find(label) {
692
+ const results = this.findAll(label, 1);
693
+ return results.length > 0 ? results[0] : null;
694
+ }
695
+ /**
696
+ * Find all instances of a remembered element on screen.
697
+ *
698
+ * @example
699
+ * ```typescript
700
+ * // Find all things that look like "close-button"
701
+ * const closeButtons = agentVision.findAll('close-button');
702
+ * console.log(`Found ${closeButtons.length} close buttons`);
703
+ * ```
704
+ */
705
+ findAll(label, maxResults = 20) {
706
+ const labeledMemories = this.memories.filter(m => m.label === label && (!this.appContext || m.appContext === this.appContext));
707
+ if (labeledMemories.length === 0)
708
+ return [];
709
+ this.refreshGrid();
710
+ const cap = this.lastCapture;
711
+ const cellW = Math.floor(cap.width / this.cols);
712
+ const cellH = Math.floor(cap.height / this.rows);
713
+ const results = [];
714
+ for (let row = 0; row < this.rows; row++) {
715
+ for (let col = 0; col < this.cols; col++) {
716
+ const idx = row * this.cols + col;
717
+ const fp = this.gridFingerprints[idx];
718
+ if (!fp)
719
+ continue;
720
+ let bestSim = 0;
721
+ for (const mem of labeledMemories) {
722
+ const sim = fingerprintSimilarity(fp, mem.fingerprint);
723
+ if (sim > bestSim)
724
+ bestSim = sim;
725
+ }
726
+ if (bestSim < this.matchThreshold)
727
+ continue;
728
+ const x = col * cellW;
729
+ const y = row * cellH;
730
+ results.push(makeLocatedElement(x, y, cellW, cellH, bestSim, bestSim, fp, label, labeledMemories.length));
731
+ }
732
+ }
733
+ results.sort((a, b) => b.confidence - a.confidence);
734
+ return results.slice(0, maxResults);
735
+ }
736
+ // ═══ LOCATE — Find element by fingerprint anywhere on screen ══════════
737
+ /**
738
+ * Locate a specific visual fingerprint on the current screen.
739
+ * Does a focused scan: first near the expected position, then widens.
740
+ *
741
+ * @example
742
+ * ```typescript
743
+ * const fp = agentVision.fingerprint(100, 200, 80, 30);
744
+ * // ... some time later, UI may have reorganized ...
745
+ * const found = agentVision.locate(fp, { near: { x: 100, y: 200 } });
746
+ * if (found) found.click();
747
+ * ```
748
+ */
749
+ locate(target, options) {
750
+ const threshold = options?.threshold ?? this.matchThreshold;
751
+ const radius = options?.searchRadius ?? this.searchRadius;
752
+ const cap = native.screenCapture();
753
+ // Determine region size from fingerprint aspect ratio
754
+ const h = this.defaultRegionSize.h;
755
+ const w = Math.round(h * target.aspect);
756
+ let bestSim = 0;
757
+ let bestX = 0;
758
+ let bestY = 0;
759
+ const stepX = Math.max(8, Math.floor(w * 0.4));
760
+ const stepY = Math.max(8, Math.floor(h * 0.4));
761
+ // If we have a hint, search near it first
762
+ if (options?.near) {
763
+ const sX = Math.max(0, options.near.x - radius);
764
+ const eX = Math.min(cap.width - w, options.near.x + radius);
765
+ const sY = Math.max(0, options.near.y - radius);
766
+ const eY = Math.min(cap.height - h, options.near.y + radius);
767
+ for (let sy = sY; sy < eY; sy += stepY) {
768
+ for (let sx = sX; sx < eX; sx += stepX) {
769
+ const candidate = computeFingerprint(cap.data, cap.width, cap.height, sx, sy, w, h);
770
+ const sim = fingerprintSimilarity(target, candidate);
771
+ if (sim > bestSim) {
772
+ bestSim = sim;
773
+ bestX = sx;
774
+ bestY = sy;
775
+ }
776
+ }
777
+ }
778
+ if (bestSim >= threshold) {
779
+ return makeLocatedElement(bestX, bestY, w, h, bestSim, bestSim, target);
780
+ }
781
+ }
782
+ // Full screen scan with grid
783
+ this.refreshGrid();
784
+ const cellW = Math.floor(cap.width / this.cols);
785
+ const cellH = Math.floor(cap.height / this.rows);
786
+ for (let row = 0; row < this.rows; row++) {
787
+ for (let col = 0; col < this.cols; col++) {
788
+ const idx = row * this.cols + col;
789
+ const fp = this.gridFingerprints[idx];
790
+ if (!fp)
791
+ continue;
792
+ const sim = fingerprintSimilarity(target, fp);
793
+ if (sim > bestSim) {
794
+ bestSim = sim;
795
+ bestX = col * cellW;
796
+ bestY = row * cellH;
797
+ }
798
+ }
799
+ }
800
+ if (bestSim >= threshold) {
801
+ return makeLocatedElement(bestX, bestY, cellW, cellH, bestSim, bestSim, target);
802
+ }
803
+ return null;
804
+ }
805
+ // ═══ FIND BY TEXT — Combine OCR with visual memory ════════════════════
806
+ /**
807
+ * Find an element by text using OCR, then fingerprint it for future recognition.
808
+ * First time: uses OCR. After that: can find it visually even without OCR.
809
+ *
810
+ * @example
811
+ * ```typescript
812
+ * // First call uses OCR to find "Save" text
813
+ * const save = agentVision.findByText('Save');
814
+ * if (save) save.click();
815
+ * // Now agentVision remembers what "Save" looks like visually
816
+ * ```
817
+ */
818
+ findByText(text, options) {
819
+ const shouldRemember = options?.remember !== false;
820
+ // First try: recall from memory (fast, no OCR needed)
821
+ const recalled = this.find(`text:${text}`);
822
+ if (recalled && recalled.isStillPresent()) {
823
+ return recalled;
824
+ }
825
+ // Second try: OCR
826
+ try {
827
+ const found = native.ocrFindText(text);
828
+ if (found) {
829
+ const x = found.x;
830
+ const y = found.y;
831
+ const w = found.width || this.defaultRegionSize.w;
832
+ const h = found.height || this.defaultRegionSize.h;
833
+ const cap = native.screenCapture();
834
+ const fp = computeFingerprint(cap.data, cap.width, cap.height, x, y, w, h);
835
+ if (shouldRemember) {
836
+ this.remember(`text:${text}`, x + Math.floor(w / 2), y + Math.floor(h / 2), { w, h });
837
+ }
838
+ return makeLocatedElement(x, y, w, h, found.confidence ?? 1.0, 1.0, fp, `text:${text}`);
839
+ }
840
+ }
841
+ catch { /* OCR unavailable */ }
842
+ return null;
843
+ }
844
+ // ═══ VERIFY — Check if actions had an effect ══════════════════════════
845
+ /**
846
+ * Click an element and verify the screen changed.
847
+ * Returns the located element with success/failure status.
848
+ *
849
+ * @example
850
+ * ```typescript
851
+ * const result = await agentVision.clickAndVerify('submit-button');
852
+ * if (!result.verified) {
853
+ * // Button didn't respond — try again or escalate
854
+ * }
855
+ * ```
856
+ */
857
+ async clickAndVerify(label, options) {
858
+ const el = this.find(label);
859
+ if (!el)
860
+ return { element: null, verified: false, changePercent: 0 };
861
+ const before = exports.vision.captureReference();
862
+ el.click();
863
+ await new Promise(r => setTimeout(r, options?.timeout ?? 500));
864
+ const diff = exports.vision.diff(before);
865
+ const minChange = options?.minChange ?? 0.3;
866
+ const verified = diff.changePercentage >= minChange;
867
+ // Update memory with outcome
868
+ const mem = [...this.memories].reverse().find(m => m.label === label);
869
+ if (mem) {
870
+ mem.outcome = { screenChangePercent: diff.changePercentage, responseTimeMs: options?.timeout ?? 500 };
871
+ mem.success = verified;
872
+ }
873
+ return { element: el, verified, changePercent: diff.changePercentage };
874
+ }
875
+ /**
876
+ * Check if a specific screen region visually changed since a fingerprint was taken.
877
+ *
878
+ * @example
879
+ * ```typescript
880
+ * const before = agentVision.fingerprint(100, 200, 80, 30);
881
+ * await agent.doSomething();
882
+ * const changed = agentVision.hasChanged(before, 100, 200, 80, 30);
883
+ * ```
884
+ */
885
+ hasChanged(previousFingerprint, x, y, w, h, changeThreshold = 0.15) {
886
+ const current = this.fingerprint(x, y, w, h);
887
+ const sim = fingerprintSimilarity(previousFingerprint, current);
888
+ return sim < (1 - changeThreshold);
889
+ }
890
+ // ═══ PREDICT — Temporal pattern learning & prediction ═════════════════
891
+ /**
892
+ * Predict what will happen next based on learned temporal patterns.
893
+ *
894
+ * @example
895
+ * ```typescript
896
+ * // After clicking "File" many times, the agent learns:
897
+ * // "After clicking file-menu, a dropdown appears"
898
+ * const next = agentVision.predictNext();
899
+ * if (next) {
900
+ * console.log(`Expected: ${next.nextAction} in ${next.expectedDelayMs}ms`);
901
+ * }
902
+ * ```
903
+ */
904
+ predictNext(filterLabel) {
905
+ if (this.recentActions.length === 0)
906
+ return null;
907
+ const last = this.recentActions[this.recentActions.length - 1];
908
+ const prefix = `${last.action}:${last.label || ''}→`;
909
+ let bestKey = null;
910
+ let bestPattern = null;
911
+ let bestCount = 0;
912
+ const entries = Array.from(this.temporalPatterns.entries());
913
+ for (let i = 0; i < entries.length; i++) {
914
+ const key = entries[i][0];
915
+ const pattern = entries[i][1];
916
+ if (!key.startsWith(prefix))
917
+ continue;
918
+ if (filterLabel && !key.includes(filterLabel))
919
+ continue;
920
+ if (pattern.count > bestCount) {
921
+ bestKey = key;
922
+ bestPattern = pattern;
923
+ bestCount = pattern.count;
924
+ }
925
+ }
926
+ if (!bestKey || !bestPattern)
927
+ return null;
928
+ const nextPart = bestKey.split('→')[1] || '';
929
+ const parts = nextPart.split(':');
930
+ return {
931
+ nextAction: parts[0] || nextPart,
932
+ label: parts[1] || undefined,
933
+ confidence: Math.min(1, bestPattern.count / 5),
934
+ expectedDelayMs: bestPattern.avgDelayMs,
935
+ predictedRegion: bestPattern.predictedRegion,
936
+ };
937
+ }
938
+ /**
939
+ * Wait for a predicted event to happen.
940
+ * Uses temporal patterns to know WHEN and WHERE to look.
941
+ *
942
+ * @example
943
+ * ```typescript
944
+ * // Click File menu
945
+ * agentVision.find('file-menu')?.click();
946
+ * // Wait for the dropdown the agent learned usually appears
947
+ * const appeared = await agentVision.waitForPredicted('dropdown', 3000);
948
+ * ```
949
+ */
950
+ async waitForPredicted(label, timeoutMs = 5000) {
951
+ const prediction = this.predictNext(label);
952
+ const pollMs = 100;
953
+ const deadline = Date.now() + timeoutMs;
954
+ // Take baseline for change detection
955
+ const baseline = exports.vision.captureReference(prediction?.predictedRegion
956
+ ? { x: prediction.predictedRegion.x, y: prediction.predictedRegion.y, width: prediction.predictedRegion.w, height: prediction.predictedRegion.h }
957
+ : undefined);
958
+ while (Date.now() < deadline) {
959
+ // Check if something changed in the predicted region
960
+ const diff = exports.vision.diff(baseline);
961
+ if (diff.hasChanges && diff.changePercentage > 0.5) {
962
+ // Something appeared — try to locate the expected element
963
+ if (label) {
964
+ const found = this.find(label);
965
+ if (found)
966
+ return found;
967
+ }
968
+ // Or just return the changed region
969
+ if (diff.changedBounds) {
970
+ const b = diff.changedBounds;
971
+ const cap = native.screenCapture();
972
+ const fp = computeFingerprint(cap.data, cap.width, cap.height, b.x, b.y, b.width, b.height);
973
+ return makeLocatedElement(b.x, b.y, b.width, b.height, 0.8, 0.8, fp, label);
974
+ }
975
+ }
976
+ await new Promise(r => setTimeout(r, pollMs));
977
+ }
978
+ return null;
979
+ }
980
+ /**
981
+ * Get all learned temporal patterns.
982
+ */
983
+ getPatterns() {
984
+ const result = [];
985
+ const entries = Array.from(this.temporalPatterns.entries());
986
+ for (let i = 0; i < entries.length; i++) {
987
+ result.push({ pattern: entries[i][0], count: entries[i][1].count, avgDelayMs: entries[i][1].avgDelayMs });
988
+ }
989
+ return result.sort((a, b) => b.count - a.count);
990
+ }
991
+ // ═══ SCAN — Full screen understanding ═════════════════════════════════
992
+ /**
993
+ * Scan the full screen and return all regions that match ANY remembered element.
994
+ * Gives the agent a complete understanding of "what's on screen that I recognize."
995
+ *
996
+ * @example
997
+ * ```typescript
998
+ * const recognized = agentVision.scan();
999
+ * for (const el of recognized) {
1000
+ * console.log(`Found "${el.label}" at (${el.x},${el.y}) confidence=${el.confidence}`);
1001
+ * }
1002
+ * ```
1003
+ */
1004
+ scan(threshold) {
1005
+ const minSim = threshold ?? this.matchThreshold;
1006
+ this.refreshGrid();
1007
+ const cap = this.lastCapture;
1008
+ const cellW = Math.floor(cap.width / this.cols);
1009
+ const cellH = Math.floor(cap.height / this.rows);
1010
+ // Group memories by label
1011
+ const labelGroups = new Map();
1012
+ for (const mem of this.memories) {
1013
+ if (!mem.label)
1014
+ continue;
1015
+ const group = labelGroups.get(mem.label) ?? [];
1016
+ group.push(mem);
1017
+ labelGroups.set(mem.label, group);
1018
+ }
1019
+ const results = [];
1020
+ for (let row = 0; row < this.rows; row++) {
1021
+ for (let col = 0; col < this.cols; col++) {
1022
+ const idx = row * this.cols + col;
1023
+ const fp = this.gridFingerprints[idx];
1024
+ if (!fp)
1025
+ continue;
1026
+ let bestLabel;
1027
+ let bestSim = 0;
1028
+ let bestCount = 0;
1029
+ const labelEntries = Array.from(labelGroups.entries());
1030
+ for (let li = 0; li < labelEntries.length; li++) {
1031
+ const label = labelEntries[li][0];
1032
+ const mems = labelEntries[li][1];
1033
+ for (const mem of mems) {
1034
+ const sim = fingerprintSimilarity(fp, mem.fingerprint);
1035
+ if (sim > bestSim) {
1036
+ bestSim = sim;
1037
+ bestLabel = label;
1038
+ bestCount = mems.length;
1039
+ }
1040
+ }
1041
+ }
1042
+ if (bestSim < minSim || !bestLabel)
1043
+ continue;
1044
+ results.push(makeLocatedElement(col * cellW, row * cellH, cellW, cellH, bestSim, bestSim, fp, bestLabel, bestCount));
1045
+ }
1046
+ }
1047
+ // Deduplicate: keep only the best match per label per rough area
1048
+ const deduped = [];
1049
+ const seen = new Set();
1050
+ results.sort((a, b) => b.confidence - a.confidence);
1051
+ for (const el of results) {
1052
+ const key = `${el.label}:${Math.floor(el.x / 200)}:${Math.floor(el.y / 200)}`;
1053
+ if (!seen.has(key)) {
1054
+ seen.add(key);
1055
+ deduped.push(el);
1056
+ }
1057
+ }
1058
+ return deduped;
1059
+ }
1060
+ // ═══ PERSISTENCE — Save/load memory across sessions ═══════════════════
1061
+ /**
1062
+ * Export all visual memories as a JSON string.
1063
+ *
1064
+ * @example
1065
+ * ```typescript
1066
+ * const data = agentVision.save();
1067
+ * fs.writeFileSync('agent-memory.json', data);
1068
+ * ```
1069
+ */
1070
+ save() {
1071
+ return JSON.stringify({
1072
+ version: 1,
1073
+ appContext: this.appContext,
1074
+ memories: this.memories.map(m => ({
1075
+ ...m,
1076
+ fingerprint: {
1077
+ ...m.fingerprint,
1078
+ brightnessHist: Array.from(m.fingerprint.brightnessHist),
1079
+ hueHist: Array.from(m.fingerprint.hueHist),
1080
+ },
1081
+ })),
1082
+ temporalPatterns: Array.from(this.temporalPatterns.entries()),
1083
+ });
1084
+ }
1085
+ /**
1086
+ * Load visual memories from a JSON string.
1087
+ *
1088
+ * @example
1089
+ * ```typescript
1090
+ * const data = fs.readFileSync('agent-memory.json', 'utf8');
1091
+ * agentVision.load(data);
1092
+ * // Agent now remembers everything from last session
1093
+ * ```
1094
+ */
1095
+ load(json) {
1096
+ const data = JSON.parse(json);
1097
+ if (data.memories) {
1098
+ for (const m of data.memories) {
1099
+ m.fingerprint.brightnessHist = new Float64Array(m.fingerprint.brightnessHist);
1100
+ m.fingerprint.hueHist = new Float64Array(m.fingerprint.hueHist);
1101
+ this.memories.push(m);
1102
+ }
1103
+ }
1104
+ if (data.temporalPatterns) {
1105
+ for (const [key, val] of data.temporalPatterns) {
1106
+ this.temporalPatterns.set(key, val);
1107
+ }
1108
+ }
1109
+ }
1110
// ═══ STATE ════════════════════════════════════════════════════════════
/** Number of visual memories currently stored (labeled and unlabeled). */
get memoryCount() { return this.memories.length; }
1113
+ /** All unique labels the agent has learned. */
1114
+ get knownLabels() {
1115
+ const labels = new Set();
1116
+ for (const m of this.memories) {
1117
+ if (m.label)
1118
+ labels.add(m.label);
1119
+ }
1120
+ return Array.from(labels);
1121
+ }
1122
+ /** Clear all memories. */
1123
+ reset() {
1124
+ this.memories = [];
1125
+ this.temporalPatterns.clear();
1126
+ this.recentActions = [];
1127
+ this.sequenceCounter = 0;
1128
+ this.gridDirty = true;
1129
+ }
1130
+ /** Set the app context (filters memories by app). */
1131
+ setContext(appContext) {
1132
+ this.appContext = appContext;
1133
+ this.gridDirty = true;
1134
+ }
1135
+ // ═══ INTERNALS ════════════════════════════════════════════════════════
1136
+ refreshGrid() {
1137
+ const now = Date.now();
1138
+ if (!this.gridDirty && now - this.lastGridTime < 200)
1139
+ return;
1140
+ const cap = native.screenCapture();
1141
+ this.lastCapture = cap;
1142
+ const cellW = Math.floor(cap.width / this.cols);
1143
+ const cellH = Math.floor(cap.height / this.rows);
1144
+ this.gridFingerprints = [];
1145
+ for (let row = 0; row < this.rows; row++) {
1146
+ for (let col = 0; col < this.cols; col++) {
1147
+ this.gridFingerprints.push(computeFingerprint(cap.data, cap.width, cap.height, col * cellW, row * cellH, cellW, cellH));
1148
+ }
1149
+ }
1150
+ this.gridDirty = false;
1151
+ this.lastGridTime = now;
1152
+ }
1153
+ trackAction(action, label, x, y) {
1154
+ this.recentActions.push({ action, label, time: Date.now(), pos: { x, y } });
1155
+ // Keep only last 30s
1156
+ const cutoff = Date.now() - 30000;
1157
+ this.recentActions = this.recentActions.filter(a => a.time > cutoff);
1158
+ // Learn temporal patterns from consecutive actions
1159
+ for (let i = 1; i < this.recentActions.length; i++) {
1160
+ const prev = this.recentActions[i - 1];
1161
+ const curr = this.recentActions[i];
1162
+ const key = `${prev.action}:${prev.label || ''}→${curr.action}:${curr.label || ''}`;
1163
+ const delay = curr.time - prev.time;
1164
+ const existing = this.temporalPatterns.get(key);
1165
+ if (existing) {
1166
+ existing.avgDelayMs = (existing.avgDelayMs * existing.count + delay) / (existing.count + 1);
1167
+ existing.count++;
1168
+ existing.predictedRegion = { x: curr.pos.x - 50, y: curr.pos.y - 50, w: 100, h: 100 };
1169
+ }
1170
+ else {
1171
+ this.temporalPatterns.set(key, {
1172
+ avgDelayMs: delay,
1173
+ count: 1,
1174
+ predictedRegion: { x: curr.pos.x - 50, y: curr.pos.y - 50, w: 100, h: 100 },
1175
+ });
1176
+ }
1177
+ }
1178
+ }
1179
+ evictOldMemories() {
1180
+ while (this.memories.length > this.maxMemories) {
1181
+ // Remove oldest non-labeled memory first, then oldest overall
1182
+ const unlabeledIdx = this.memories.findIndex(m => !m.label);
1183
+ if (unlabeledIdx >= 0) {
1184
+ this.memories.splice(unlabeledIdx, 1);
1185
+ }
1186
+ else {
1187
+ this.memories.shift();
1188
+ }
1189
+ }
1190
+ }
1191
+ }
1192
+ exports.AgentVision = AgentVision;
1193
/**
 * Create a new AgentVision instance.
 *
 * @param {object} [config] - Forwarded to the AgentVision constructor.
 * @returns {AgentVision} A fresh instance.
 */
function createAgentVision(config) {
    const instance = new AgentVision(config);
    return instance;
}
1197
+ //# sourceMappingURL=vision.js.map