screenhand 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/.claude/commands/automate.md +28 -0
  2. package/.claude/commands/debug-ui.md +19 -0
  3. package/.claude/commands/screenshot.md +15 -0
  4. package/.github/FUNDING.yml +1 -0
  5. package/.github/ISSUE_TEMPLATE/bug_report.md +27 -0
  6. package/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  7. package/.mcp.json +8 -0
  8. package/DESKTOP_MCP_GUIDE.md +92 -0
  9. package/LICENSE +661 -21
  10. package/README.md +97 -292
  11. package/SECURITY.md +44 -0
  12. package/docs/architecture.md +47 -0
  13. package/install-skills.sh +19 -0
  14. package/mcp-bridge.ts +271 -0
  15. package/mcp-desktop.ts +1221 -0
  16. package/native/macos-bridge/Package.swift +21 -0
  17. package/native/macos-bridge/Sources/AccessibilityBridge.swift +261 -0
  18. package/native/macos-bridge/Sources/AppManagement.swift +129 -0
  19. package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +242 -0
  20. package/native/macos-bridge/Sources/ObserverBridge.swift +120 -0
  21. package/native/macos-bridge/Sources/VisionBridge.swift +80 -0
  22. package/native/macos-bridge/Sources/main.swift +345 -0
  23. package/native/windows-bridge/AppManagement.cs +234 -0
  24. package/native/windows-bridge/InputBridge.cs +436 -0
  25. package/native/windows-bridge/Program.cs +265 -0
  26. package/native/windows-bridge/ScreenCapture.cs +329 -0
  27. package/native/windows-bridge/UIAutomationBridge.cs +571 -0
  28. package/native/windows-bridge/WindowsBridge.csproj +17 -0
  29. package/package.json +3 -14
  30. package/playbooks/devpost.json +186 -0
  31. package/playbooks/instagram.json +41 -0
  32. package/playbooks/instagram_v2.json +201 -0
  33. package/playbooks/x_v1.json +211 -0
  34. package/scripts/devpost-live-loop.mjs +421 -0
  35. package/src/config.ts +30 -0
  36. package/src/index.ts +92 -0
  37. package/src/logging/timeline-logger.ts +55 -0
  38. package/src/mcp/server.ts +449 -0
  39. package/src/memory/recall.ts +191 -0
  40. package/src/memory/research.ts +146 -0
  41. package/src/memory/seeds.ts +123 -0
  42. package/src/memory/session.ts +201 -0
  43. package/src/memory/store.ts +434 -0
  44. package/src/memory/types.ts +69 -0
  45. package/src/native/bridge-client.ts +239 -0
  46. package/src/native/macos-bridge-client.ts +22 -0
  47. package/src/runtime/accessibility-adapter.ts +487 -0
  48. package/src/runtime/app-adapter.ts +169 -0
  49. package/src/runtime/applescript-adapter.ts +376 -0
  50. package/src/runtime/ax-role-map.ts +102 -0
  51. package/src/runtime/browser-adapter.ts +129 -0
  52. package/src/runtime/cdp-chrome-adapter.ts +676 -0
  53. package/src/runtime/composite-adapter.ts +274 -0
  54. package/src/runtime/executor.ts +396 -0
  55. package/src/runtime/locator-cache.ts +33 -0
  56. package/src/runtime/planning-loop.ts +81 -0
  57. package/src/runtime/service.ts +448 -0
  58. package/src/runtime/session-manager.ts +50 -0
  59. package/src/runtime/state-observer.ts +136 -0
  60. package/src/runtime/vision-adapter.ts +297 -0
  61. package/src/types.ts +297 -0
  62. package/tests/bridge-client.test.ts +176 -0
  63. package/tests/browser-stealth.test.ts +210 -0
  64. package/tests/composite-adapter.test.ts +64 -0
  65. package/tests/mcp-server.test.ts +151 -0
  66. package/tests/memory-recall.test.ts +339 -0
  67. package/tests/memory-research.test.ts +159 -0
  68. package/tests/memory-seeds.test.ts +120 -0
  69. package/tests/memory-store.test.ts +392 -0
  70. package/tests/types.test.ts +92 -0
  71. package/tsconfig.check.json +17 -0
  72. package/tsconfig.json +19 -0
  73. package/vitest.config.ts +8 -0
  74. package/dist/config.js +0 -9
  75. package/dist/index.js +0 -55
  76. package/dist/logging/timeline-logger.js +0 -29
  77. package/dist/mcp/mcp-stdio-server.js +0 -284
  78. package/dist/mcp/server.js +0 -347
  79. package/dist/mcp-entry.js +0 -62
  80. package/dist/memory/recall.js +0 -160
  81. package/dist/memory/research.js +0 -98
  82. package/dist/memory/seeds.js +0 -89
  83. package/dist/memory/session.js +0 -161
  84. package/dist/memory/store.js +0 -391
  85. package/dist/memory/types.js +0 -4
  86. package/dist/native/bridge-client.js +0 -173
  87. package/dist/native/macos-bridge-client.js +0 -5
  88. package/dist/runtime/accessibility-adapter.js +0 -377
  89. package/dist/runtime/app-adapter.js +0 -48
  90. package/dist/runtime/applescript-adapter.js +0 -283
  91. package/dist/runtime/ax-role-map.js +0 -80
  92. package/dist/runtime/browser-adapter.js +0 -36
  93. package/dist/runtime/cdp-chrome-adapter.js +0 -505
  94. package/dist/runtime/composite-adapter.js +0 -205
  95. package/dist/runtime/executor.js +0 -250
  96. package/dist/runtime/locator-cache.js +0 -12
  97. package/dist/runtime/planning-loop.js +0 -47
  98. package/dist/runtime/service.js +0 -372
  99. package/dist/runtime/session-manager.js +0 -28
  100. package/dist/runtime/state-observer.js +0 -105
  101. package/dist/runtime/vision-adapter.js +0 -208
  102. package/dist/test-mcp-protocol.js +0 -138
  103. package/dist/types.js +0 -1
@@ -0,0 +1,265 @@
1
+ using System.Text.Json;
2
+ using System.Text.Json.Nodes;
3
+
4
+ namespace WindowsBridge;
5
+
6
/// <summary>
/// JSON-RPC over stdio bridge for Windows native APIs.
/// Reads JSON requests from stdin (one per line), dispatches to the appropriate bridge,
/// and writes JSON responses to stdout (one per line).
/// Mirrors the protocol of the macOS Swift bridge exactly.
/// </summary>
class Program
{
    private static readonly AppManagement _appManagement = new();
    private static readonly UIAutomationBridge _uiAutomation = new();
    private static readonly InputBridge _input = new();
    private static readonly ScreenCapture _screenCapture = new();

    // Serializes writes to stdout so that one response/event line is never interleaved with another.
    private static readonly object _outputLock = new();

    // camelCase maps the PascalCase properties of JsonRpcRequest to "id"/"method"/"params".
    private static readonly JsonSerializerOptions _jsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        WriteIndented = false,
    };

    /// <summary>
    /// Request loop: reads one JSON request per stdin line until end of input and writes exactly
    /// one response line per non-blank request line.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        Console.InputEncoding = System.Text.Encoding.UTF8;
        Console.OutputEncoding = System.Text.Encoding.UTF8;

        string? line;
        while ((line = Console.ReadLine()) != null)
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }

            try
            {
                var request = JsonSerializer.Deserialize<JsonRpcRequest>(line, _jsonOptions);
                if (request == null)
                {
                    WriteError(0, -32700, "Parse error: null request");
                    continue;
                }

                try
                {
                    var result = Dispatch(request.Method, request.Params);
                    WriteResult(request.Id, result);
                }
                catch (Exception ex)
                {
                    // Bridge errors and unexpected failures are reported identically (code -1),
                    // so the loop keeps serving subsequent requests.
                    WriteError(request.Id, -1, ex.Message);
                }
            }
            catch (Exception ex)
            {
                // The request could not be decoded, so its id is unknown; report against id 0.
                WriteError(0, -32700, $"Parse error: {ex.Message}");
            }
        }
    }

    /// <summary>
    /// Routes a method name to the bridge call that implements it.
    /// </summary>
    /// <param name="method">The request's method name, e.g. <c>"cg.mouseClick"</c>.</param>
    /// <param name="p">The request's parameter object, or null when none was sent.</param>
    /// <returns>The value to place in the response's <c>result</c> field.</returns>
    /// <exception cref="BridgeException">
    /// The method is unknown, or a required parameter is missing or cannot be converted.
    /// </exception>
    private static object Dispatch(string method, JsonObject? p)
    {
        return method switch
        {
            // Lifecycle
            "ping" => new Dictionary<string, object>
            {
                ["pong"] = true,
                ["pid"] = Environment.ProcessId,
                ["accessible"] = true, // UI Automation doesn't need special permissions on Windows
            },
            "check_permissions" => new Dictionary<string, object>
            {
                ["trusted"] = true, // No special permissions needed on Windows for UIA
            },

            // App Management
            "app.launch" => _appManagement.LaunchApp(RequiredParam<string>(p, "bundleId")),
            "app.focus" => _appManagement.FocusApp(RequiredParam<string>(p, "bundleId")),
            "app.list" => _appManagement.ListRunningApps(),
            "app.windows" => _appManagement.ListWindows(),
            "app.frontmost" => _appManagement.FrontmostApp(),

            // UI Automation (Accessibility equivalent)
            // Optional value-type parameters are requested as Nullable<T> so that "absent" is
            // distinguishable from 0/false and the ?? default actually applies.
            "ax.findElement" => _uiAutomation.FindElement(
                RequiredParam<int>(p, "pid"),
                Param<string>(p, "role"),
                Param<string>(p, "title"),
                Param<string>(p, "value"),
                Param<string>(p, "identifier"),
                Param<bool?>(p, "exact") ?? true),
            "ax.getElementTree" => _uiAutomation.GetElementTree(
                RequiredParam<int>(p, "pid"),
                Param<int?>(p, "maxDepth") ?? 5),
            "ax.performAction" => _uiAutomation.PerformAction(
                RequiredParam<int>(p, "pid"),
                RequiredParam<int[]>(p, "elementPath"),
                Param<string>(p, "action") ?? "AXPress"),
            "ax.setElementValue" => _uiAutomation.SetElementValue(
                RequiredParam<int>(p, "pid"),
                RequiredParam<int[]>(p, "elementPath"),
                RequiredParam<string>(p, "value")),
            "ax.getElementValue" => _uiAutomation.GetElementValue(
                RequiredParam<int>(p, "pid"),
                RequiredParam<int[]>(p, "elementPath")),
            "ax.menuClick" => _uiAutomation.MenuClick(
                RequiredParam<int>(p, "pid"),
                RequiredParam<string[]>(p, "menuPath")),

            // Observer (stub — Windows UIA events could be added later)
            "observer.start" => new Dictionary<string, object>
            {
                ["ok"] = true,
                ["stub"] = true,
                ["message"] = "UI Automation event observation not yet implemented on Windows",
            },
            "observer.stop" => new Dictionary<string, object>
            {
                ["ok"] = true,
                ["stub"] = true,
                ["message"] = "UI Automation event observation not yet implemented on Windows",
            },

            // Input (CoreGraphics equivalent)
            "cg.mouseClick" => _input.MouseClick(
                RequiredParam<double>(p, "x"),
                RequiredParam<double>(p, "y"),
                Param<string>(p, "button") ?? "left",
                Param<int?>(p, "clickCount") ?? 1),
            "cg.mouseMove" => _input.MouseMove(
                RequiredParam<double>(p, "x"),
                RequiredParam<double>(p, "y")),
            "cg.mouseDrag" => _input.MouseDrag(
                RequiredParam<double>(p, "fromX"),
                RequiredParam<double>(p, "fromY"),
                RequiredParam<double>(p, "toX"),
                RequiredParam<double>(p, "toY")),
            "cg.mouseFlick" => _input.MouseDrag( // Map flick to fast drag on Windows
                RequiredParam<double>(p, "fromX"),
                RequiredParam<double>(p, "fromY"),
                RequiredParam<double>(p, "toX"),
                RequiredParam<double>(p, "toY")),
            "cg.keyCombo" => _input.KeyCombo(RequiredParam<string[]>(p, "keys")),
            "cg.typeText" => _input.TypeText(RequiredParam<string>(p, "text")),
            "cg.scroll" => _input.Scroll(
                RequiredParam<double>(p, "x"),
                RequiredParam<double>(p, "y"),
                Param<int?>(p, "deltaX") ?? 0,
                Param<int?>(p, "deltaY") ?? 0),
            "cg.captureScreen" => _screenCapture.CaptureScreen(
                Param<Dictionary<string, double>>(p, "region")),
            "cg.captureWindow" => _screenCapture.CaptureWindow(
                RequiredParam<int>(p, "windowId")),

            // Vision (OCR)
            "vision.findText" => _screenCapture.FindText(
                RequiredParam<string>(p, "imagePath"),
                Param<string>(p, "searchText")),
            "vision.ocr" => _screenCapture.Ocr(
                RequiredParam<string>(p, "imagePath")),

            _ => throw new BridgeException($"Unknown method: {method}"),
        };
    }

    // Parameter helpers (mirror Swift's param/requiredParam)

    /// <summary>
    /// Attempts to read parameter <paramref name="key"/> as <typeparamref name="T"/>.
    /// </summary>
    /// <remarks>
    /// <c>int</c> and <c>double</c> targets (plain or wrapped in <c>Nullable</c>) accept any JSON
    /// number that fits: integers widen to double, and fractional values are truncated toward
    /// zero for int. Every other type goes through <see cref="JsonSerializer"/>.
    /// </remarks>
    /// <param name="p">Parameter object of the request; may be null.</param>
    /// <param name="key">Property name to look up.</param>
    /// <param name="value">The converted value on success; <c>default</c> otherwise.</param>
    /// <returns>
    /// True when the key is present, non-null, and convertible; false otherwise.
    /// </returns>
    private static bool TryParam<T>(JsonObject? p, string key, out T? value)
    {
        value = default;
        if (p == null || !p.TryGetPropertyValue(key, out var node) || node == null)
        {
            return false;
        }

        // Unwrap Nullable<T> so Param<int?> gets the same numeric coercion as Param<int>.
        // Casting a boxed int/double to T also works when T is the Nullable form.
        var target = Nullable.GetUnderlyingType(typeof(T)) ?? typeof(T);

        if (node is JsonValue scalar)
        {
            if (target == typeof(double))
            {
                if (scalar.TryGetValue<double>(out var asDouble))
                {
                    value = (T)(object)asDouble;
                    return true;
                }
                if (scalar.TryGetValue<long>(out var asLong))
                {
                    value = (T)(object)(double)asLong;
                    return true;
                }
            }
            else if (target == typeof(int))
            {
                if (scalar.TryGetValue<int>(out var asInt))
                {
                    value = (T)(object)asInt;
                    return true;
                }

                // Range guards: an unchecked cast of an out-of-range double/long to int would
                // silently produce a meaningless value.
                if (scalar.TryGetValue<double>(out var asDouble)
                    && asDouble >= int.MinValue && asDouble <= int.MaxValue)
                {
                    value = (T)(object)(int)asDouble;
                    return true;
                }
                if (scalar.TryGetValue<long>(out var asLong)
                    && asLong >= int.MinValue && asLong <= int.MaxValue)
                {
                    value = (T)(object)(int)asLong;
                    return true;
                }
            }
        }

        try
        {
            value = node.Deserialize<T>(_jsonOptions);
            return value != null;
        }
        catch (Exception ex) when (ex is JsonException or NotSupportedException or InvalidOperationException)
        {
            // Wrong JSON shape for T: treat as "not provided" rather than failing the request here.
            value = default;
            return false;
        }
    }

    /// <summary>
    /// Reads an optional parameter.
    /// </summary>
    /// <returns>
    /// The converted value, or <c>default(T)</c> when the key is absent, null, or not convertible.
    /// Request value types as <c>Nullable</c> (e.g. <c>Param&lt;int?&gt;</c>) to be able to tell
    /// "absent" apart from zero.
    /// </returns>
    private static T? Param<T>(JsonObject? p, string key)
    {
        return TryParam<T>(p, key, out var value) ? value : default;
    }

    /// <summary>
    /// Reads a parameter that must be present and convertible to <typeparamref name="T"/>.
    /// </summary>
    /// <exception cref="BridgeException">The key is absent, null, or not convertible.</exception>
    private static T RequiredParam<T>(JsonObject? p, string key)
    {
        if (!TryParam<T>(p, key, out var value) || value == null)
        {
            throw new BridgeException($"Missing required parameter: {key}");
        }

        return value;
    }

    // Output helpers

    /// <summary>Writes a success response: <c>{ id, result, error: null }</c>.</summary>
    private static void WriteResult(int id, object result)
    {
        var response = new Dictionary<string, object?>
        {
            ["id"] = id,
            ["result"] = result,
            ["error"] = null,
        };
        WriteLine(response);
    }

    /// <summary>Writes a failure response: <c>{ id, result: null, error: { code, message } }</c>.</summary>
    private static void WriteError(int id, int code, string message)
    {
        var response = new Dictionary<string, object?>
        {
            ["id"] = id,
            ["result"] = null,
            ["error"] = new Dictionary<string, object> { ["code"] = code, ["message"] = message },
        };
        WriteLine(response);
    }

    /// <summary>
    /// Writes an unsolicited event line: <c>{ id: 0, event: ... }</c>.
    /// </summary>
    /// <param name="eventData">Payload placed under the <c>event</c> key.</param>
    public static void WriteEvent(Dictionary<string, object> eventData)
    {
        var wrapped = new Dictionary<string, object>
        {
            ["id"] = 0,
            ["event"] = eventData,
        };
        WriteLine(wrapped);
    }

    /// <summary>
    /// Serializes <paramref name="obj"/> to a single JSON line on stdout and flushes immediately.
    /// </summary>
    private static void WriteLine(object obj)
    {
        // Serialize outside the lock; only the actual write needs to be exclusive.
        var json = JsonSerializer.Serialize(obj, _jsonOptions);
        lock (_outputLock)
        {
            Console.WriteLine(json);
            Console.Out.Flush();
        }
    }
}
253
+
254
+ // JSON-RPC types
255
/// <summary>
/// One request as read from a single input line: an identifier, a method name, and an
/// optional parameter object.
/// </summary>
class JsonRpcRequest
{
    /// <summary>Request identifier; 0 when the JSON omits it.</summary>
    public int Id { get; set; }

    /// <summary>Name of the method to invoke; empty when the JSON omits it.</summary>
    public string Method { get; set; } = string.Empty;

    /// <summary>Parameter object for the method, or null when none was supplied.</summary>
    public JsonObject? Params { get; set; }
}
261
+
262
/// <summary>
/// Exception type the bridge throws for request-level failures (for example an unknown method
/// or a missing required parameter); its message is what gets reported back to the caller.
/// </summary>
class BridgeException : Exception
{
    /// <summary>Creates the exception with the message to report.</summary>
    /// <param name="message">Human-readable description of the failure.</param>
    public BridgeException(string message)
        : base(message)
    {
    }
}
@@ -0,0 +1,329 @@
1
+ using System.Drawing;
2
+ using System.Drawing.Imaging;
3
+ using System.Runtime.InteropServices;
4
+
5
+ namespace WindowsBridge;
6
+
7
/// <summary>
/// Screenshot capture and OCR.
/// Equivalent to macOS CoreGraphicsBridge (capture) + VisionBridge (OCR).
/// Uses GDI+ for screenshots and Windows.Media.Ocr (driven through a PowerShell child
/// process) for text recognition.
/// </summary>
class ScreenCapture
{
    [DllImport("user32.dll")]
    private static extern bool GetWindowRect(IntPtr hWnd, out RECT lpRect);

    [DllImport("user32.dll")]
    private static extern bool PrintWindow(IntPtr hWnd, IntPtr hdcBlt, uint nFlags);

    [DllImport("user32.dll")]
    private static extern int GetSystemMetrics(int nIndex);

    [StructLayout(LayoutKind.Sequential)]
    private struct RECT
    {
        public int Left, Top, Right, Bottom;
    }

    // GetSystemMetrics indices: primary-monitor size and virtual-screen origin/size.
    private const int SM_CXSCREEN = 0;
    private const int SM_CYSCREEN = 1;
    private const int SM_XVIRTUALSCREEN = 76;
    private const int SM_YVIRTUALSCREEN = 77;
    private const int SM_CXVIRTUALSCREEN = 78;
    private const int SM_CYVIRTUALSCREEN = 79;
    private const uint PW_RENDERFULLCONTENT = 0x00000002;

    // Upper bound on how long the PowerShell OCR child process may run.
    private const int OcrTimeoutMs = 15000;

    // Environment variable that carries the image path to the OCR script.
    // Must match the $env: reference inside OcrScript.
    private const string OcrImagePathVariable = "SCREENHAND_OCR_IMAGE";

    private static readonly string _tempDir = Path.Combine(Path.GetTempPath(), "screenhand");

    // PowerShell program fed to "powershell.exe -Command -" on stdin. The image path is read
    // from the environment instead of being spliced into the source, so no path content can
    // ever be interpreted as script. Blank lines after multi-line constructs are kept as in
    // the original script.
    private const string OcrScript = @"
Add-Type -AssemblyName System.Runtime.WindowsRuntime
$null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation.UniversalApiContract, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.BitmapDecoder, Windows.Foundation.UniversalApiContract, ContentType = WindowsRuntime]
$null = [Windows.Storage.StorageFile, Windows.Foundation.UniversalApiContract, ContentType = WindowsRuntime]

function Await($WinRtTask, $ResultType) {
    $asTask = $WinRtTask.GetType().GetMethod('AsTask', [Type[]]@())
    if ($asTask -eq $null) {
        $asTaskGeneric = [System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object { $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and $_.IsGenericMethod } | Select-Object -First 1
        $asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
        $task = $asTask.Invoke($null, @($WinRtTask))
    } else {
        $task = $asTask.Invoke($WinRtTask, @())
    }
    $task.Wait()
    return $task.Result
}

$file = Await ([Windows.Storage.StorageFile]::GetFileFromPathAsync($env:SCREENHAND_OCR_IMAGE)) ([Windows.Storage.StorageFile])
$stream = Await ($file.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
$decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
$bitmap = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])

$engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromUserProfileLanguages()
$result = Await ($engine.RecognizeAsync($bitmap)) ([Windows.Media.Ocr.OcrResult])

$output = @{
    text = $result.Text
    regions = @()
}

foreach ($line in $result.Lines) {
    foreach ($word in $line.Words) {
        $output.regions += @{
            text = $word.Text
            bounds = @{
                x = $word.BoundingRect.X
                y = $word.BoundingRect.Y
                width = $word.BoundingRect.Width
                height = $word.BoundingRect.Height
            }
        }
    }
}

$output | ConvertTo-Json -Depth 5
";

    /// <summary>
    /// Capture the full screen or a region and save it as a PNG in the temp directory.
    /// </summary>
    /// <param name="region">
    /// Optional rectangle with keys <c>x</c>, <c>y</c>, <c>width</c>, <c>height</c>. Missing
    /// keys default to 0 for the origin and the primary-monitor size for the extent. When null,
    /// the whole virtual screen (all monitors) is captured.
    /// </param>
    /// <returns>A dictionary with <c>path</c>, <c>width</c> and <c>height</c>.</returns>
    /// <exception cref="BridgeException">The resulting capture size is not positive.</exception>
    public Dictionary<string, object> CaptureScreen(Dictionary<string, double>? region)
    {
        int x, y, width, height;

        if (region != null)
        {
            x = (int)region.GetValueOrDefault("x", 0);
            y = (int)region.GetValueOrDefault("y", 0);
            width = (int)region.GetValueOrDefault("width", GetSystemMetrics(SM_CXSCREEN));
            height = (int)region.GetValueOrDefault("height", GetSystemMetrics(SM_CYSCREEN));
        }
        else
        {
            // Capture virtual screen (all monitors)
            x = GetSystemMetrics(SM_XVIRTUALSCREEN);
            y = GetSystemMetrics(SM_YVIRTUALSCREEN);
            width = GetSystemMetrics(SM_CXVIRTUALSCREEN);
            height = GetSystemMetrics(SM_CYVIRTUALSCREEN);

            // Fallback to primary monitor
            if (width == 0 || height == 0)
            {
                x = 0;
                y = 0;
                width = GetSystemMetrics(SM_CXSCREEN);
                height = GetSystemMetrics(SM_CYSCREEN);
            }
        }

        // Bitmap rejects non-positive sizes with an opaque ArgumentException; fail with a
        // message that names the actual problem instead.
        if (width <= 0 || height <= 0)
        {
            throw new BridgeException($"Capture region has invalid dimensions: {width}x{height}");
        }

        using var bitmap = new Bitmap(width, height, PixelFormat.Format32bppArgb);
        using var graphics = Graphics.FromImage(bitmap);
        graphics.CopyFromScreen(x, y, 0, 0, new Size(width, height), CopyPixelOperation.SourceCopy);

        var filePath = SaveCapture(bitmap, "screen");

        return new Dictionary<string, object>
        {
            ["path"] = filePath,
            ["width"] = width,
            ["height"] = height,
        };
    }

    /// <summary>
    /// Capture a specific window by its window handle (passed as windowId) and save it as a PNG.
    /// </summary>
    /// <param name="windowId">The window handle value.</param>
    /// <returns>A dictionary with <c>path</c>, <c>width</c> and <c>height</c>.</returns>
    /// <exception cref="BridgeException">
    /// The window rectangle cannot be read or has a non-positive size.
    /// </exception>
    public Dictionary<string, object> CaptureWindow(int windowId)
    {
        var hWnd = new IntPtr(windowId);
        bool haveRect = GetWindowRect(hWnd, out RECT rect);

        int width = rect.Right - rect.Left;
        int height = rect.Bottom - rect.Top;

        if (!haveRect || width <= 0 || height <= 0)
        {
            throw new BridgeException($"Window {windowId} has invalid dimensions");
        }

        using var bitmap = new Bitmap(width, height, PixelFormat.Format32bppArgb);
        using var graphics = Graphics.FromImage(bitmap);

        // Try PrintWindow first (works for off-screen windows)
        bool success;
        var hdc = graphics.GetHdc();
        try
        {
            success = PrintWindow(hWnd, hdc, PW_RENDERFULLCONTENT);
        }
        finally
        {
            // The Graphics object is unusable until its HDC is released, so release it even
            // if the native call fails unexpectedly.
            graphics.ReleaseHdc(hdc);
        }

        if (!success)
        {
            // Fallback to screen capture of the window area
            graphics.CopyFromScreen(rect.Left, rect.Top, 0, 0,
                new Size(width, height), CopyPixelOperation.SourceCopy);
        }

        var filePath = SaveCapture(bitmap, $"window_{windowId}");

        return new Dictionary<string, object>
        {
            ["path"] = filePath,
            ["width"] = width,
            ["height"] = height,
        };
    }

    /// <summary>
    /// OCR an image file using Windows.Media.Ocr. If recognition fails for any reason the
    /// result is an empty text/regions pair plus an <c>error</c> message and a <c>detail</c>
    /// entry holding the underlying failure.
    /// </summary>
    /// <param name="imagePath">Path of the image to recognize.</param>
    /// <returns>A dictionary with <c>text</c> and <c>regions</c> (and <c>error</c>/<c>detail</c> on failure).</returns>
    /// <exception cref="BridgeException">The image file does not exist.</exception>
    public Dictionary<string, object> Ocr(string imagePath)
    {
        if (!File.Exists(imagePath))
        {
            throw new BridgeException($"Image file not found: {imagePath}");
        }

        try
        {
            return OcrWithWindowsMediaOcr(imagePath);
        }
        catch (Exception ex)
        {
            // Best-effort fallback: keep the established shape and message, but do not hide
            // the real cause (timeout, unreadable image, missing engine, ...).
            return new Dictionary<string, object>
            {
                ["text"] = "",
                ["regions"] = new List<object>(),
                ["error"] = "Windows.Media.Ocr not available. Install Windows 10 1809+ for built-in OCR.",
                ["detail"] = ex.Message,
            };
        }
    }

    /// <summary>
    /// Find text in an image using OCR.
    /// </summary>
    /// <param name="imagePath">Path of the image to recognize.</param>
    /// <param name="searchText">
    /// Text to look for (case-insensitive substring match per recognized word). When null or
    /// empty the full OCR result is returned unchanged.
    /// </param>
    /// <returns>
    /// A dictionary with the full <c>text</c>, the matching regions in <c>matches</c>, and
    /// <c>matchCount</c>; <c>error</c>/<c>detail</c> are carried over when OCR itself failed.
    /// </returns>
    public Dictionary<string, object> FindText(string imagePath, string? searchText)
    {
        var ocrResult = Ocr(imagePath);

        if (string.IsNullOrEmpty(searchText))
        {
            return ocrResult;
        }

        var regions = ocrResult.TryGetValue("regions", out var rawRegions) && rawRegions is List<object> list
            ? list
            : new List<object>();

        var matches = regions
            .OfType<Dictionary<string, object>>()
            .Where(r => r.TryGetValue("text", out var word)
                && word?.ToString()?.Contains(searchText, StringComparison.OrdinalIgnoreCase) == true)
            .ToList();

        var result = new Dictionary<string, object>
        {
            ["text"] = ocrResult["text"],
            ["matches"] = matches,
            ["matchCount"] = matches.Count,
        };

        // Without this, an OCR failure would be indistinguishable from "no match".
        if (ocrResult.TryGetValue("error", out var error))
        {
            result["error"] = error;
        }
        if (ocrResult.TryGetValue("detail", out var detail))
        {
            result["detail"] = detail;
        }

        return result;
    }

    /// <summary>
    /// OCR using Windows.Media.Ocr (available on Windows 10 1809+).
    /// Runs the engine through PowerShell to avoid a compile-time dependency on WinRT.
    /// </summary>
    /// <exception cref="BridgeException">Any failure starting, running, or parsing the OCR run.</exception>
    private Dictionary<string, object> OcrWithWindowsMediaOcr(string imagePath)
    {
        try
        {
            var json = RunOcrScript(imagePath);
            return ParseOcrOutput(json);
        }
        catch (Exception ex)
        {
            throw new BridgeException($"OCR failed: {ex.Message}");
        }
    }

    /// <summary>Runs the OCR script in a PowerShell child process and returns its stdout.</summary>
    private static string RunOcrScript(string imagePath)
    {
        var psi = new System.Diagnostics.ProcessStartInfo
        {
            FileName = "powershell.exe",
            Arguments = "-NoProfile -NonInteractive -Command -",
            UseShellExecute = false,
            RedirectStandardInput = true,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            CreateNoWindow = true,
        };
        psi.Environment[OcrImagePathVariable] = imagePath;

        using var process = System.Diagnostics.Process.Start(psi)
            ?? throw new InvalidOperationException("Could not start powershell.exe");

        // Start draining both pipes before doing anything else: a child that fills the stderr
        // (or stdout) pipe buffer while nobody reads it would block forever.
        var stdoutTask = process.StandardOutput.ReadToEndAsync();
        var stderrTask = process.StandardError.ReadToEndAsync();

        process.StandardInput.Write(OcrScript);
        process.StandardInput.Close();

        if (!process.WaitForExit(OcrTimeoutMs))
        {
            try
            {
                process.Kill(entireProcessTree: true);
            }
            catch (Exception ex) when (ex is InvalidOperationException or System.ComponentModel.Win32Exception)
            {
                // The process exited between the timeout and the kill, or cannot be killed;
                // either way the timeout is what gets reported.
            }

            throw new TimeoutException($"PowerShell OCR did not finish within {OcrTimeoutMs} ms");
        }

        // The process has exited, so both reads complete promptly; blocking here is safe
        // (console process, no synchronization context).
        var output = stdoutTask.GetAwaiter().GetResult();
        var stderr = stderrTask.GetAwaiter().GetResult();

        if (process.ExitCode != 0)
        {
            throw new InvalidOperationException($"PowerShell OCR failed: {stderr}");
        }

        return output;
    }

    /// <summary>Converts the script's JSON output into the bridge's text/regions dictionary.</summary>
    private static Dictionary<string, object> ParseOcrOutput(string json)
    {
        using var document = System.Text.Json.JsonDocument.Parse(json);
        var root = document.RootElement;

        var text = root.GetProperty("text").GetString() ?? "";
        var regions = new List<object>();

        if (root.TryGetProperty("regions", out var regionsElement)
            && regionsElement.ValueKind == System.Text.Json.JsonValueKind.Array)
        {
            foreach (var region in regionsElement.EnumerateArray())
            {
                var bounds = region.GetProperty("bounds");
                regions.Add(new Dictionary<string, object>
                {
                    ["text"] = region.GetProperty("text").GetString() ?? "",
                    ["bounds"] = new Dictionary<string, object>
                    {
                        ["x"] = bounds.GetProperty("x").GetDouble(),
                        ["y"] = bounds.GetProperty("y").GetDouble(),
                        ["width"] = bounds.GetProperty("width").GetDouble(),
                        ["height"] = bounds.GetProperty("height").GetDouble(),
                    },
                });
            }
        }

        return new Dictionary<string, object>
        {
            ["text"] = text,
            ["regions"] = regions,
        };
    }

    /// <summary>
    /// Saves a capture as <c>{prefix}_{unix-ms}.png</c> in the temp directory and returns the path.
    /// </summary>
    private static string SaveCapture(Bitmap bitmap, string prefix)
    {
        // Created on demand (idempotent) rather than once at type initialization, so a temp
        // directory removed while the bridge is running does not break later captures.
        Directory.CreateDirectory(_tempDir);

        var filePath = Path.Combine(_tempDir, $"{prefix}_{DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()}.png");
        bitmap.Save(filePath, ImageFormat.Png);
        return filePath;
    }
}