@leg3ndy/otto-bridge 0.5.5 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -132,7 +132,132 @@ function extractMeaningfulDescriptionTokens(value) {
132
132
  function descriptionLikelyHasTextAnchor(description) {
133
133
  return extractQuotedPhrases(description).length > 0 || extractMeaningfulDescriptionTokens(description).length > 0;
134
134
  }
135
- function findOcrTextMatch(candidates, description) {
135
/**
 * Collapses a group of OCR items into one bounding region.
 *
 * The resulting text is the items' text joined in reading order with runs of
 * whitespace collapsed. The box is the union of the item boxes, with width and
 * height clamped to at least 1. The confidence is the mean of the items'
 * finite numeric confidences, or `undefined` when no item carries one.
 *
 * @param {Array<{text: string, x: number, y: number, width: number, height: number, confidence?: number}>} items
 *   OCR items to merge. The array itself is not mutated.
 * @param {string} kind Label stored on the result (for example "line").
 * @returns {{text: string, x: number, y: number, width: number, height: number, confidence: (number|undefined), kind: string}|null}
 *   The merged region, or `null` when there is nothing to merge.
 */
function regionFromOcrItems(items, kind) {
    if (!items?.length) {
        return null;
    }
    // Words on one text line rarely share the exact same `y`, so ordering by
    // `y` first would scramble them. A "line" is therefore read strictly left
    // to right; every other kind keeps top-to-bottom, then left-to-right.
    const readsLeftToRightOnly = kind === "line";
    const ordered = [...items].sort((left, right) => {
        if (!readsLeftToRightOnly && left.y !== right.y) {
            return left.y - right.y;
        }
        return left.x - right.x;
    });
    // Accumulate the bounding box in a single pass instead of spreading the
    // whole array into Math.min/Math.max (which is limited by argument count).
    let minX = Infinity;
    let minY = Infinity;
    let maxX = -Infinity;
    let maxY = -Infinity;
    for (const item of ordered) {
        minX = Math.min(minX, item.x);
        minY = Math.min(minY, item.y);
        maxX = Math.max(maxX, item.x + item.width);
        maxY = Math.max(maxY, item.y + item.height);
    }
    const confidenceValues = ordered
        .map((item) => Number(item.confidence))
        .filter((value) => Number.isFinite(value));
    const confidence = confidenceValues.length
        ? confidenceValues.reduce((sum, value) => sum + value, 0) / confidenceValues.length
        : undefined;
    return {
        text: ordered.map((item) => item.text).join(" ").replace(/\s+/g, " ").trim(),
        x: minX,
        y: minY,
        width: Math.max(1, maxX - minX),
        height: Math.max(1, maxY - minY),
        confidence,
        kind,
    };
}
164
/**
 * Expands raw OCR word candidates into regions at three granularities.
 *
 * Candidates are ordered by vertical center, grouped into lines, and the lines
 * are grouped into blocks. The result lists block regions first, then line
 * regions, then one "word" region per original candidate, with entries that
 * have empty text or a duplicate (normalized text, rounded x, rounded y, kind)
 * key removed.
 *
 * @param {Array<{text: string, x: number, y: number, width: number, height: number, confidence?: number}>} candidates
 * @returns {Array<object>} Deduplicated regions tagged with `kind` "block", "line" or "word".
 */
function buildStructuredOcrRegions(candidates) {
    if (!candidates.length) {
        return [];
    }
    const centerYOf = (item) => item.y + (item.height / 2);
    const topToBottom = [...candidates].sort((left, right) => {
        const leftCenter = centerYOf(left);
        const rightCenter = centerYOf(right);
        return leftCenter === rightCenter ? left.x - right.x : leftCenter - rightCenter;
    });

    // A candidate joins the current line when its center is close enough to
    // the running average center of that line.
    const sitsOnLine = (members, candidate) => {
        let centerTotal = 0;
        let heightTotal = 0;
        for (const member of members) {
            centerTotal = centerTotal + member.y + (member.height / 2);
            heightTotal = heightTotal + member.height;
        }
        const lineCenter = centerTotal / members.length;
        const tolerance = Math.max(16, (heightTotal / members.length) * 0.75);
        return Math.abs(centerYOf(candidate) - lineCenter) <= tolerance;
    };
    const lineGroups = [];
    for (const candidate of topToBottom) {
        const currentLine = lineGroups[lineGroups.length - 1];
        if (currentLine && sitsOnLine(currentLine, candidate)) {
            currentLine.push(candidate);
        }
        else {
            lineGroups.push([candidate]);
        }
    }

    const lineRegions = [];
    for (const group of lineGroups) {
        group.sort((left, right) => left.x - right.x);
        const lineRegion = regionFromOcrItems(group, "line");
        if (lineRegion) {
            lineRegions.push(lineRegion);
        }
    }

    // A line continues the current block when it follows closely below the
    // previous line and either overlaps it horizontally or starts near the
    // same left edge.
    const continuesBlock = (previous, line) => {
        const verticalGap = line.y - (previous.y + previous.height);
        if (!(verticalGap <= Math.max(22, previous.height * 1.6))) {
            return false;
        }
        const sharedWidth = Math.max(0, Math.min(previous.x + previous.width, line.x + line.width) - Math.max(previous.x, line.x));
        const overlapRatio = sharedWidth / Math.max(previous.width, line.width, 1);
        return overlapRatio >= 0.18 || Math.abs(previous.x - line.x) <= 120;
    };
    const blockGroups = [];
    for (const line of lineRegions) {
        const currentBlock = blockGroups[blockGroups.length - 1];
        if (currentBlock && continuesBlock(currentBlock[currentBlock.length - 1], line)) {
            currentBlock.push(line);
        }
        else {
            blockGroups.push([line]);
        }
    }

    const blockRegions = [];
    for (const group of blockGroups) {
        const left = group.reduce((low, line) => Math.min(low, line.x), Infinity);
        const top = group.reduce((low, line) => Math.min(low, line.y), Infinity);
        const right = group.reduce((high, line) => Math.max(high, line.x + line.width), -Infinity);
        const bottom = group.reduce((high, line) => Math.max(high, line.y + line.height), -Infinity);
        const scores = [];
        for (const line of group) {
            const numeric = Number(line.confidence);
            if (Number.isFinite(numeric)) {
                scores.push(numeric);
            }
        }
        let confidence;
        if (scores.length) {
            confidence = scores.reduce((sum, value) => sum + value, 0) / scores.length;
        }
        const text = group.map((line) => line.text).join(" ").replace(/\s+/g, " ").trim();
        if (!text) {
            continue;
        }
        blockRegions.push({
            text,
            x: left,
            y: top,
            width: Math.max(1, right - left),
            height: Math.max(1, bottom - top),
            confidence,
            kind: "block",
        });
    }

    const wordRegions = candidates.map(({ text, x, y, width, height, confidence }) => ({
        text,
        x,
        y,
        width,
        height,
        confidence,
        kind: "word",
    }));

    // Insertion-ordered map: the first region seen for a key wins.
    const regionsByKey = new Map();
    for (const region of [...blockRegions, ...lineRegions, ...wordRegions]) {
        const key = `${normalizeText(region.text)}|${Math.round(region.x)}|${Math.round(region.y)}|${region.kind}`;
        if (region.text && !regionsByKey.has(key)) {
            regionsByKey.set(key, region);
        }
    }
    return [...regionsByKey.values()];
}
260
+ function findOcrTextMatch(regions, description) {
136
261
  const phrases = extractQuotedPhrases(description);
137
262
  const tokens = extractMeaningfulDescriptionTokens(description);
138
263
  const normalizedDescription = normalizeText(description || "");
@@ -140,10 +265,11 @@ function findOcrTextMatch(candidates, description) {
140
265
  if (!phrases.length && !tokens.length) {
141
266
  return null;
142
267
  }
143
- const scored = candidates
144
- .map((candidate, index) => {
145
- const normalizedText = normalizeText(candidate.text || "");
268
+ const scored = regions
269
+ .map((region, index) => {
270
+ const normalizedText = normalizeText(region.text || "");
146
271
  let score = 0;
272
+ let matchedTokens = 0;
147
273
  for (const phrase of phrases) {
148
274
  if (normalizedText.includes(phrase)) {
149
275
  score += 120;
@@ -152,17 +278,27 @@ function findOcrTextMatch(candidates, description) {
152
278
  for (const token of tokens) {
153
279
  if (normalizedText.includes(token)) {
154
280
  score += 18;
281
+ matchedTokens += 1;
155
282
  }
156
283
  }
284
+ if (tokens.length > 1 && matchedTokens === tokens.length) {
285
+ score += 36;
286
+ }
287
+ if (region.kind === "line") {
288
+ score += 8;
289
+ }
290
+ else if (region.kind === "block") {
291
+ score += 16;
292
+ }
157
293
  if (wantsFirst) {
158
- score += Math.max(0, 24 - Math.round(candidate.y / 60));
294
+ score += Math.max(0, 24 - Math.round(region.y / 60));
159
295
  score += Math.max(0, 12 - index);
160
296
  }
161
- if (candidate.confidence) {
162
- score += Math.round(candidate.confidence * 20);
297
+ if (region.confidence) {
298
+ score += Math.round(region.confidence * 20);
163
299
  }
164
300
  return score > 0 ? {
165
- candidate,
301
+ region,
166
302
  score,
167
303
  } : null;
168
304
  })
@@ -171,10 +307,10 @@ function findOcrTextMatch(candidates, description) {
171
307
  if (right.score !== left.score) {
172
308
  return right.score - left.score;
173
309
  }
174
- if (left.candidate.y !== right.candidate.y) {
175
- return left.candidate.y - right.candidate.y;
310
+ if (left.region.y !== right.region.y) {
311
+ return left.region.y - right.region.y;
176
312
  }
177
- return left.candidate.x - right.candidate.x;
313
+ return left.region.x - right.region.x;
178
314
  });
179
315
  return scored[0] || null;
180
316
  }
@@ -505,6 +641,15 @@ function parseStructuredActions(job) {
505
641
  actions.push({ type: "list_files", path: filePath, limit });
506
642
  continue;
507
643
  }
644
+ if (type === "count_files") {
645
+ const filePath = asString(action.path) || "~";
646
+ const extensions = Array.isArray(action.extensions)
647
+ ? action.extensions.map((item) => asString(item)?.toLowerCase().replace(/^\./, "")).filter(Boolean)
648
+ : undefined;
649
+ const recursive = typeof action.recursive === "boolean" ? action.recursive : undefined;
650
+ actions.push({ type: "count_files", path: filePath, extensions, recursive });
651
+ continue;
652
+ }
508
653
  if (type === "run_shell" || type === "shell" || type === "terminal") {
509
654
  const command = asString(action.command) || asString(action.cmd);
510
655
  const cwd = asString(action.cwd);
@@ -520,6 +665,44 @@ function parseStructuredActions(job) {
520
665
  }
521
666
  continue;
522
667
  }
668
+ if (type === "scroll_view" || type === "scroll" || type === "scroll_page") {
669
+ const rawDirection = (asString(action.direction) || asString(action.dir) || "down").toLowerCase();
670
+ const direction = rawDirection === "up" || rawDirection === "cima" ? "up" : "down";
671
+ const rawAmount = (asString(action.amount) || "").toLowerCase();
672
+ const amount = rawAmount === "small" || rawAmount === "medium" || rawAmount === "large"
673
+ ? rawAmount
674
+ : undefined;
675
+ const rawSteps = Number(action.steps);
676
+ const steps = Number.isFinite(rawSteps) ? Math.max(1, Math.min(Math.round(rawSteps), 6)) : undefined;
677
+ actions.push({
678
+ type: "scroll_view",
679
+ direction,
680
+ amount,
681
+ steps,
682
+ app: asString(action.app) || undefined,
683
+ });
684
+ continue;
685
+ }
686
+ if (type === "whatsapp_send_message") {
687
+ const contact = asString(action.contact) || asString(action.recipient);
688
+ const text = asString(action.text) || asString(action.message);
689
+ if (contact && text) {
690
+ actions.push({ type: "whatsapp_send_message", contact, text });
691
+ }
692
+ continue;
693
+ }
694
+ if (type === "whatsapp_read_chat") {
695
+ const contact = asString(action.contact) || asString(action.recipient);
696
+ const rawLimit = Number(action.limit);
697
+ if (contact) {
698
+ actions.push({
699
+ type: "whatsapp_read_chat",
700
+ contact,
701
+ limit: Number.isFinite(rawLimit) ? Math.max(1, Math.min(Math.round(rawLimit), 30)) : undefined,
702
+ });
703
+ }
704
+ continue;
705
+ }
523
706
  if (type === "click_visual_target" || type === "click_target") {
524
707
  const description = asString(action.description) || asString(action.target);
525
708
  if (description) {
@@ -535,6 +718,19 @@ function parseStructuredActions(job) {
535
718
  }
536
719
  continue;
537
720
  }
721
+ if (type === "drag_visual_target" || type === "drag_target") {
722
+ const sourceDescription = asString(action.source_description) || asString(action.source) || asString(action.from);
723
+ const targetDescription = asString(action.target_description) || asString(action.target) || asString(action.to);
724
+ if (sourceDescription && targetDescription) {
725
+ actions.push({
726
+ type: "drag_visual_target",
727
+ source_description: sourceDescription,
728
+ target_description: targetDescription,
729
+ app: asString(action.app) || undefined,
730
+ });
731
+ }
732
+ continue;
733
+ }
538
734
  }
539
735
  return actions;
540
736
  }
@@ -586,6 +782,23 @@ function deriveActionsFromText(job) {
586
782
  }
587
783
  return [{ type: "set_volume", level }];
588
784
  }
785
+ if (/\b(scroll|rola\w*|role\w*|desce\w*|sobe\w*)\b/i.test(task)) {
786
+ const direction = /\b(sobe\w*|suba\w*|para cima|pra cima|scroll up)\b/i.test(task) ? "up" : "down";
787
+ const amount = /\b(pouco|leve|small)\b/i.test(task)
788
+ ? "small"
789
+ : /\b(muito|bastante|fim|grande|large)\b/i.test(task)
790
+ ? "large"
791
+ : "medium";
792
+ const stepsMatch = task.match(/\b(\d{1,2})\s*(?:x|vezes?)\b/i);
793
+ const steps = stepsMatch?.[1] ? Math.max(1, Math.min(Number(stepsMatch[1]), 6)) : 1;
794
+ return [{
795
+ type: "scroll_view",
796
+ direction,
797
+ amount,
798
+ steps,
799
+ app: detectedApp || undefined,
800
+ }];
801
+ }
589
802
  if ((normalizedTask.includes("leia") || normalizedTask.includes("ler")) && detectedUrl) {
590
803
  return [
591
804
  { type: "open_url", url: detectedUrl, app: detectedApp || "Safari" },
@@ -752,6 +965,18 @@ export class NativeMacOSJobExecutor {
752
965
  completionNotes.push(`Arquivos em ${action.path}:\n${listing}`);
753
966
  continue;
754
967
  }
968
+ if (action.type === "count_files") {
969
+ await reporter.progress(progressPercent, `Contando arquivos em ${action.path}`);
970
+ const counted = await this.countLocalFiles(action.path, action.extensions, action.recursive !== false);
971
+ completionNotes.push(`Encontrei ${counted.total} arquivo${counted.total === 1 ? "" : "s"} ${counted.extensionsLabel} em ${counted.path}.`);
972
+ resultPayload.file_count = {
973
+ total: counted.total,
974
+ path: counted.path,
975
+ extensions: counted.extensions,
976
+ recursive: counted.recursive,
977
+ };
978
+ continue;
979
+ }
755
980
  if (action.type === "run_shell") {
756
981
  await reporter.progress(progressPercent, `Rodando comando local: ${action.command}`);
757
982
  const shellOutput = await this.runShellCommand(action.command, action.cwd);
@@ -764,6 +989,68 @@ export class NativeMacOSJobExecutor {
764
989
  completionNotes.push(`Volume ajustado para ${action.level}% no macOS.`);
765
990
  continue;
766
991
  }
992
+ if (action.type === "scroll_view") {
993
+ const scrollApp = action.app || this.lastActiveApp || await this.getFrontmostAppName();
994
+ if (scrollApp) {
995
+ await reporter.progress(progressPercent, `Trazendo ${scrollApp} para frente antes de rolar a tela`);
996
+ await this.focusApp(scrollApp);
997
+ }
998
+ const directionLabel = action.direction === "up" ? "cima" : "baixo";
999
+ await reporter.progress(progressPercent, `Rolando a tela para ${directionLabel}`);
1000
+ await this.scrollView(action.direction, action.amount, action.steps);
1001
+ resultPayload.last_scroll = {
1002
+ direction: action.direction,
1003
+ amount: action.amount || "medium",
1004
+ steps: action.steps || 1,
1005
+ app: scrollApp || null,
1006
+ };
1007
+ completionNotes.push(`Rolei a tela para ${directionLabel} no macOS.`);
1008
+ continue;
1009
+ }
1010
+ if (action.type === "whatsapp_send_message") {
1011
+ await reporter.progress(progressPercent, `Abrindo a conversa do WhatsApp com ${action.contact}`);
1012
+ await this.focusApp("Safari");
1013
+ await this.ensureWhatsAppWebReady();
1014
+ const selected = await this.selectWhatsAppConversation(action.contact);
1015
+ if (!selected) {
1016
+ throw new Error(`Nao consegui localizar a conversa do WhatsApp com ${action.contact}.`);
1017
+ }
1018
+ await reporter.progress(progressPercent, `Digitando a mensagem para ${action.contact} no WhatsApp`);
1019
+ await this.focusWhatsAppComposer();
1020
+ await this.typeText(action.text);
1021
+ await delay(250);
1022
+ await this.pressShortcut("return");
1023
+ await delay(900);
1024
+ const verification = await this.verifyWhatsAppLastMessage(action.text);
1025
+ if (!verification.ok) {
1026
+ throw new Error(verification.reason || `Nao consegui confirmar o envio da mensagem para ${action.contact} no WhatsApp.`);
1027
+ }
1028
+ resultPayload.whatsapp = {
1029
+ action: "send_message",
1030
+ contact: action.contact,
1031
+ text_preview: clipText(action.text, 180),
1032
+ };
1033
+ completionNotes.push(`Enviei a mensagem no WhatsApp para ${action.contact}.`);
1034
+ continue;
1035
+ }
1036
+ if (action.type === "whatsapp_read_chat") {
1037
+ await reporter.progress(progressPercent, `Abrindo a conversa do WhatsApp com ${action.contact}`);
1038
+ await this.focusApp("Safari");
1039
+ await this.ensureWhatsAppWebReady();
1040
+ const selected = await this.selectWhatsAppConversation(action.contact);
1041
+ if (!selected) {
1042
+ throw new Error(`Nao consegui localizar a conversa do WhatsApp com ${action.contact}.`);
1043
+ }
1044
+ await delay(500);
1045
+ const chat = await this.readWhatsAppVisibleConversation(action.contact, action.limit || 12);
1046
+ resultPayload.whatsapp = {
1047
+ action: "read_chat",
1048
+ contact: action.contact,
1049
+ messages: chat.messages,
1050
+ };
1051
+ completionNotes.push(`Mensagens visiveis no WhatsApp com ${action.contact}:\n${chat.summary}`);
1052
+ continue;
1053
+ }
767
1054
  if (action.type === "click_visual_target") {
768
1055
  const browserApp = await this.resolveLikelyBrowserApp(action.app);
769
1056
  if (browserApp) {
@@ -839,15 +1126,15 @@ export class NativeMacOSJobExecutor {
839
1126
  validated = true;
840
1127
  }
841
1128
  if (validated) {
842
- const candidate = ocrClick.candidate || null;
1129
+ const region = ocrClick.region || null;
843
1130
  resultPayload.last_click = {
844
1131
  strategy: ocrClick.strategy || "local_ocr",
845
1132
  score: ocrClick.score || null,
846
- matched_text: candidate?.text || null,
847
- x: candidate ? candidate.x + (candidate.width / 2) : null,
848
- y: candidate ? candidate.y + (candidate.height / 2) : null,
849
- width: candidate?.width || null,
850
- height: candidate?.height || null,
1133
+ matched_text: region?.text || null,
1134
+ x: region ? region.x + (region.width / 2) : null,
1135
+ y: region ? region.y + (region.height / 2) : null,
1136
+ width: region?.width || null,
1137
+ height: region?.height || null,
851
1138
  };
852
1139
  completionNotes.push(`Localizei e cliquei em ${targetDescription} por OCR local.`);
853
1140
  clickSucceeded = true;
@@ -923,6 +1210,35 @@ export class NativeMacOSJobExecutor {
923
1210
  }
924
1211
  continue;
925
1212
  }
1213
+ if (action.type === "drag_visual_target") {
1214
+ const dragApp = await this.resolveLikelyBrowserApp(action.app);
1215
+ if (dragApp) {
1216
+ await reporter.progress(progressPercent, `Trazendo ${dragApp} para frente antes do arraste`);
1217
+ await this.focusApp(dragApp);
1218
+ }
1219
+ else if (action.app) {
1220
+ await reporter.progress(progressPercent, `Trazendo ${action.app} para frente antes do arraste`);
1221
+ await this.focusApp(action.app);
1222
+ }
1223
+ await reporter.progress(progressPercent, `Capturando a tela para localizar ${action.source_description} e ${action.target_description}`);
1224
+ const screenshotPath = await this.takeScreenshot();
1225
+ const sourcePoint = await this.resolveVisualTargetPoint(job.job_id, screenshotPath, action.source_description, artifacts, "drag_source");
1226
+ const targetPoint = await this.resolveVisualTargetPoint(job.job_id, screenshotPath, action.target_description, artifacts, "drag_target");
1227
+ if (!sourcePoint) {
1228
+ throw new Error(`Nao consegui localizar ${action.source_description} com confianca suficiente para arrastar.`);
1229
+ }
1230
+ if (!targetPoint) {
1231
+ throw new Error(`Nao consegui localizar ${action.target_description} com confianca suficiente para concluir o arraste.`);
1232
+ }
1233
+ await reporter.progress(progressPercent, `Arrastando ${action.source_description} para ${action.target_description}`);
1234
+ await this.dragPoint(sourcePoint.x, sourcePoint.y, targetPoint.x, targetPoint.y);
1235
+ resultPayload.last_drag = {
1236
+ source: sourcePoint,
1237
+ target: targetPoint,
1238
+ };
1239
+ completionNotes.push(`Arrastei ${action.source_description} para ${action.target_description}.`);
1240
+ continue;
1241
+ }
926
1242
  await reporter.progress(progressPercent, `Abrindo ${action.url}${action.app ? ` em ${action.app}` : ""}`);
927
1243
  await this.openUrl(action.url, action.app);
928
1244
  await delay(1200);
@@ -1161,6 +1477,253 @@ end tell
1161
1477
  }
1162
1478
  }
1163
1479
  }
1480
+ async scrollView(direction, amount = "medium", steps = 1) {
1481
+ const clampedSteps = Math.max(1, Math.min(Math.round(steps || 1), 6));
1482
+ const lineDelta = {
1483
+ small: 4,
1484
+ medium: 8,
1485
+ large: 14,
1486
+ }[amount];
1487
+ const signedDelta = direction === "up" ? lineDelta : -lineDelta;
1488
+ const iterations = {
1489
+ small: 1,
1490
+ medium: 2,
1491
+ large: 3,
1492
+ }[amount];
1493
+ const swiftScript = `
1494
+ import ApplicationServices
1495
+ import Foundation
1496
+
1497
+ let wheelDelta: Int32 = ${signedDelta}
1498
+ let stepCount = ${clampedSteps}
1499
+ let iterations = ${iterations}
1500
+
1501
+ for _ in 0..<stepCount {
1502
+ for _ in 0..<iterations {
1503
+ if let event = CGEvent(scrollWheelEvent2Source: nil, units: .line, wheelCount: 1, wheel1: wheelDelta, wheel2: 0, wheel3: 0) {
1504
+ event.post(tap: .cghidEventTap)
1505
+ }
1506
+ usleep(35000)
1507
+ }
1508
+ usleep(85000)
1509
+ }
1510
+ `;
1511
+ try {
1512
+ await this.runCommand("swift", ["-e", swiftScript]);
1513
+ }
1514
+ catch {
1515
+ await this.scrollViewWithPageKeys(direction, clampedSteps);
1516
+ }
1517
+ }
1518
+ async scrollViewWithPageKeys(direction, steps) {
1519
+ const keyCode = direction === "up" ? 116 : 121;
1520
+ const clampedSteps = Math.max(1, Math.min(Math.round(steps || 1), 6));
1521
+ const script = `
1522
+ repeat ${clampedSteps} times
1523
+ tell application "System Events" to key code ${keyCode}
1524
+ delay 0.06
1525
+ end repeat
1526
+ `;
1527
+ await this.runCommand("osascript", ["-e", script]);
1528
+ }
1529
+ async ensureWhatsAppWebReady() {
1530
+ const page = await this.readFrontmostPage("Safari");
1531
+ if (!normalizeComparableUrl(page.url || "").includes("web.whatsapp.com")) {
1532
+ throw new Error("O Safari nao esta aberto no WhatsApp Web.");
1533
+ }
1534
+ }
1535
+ async selectWhatsAppConversation(contact) {
1536
+ const prepared = await this.runSafariJsonScript(`
1537
+ const query = String(__input?.contact || "");
1538
+ const normalize = (value) => String(value || "").normalize("NFD").replace(/[\\u0300-\\u036f]/g, "").toLowerCase().trim();
1539
+ const normalizedQuery = normalize(query);
1540
+
1541
+ function isVisible(element) {
1542
+ if (!(element instanceof HTMLElement)) return false;
1543
+ const rect = element.getBoundingClientRect();
1544
+ if (rect.width < 4 || rect.height < 4) return false;
1545
+ const style = window.getComputedStyle(element);
1546
+ if (style.visibility === "hidden" || style.display === "none" || Number(style.opacity || "1") === 0) return false;
1547
+ return rect.bottom >= 0 && rect.right >= 0 && rect.top <= window.innerHeight && rect.left <= window.innerWidth;
1548
+ }
1549
+
1550
+ function focusAndReplaceContent(element, value) {
1551
+ element.focus();
1552
+ const range = document.createRange();
1553
+ range.selectNodeContents(element);
1554
+ const selection = window.getSelection();
1555
+ selection?.removeAllRanges();
1556
+ selection?.addRange(range);
1557
+ document.execCommand("selectAll", false);
1558
+ document.execCommand("delete", false);
1559
+ document.execCommand("insertText", false, value);
1560
+ element.dispatchEvent(new InputEvent("input", { bubbles: true, data: value, inputType: "insertText" }));
1561
+ }
1562
+
1563
+ const candidates = Array.from(document.querySelectorAll('div[contenteditable="true"][role="textbox"], div[contenteditable="true"][data-tab], [data-testid="chat-list-search"] [contenteditable="true"]'))
1564
+ .filter((node) => node instanceof HTMLElement)
1565
+ .filter((node) => isVisible(node))
1566
+ .map((node) => {
1567
+ const element = node;
1568
+ const rect = element.getBoundingClientRect();
1569
+ const label = normalize(element.getAttribute("aria-label") || element.getAttribute("data-testid") || element.textContent || "");
1570
+ let score = 0;
1571
+ if (rect.left < window.innerWidth * 0.45) score += 30;
1572
+ if (rect.top < 240) score += 30;
1573
+ if (label.includes("search") || label.includes("pesquisar") || label.includes("procure") || label.includes("chat list")) score += 80;
1574
+ if (element.closest('[data-testid="chat-list-search"], header')) score += 25;
1575
+ return { element, score };
1576
+ })
1577
+ .sort((left, right) => right.score - left.score);
1578
+
1579
+ if (!candidates.length) {
1580
+ return { ok: false, reason: "Nao achei o campo de busca do WhatsApp Web." };
1581
+ }
1582
+
1583
+ focusAndReplaceContent(candidates[0].element, query);
1584
+ return { ok: true };
1585
+ `, { contact });
1586
+ if (!prepared?.ok) {
1587
+ return false;
1588
+ }
1589
+ await delay(900);
1590
+ const result = await this.runSafariJsonScript(`
1591
+ const query = String(__input?.contact || "");
1592
+ const normalize = (value) => String(value || "").normalize("NFD").replace(/[\\u0300-\\u036f]/g, "").toLowerCase().trim();
1593
+ const normalizedQuery = normalize(query);
1594
+
1595
+ function isVisible(element) {
1596
+ if (!(element instanceof HTMLElement)) return false;
1597
+ const rect = element.getBoundingClientRect();
1598
+ if (rect.width < 6 || rect.height < 6) return false;
1599
+ const style = window.getComputedStyle(element);
1600
+ if (style.visibility === "hidden" || style.display === "none" || Number(style.opacity || "1") === 0) return false;
1601
+ return rect.bottom >= 0 && rect.right >= 0 && rect.top <= window.innerHeight && rect.left <= window.innerWidth;
1602
+ }
1603
+
1604
+ const titleNodes = Array.from(document.querySelectorAll('span[title], div[title]'))
1605
+ .filter((node) => node instanceof HTMLElement)
1606
+ .filter((node) => isVisible(node))
1607
+ .map((node) => {
1608
+ const text = normalize(node.getAttribute("title") || node.textContent || "");
1609
+ let score = 0;
1610
+ if (text === normalizedQuery) score += 160;
1611
+ if (text.includes(normalizedQuery)) score += 100;
1612
+ if (normalizedQuery.includes(text) && text.length >= 3) score += 50;
1613
+ const container = node.closest('[data-testid="cell-frame-container"], [role="listitem"], [role="gridcell"], div[tabindex]');
1614
+ if (container instanceof HTMLElement && isVisible(container)) score += 20;
1615
+ return { node, container, text, score };
1616
+ })
1617
+ .filter((item) => item.score > 0)
1618
+ .sort((left, right) => right.score - left.score);
1619
+
1620
+ if (!titleNodes.length) {
1621
+ return { clicked: false, reason: "Nao achei uma conversa visivel com esse nome." };
1622
+ }
1623
+
1624
+ const winner = titleNodes[0];
1625
+ const target = winner.container instanceof HTMLElement ? winner.container : winner.node;
1626
+ target.scrollIntoView({ block: "center", inline: "center", behavior: "auto" });
1627
+ target.dispatchEvent(new MouseEvent("mousedown", { bubbles: true, cancelable: true, view: window }));
1628
+ target.dispatchEvent(new MouseEvent("mouseup", { bubbles: true, cancelable: true, view: window }));
1629
+ target.dispatchEvent(new MouseEvent("click", { bubbles: true, cancelable: true, view: window }));
1630
+ if (typeof target.click === "function") {
1631
+ target.click();
1632
+ }
1633
+ return { clicked: true };
1634
+ `, { contact });
1635
+ return Boolean(result?.clicked);
1636
+ }
1637
+ async focusWhatsAppComposer() {
1638
+ const result = await this.runSafariJsonScript(`
1639
+ function isVisible(element) {
1640
+ if (!(element instanceof HTMLElement)) return false;
1641
+ const rect = element.getBoundingClientRect();
1642
+ if (rect.width < 6 || rect.height < 6) return false;
1643
+ const style = window.getComputedStyle(element);
1644
+ if (style.visibility === "hidden" || style.display === "none" || Number(style.opacity || "1") === 0) return false;
1645
+ return rect.bottom >= 0 && rect.right >= 0 && rect.top <= window.innerHeight && rect.left <= window.innerWidth;
1646
+ }
1647
+
1648
+ const candidates = Array.from(document.querySelectorAll('footer div[contenteditable="true"], [data-testid="conversation-compose-box-input"], main footer [contenteditable="true"]'))
1649
+ .filter((node) => node instanceof HTMLElement)
1650
+ .filter((node) => isVisible(node))
1651
+ .sort((left, right) => right.getBoundingClientRect().top - left.getBoundingClientRect().top);
1652
+
1653
+ if (!candidates.length) {
1654
+ return { focused: false, reason: "Nao achei o campo de mensagem do WhatsApp Web." };
1655
+ }
1656
+
1657
+ const composer = candidates[0];
1658
+ composer.focus();
1659
+ composer.click();
1660
+ return { focused: true };
1661
+ `);
1662
+ if (!result?.focused) {
1663
+ throw new Error(result?.reason || "Nao consegui focar o campo de mensagem do WhatsApp Web.");
1664
+ }
1665
+ }
1666
+ async readWhatsAppVisibleConversation(contact, limit) {
1667
+ const result = await this.runSafariJsonScript(`
1668
+ const maxMessages = Number(__input?.limit || 12);
1669
+
1670
+ function isVisible(element) {
1671
+ if (!(element instanceof HTMLElement)) return false;
1672
+ const rect = element.getBoundingClientRect();
1673
+ if (rect.width < 6 || rect.height < 6) return false;
1674
+ const style = window.getComputedStyle(element);
1675
+ if (style.visibility === "hidden" || style.display === "none" || Number(style.opacity || "1") === 0) return false;
1676
+ return rect.bottom >= 0 && rect.right >= 0 && rect.top <= window.innerHeight && rect.left <= window.innerWidth;
1677
+ }
1678
+
1679
+ const containers = Array.from(document.querySelectorAll('[data-testid="msg-container"], div[data-id]'))
1680
+ .filter((node) => node instanceof HTMLElement)
1681
+ .filter((node) => isVisible(node));
1682
+
1683
+ const messages = containers.map((node) => {
1684
+ const element = node;
1685
+ const prePlain = element.querySelector('[data-pre-plain-text]')?.getAttribute('data-pre-plain-text') || "";
1686
+ const authorMatch = prePlain.match(/\\]\\s*([^:]+):/);
1687
+ const author = authorMatch?.[1]?.trim() || (element.getAttribute('data-testid')?.includes('out') ? 'Voce' : 'Contato');
1688
+ const text = (element.innerText || "").trim().replace(/\\n{2,}/g, "\\n");
1689
+ return { author, text };
1690
+ }).filter((item) => item.text);
1691
+
1692
+ return { messages: messages.slice(-maxMessages) };
1693
+ `, { contact, limit });
1694
+ const messages = Array.isArray(result?.messages)
1695
+ ? result.messages
1696
+ .map((item) => ({
1697
+ author: clipText(asString(item.author) || "Contato", 80),
1698
+ text: clipText(asString(item.text) || "", 500),
1699
+ }))
1700
+ .filter((item) => item.text)
1701
+ : [];
1702
+ return {
1703
+ messages,
1704
+ summary: messages.length
1705
+ ? messages.map((item) => `${item.author}: ${item.text}`).join("\n")
1706
+ : `(sem mensagens visiveis na conversa com ${contact})`,
1707
+ };
1708
+ }
1709
+ async verifyWhatsAppLastMessage(expectedText) {
1710
+ const chat = await this.readWhatsAppVisibleConversation("Contato", 6);
1711
+ if (!chat.messages.length) {
1712
+ return {
1713
+ ok: false,
1714
+ reason: "Nao consegui ler as mensagens visiveis apos o envio no WhatsApp.",
1715
+ };
1716
+ }
1717
+ const normalizedExpected = normalizeText(expectedText).slice(0, 60);
1718
+ const matched = chat.messages.some((item) => normalizeText(item.text).includes(normalizedExpected));
1719
+ if (!matched) {
1720
+ return {
1721
+ ok: false,
1722
+ reason: "Nao consegui confirmar visualmente a mensagem enviada no WhatsApp.",
1723
+ };
1724
+ }
1725
+ return { ok: true, reason: "" };
1726
+ }
1164
1727
  async takeScreenshot(targetPath) {
1165
1728
  const artifactsDir = path.join(os.homedir(), ".otto-bridge", "artifacts");
1166
1729
  await mkdir(artifactsDir, { recursive: true });
@@ -1567,6 +2130,54 @@ post(.leftMouseUp)
1567
2130
  `;
1568
2131
  await this.runCommand("swift", ["-e", script, String(Math.round(x)), String(Math.round(y))]);
1569
2132
  }
2133
/**
 * Simulates a left-button mouse drag from (fromX, fromY) to (toX, toY) by
 * running an inline Swift script through this.runCommand("swift", ["-e", ...]).
 *
 * The script posts, in order: a mouseMoved event at the start point, a
 * leftMouseDown there, `steps` leftMouseDragged events linearly interpolated
 * towards the end point, and a leftMouseUp at the end point. `steps` is
 * max(8, distance / 60). The usleep calls pause 100 ms after the move, 90 ms
 * after the press and 35 ms after each drag step. If a CGEvent cannot be
 * created the script writes to stderr and exits with status 1.
 *
 * @param {number} fromX - X coordinate where the drag starts.
 * @param {number} fromY - Y coordinate where the drag starts.
 * @param {number} toX - X coordinate where the drag ends.
 * @param {number} toY - Y coordinate where the drag ends.
 * @returns {Promise<void>} Settles once the awaited runCommand call settles.
 */
+ async dragPoint(fromX, fromY, toX, toY) {
2134
+ const script = `
2135
+ import Cocoa
2136
+ import ApplicationServices
2137
+
2138
+ let fromX = Double(CommandLine.arguments[1]) ?? 0
2139
+ let fromY = Double(CommandLine.arguments[2]) ?? 0
2140
+ let toX = Double(CommandLine.arguments[3]) ?? 0
2141
+ let toY = Double(CommandLine.arguments[4]) ?? 0
2142
+
2143
+ let startPoint = CGPoint(x: fromX, y: fromY)
2144
+ let endPoint = CGPoint(x: toX, y: toY)
2145
+ let steps = max(8, Int(hypot(endPoint.x - startPoint.x, endPoint.y - startPoint.y) / 60.0))
2146
+
2147
+ func post(_ type: CGEventType, at point: CGPoint) {
2148
+ guard let event = CGEvent(mouseEventSource: nil, mouseType: type, mouseCursorPosition: point, mouseButton: .left) else {
2149
+ fputs("failed to create mouse event\\n", stderr)
2150
+ exit(1)
2151
+ }
2152
+ event.post(tap: .cghidEventTap)
2153
+ }
2154
+
2155
+ post(.mouseMoved, at: startPoint)
2156
+ usleep(100000)
2157
+ post(.leftMouseDown, at: startPoint)
2158
+ usleep(90000)
2159
+
2160
+ for step in 1...steps {
2161
+ let progress = Double(step) / Double(steps)
2162
+ let point = CGPoint(
2163
+ x: startPoint.x + ((endPoint.x - startPoint.x) * progress),
2164
+ y: startPoint.y + ((endPoint.y - startPoint.y) * progress)
2165
+ )
2166
+ post(.leftMouseDragged, at: point)
2167
+ usleep(35000)
2168
+ }
2169
+
2170
+ post(.leftMouseUp, at: endPoint)
2171
+ `;
2172
// Coordinates are rounded to integers and passed as CLI arguments; the script reads them back from CommandLine.arguments[1...4].
+ await this.runCommand("swift", [
2173
+ "-e",
2174
+ script,
2175
+ String(Math.round(fromX)),
2176
+ String(Math.round(fromY)),
2177
+ String(Math.round(toX)),
2178
+ String(Math.round(toY)),
2179
+ ]);
2180
+ }
1570
2181
  async runLocalOcr(filePath) {
1571
2182
  const script = `
1572
2183
  import Foundation
@@ -1656,9 +2267,27 @@ if let output = String(data: data, encoding: .utf8) {
1656
2267
  }
1657
2268
  }
1658
2269
  /**
   * Tries to click a described target using only local OCR.
   *
   * Delegates the lookup to resolveLocalOcrAnchor. When no region is returned,
   * nothing is clicked and the anchor's reason/strategy are passed through.
   * Otherwise the centre of the matched region is clicked via clickPoint.
   *
   * @param {string} screenshotPath - Screenshot handed to resolveLocalOcrAnchor.
   * @param {string} description - Description of the element to click.
   * @returns {Promise<object>} `{ clicked: false, reason, strategy }` when no
   *   region was found, else `{ clicked: true, score, region, strategy }`.
   */
  async tryLocalOcrClick(screenshotPath, description) {
1659
- if (!descriptionLikelyHasTextAnchor(description)) {
2270
+ const anchor = await this.resolveLocalOcrAnchor(screenshotPath, description);
2271
+ if (!anchor.region) {
1660
2272
  return {
1661
2273
  clicked: false,
2274
+ reason: anchor.reason,
2275
+ strategy: anchor.strategy,
2276
+ };
2277
+ }
2278
+ const clickX = anchor.region.x + (anchor.region.width / 2);
2279
+ const clickY = anchor.region.y + (anchor.region.height / 2);
2280
+ await this.clickPoint(clickX, clickY);
2281
+ return {
2282
+ clicked: true,
2283
+ score: anchor.score,
2284
+ region: anchor.region,
2285
+ strategy: anchor.strategy,
2286
+ };
2287
+ }
2288
/**
 * Looks for a text region in a screenshot that matches a description, using
 * local OCR, without clicking anything.
 *
 * Returns an object WITHOUT `region` (only `reason` and `strategy`) when:
 *  - the description has no textual anchor ("local_ocr_skipped");
 *  - runLocalOcr yields no candidates ("local_ocr_empty");
 *  - findOcrTextMatch finds nothing or scores below 24 ("local_ocr_no_match").
 * Otherwise returns `{ region, score, strategy }`, where strategy is
 * "structured_local_ocr_block" if region.kind === "block" and "local_ocr"
 * for any other kind.
 *
 * @param {string} screenshotPath - Screenshot file passed to runLocalOcr.
 * @param {string} description - Description of the element to locate.
 * @returns {Promise<object>} See the shapes listed above.
 */
+ async resolveLocalOcrAnchor(screenshotPath, description) {
2289
+ if (!descriptionLikelyHasTextAnchor(description)) {
2290
+ return {
1662
2291
  reason: "A descricao nao traz ancora textual forte para OCR local.",
1663
2292
  strategy: "local_ocr_skipped",
1664
2293
  };
@@ -1666,27 +2295,69 @@ if let output = String(data: data, encoding: .utf8) {
1666
2295
  const candidates = await this.runLocalOcr(screenshotPath);
1667
2296
  if (!candidates.length) {
1668
2297
  return {
1669
- clicked: false,
1670
2298
  reason: "OCR local nao encontrou texto utilizavel na tela.",
1671
2299
  strategy: "local_ocr_empty",
1672
2300
  };
1673
2301
  }
1674
- const match = findOcrTextMatch(candidates, description);
2302
// Matching runs on the regions built by buildStructuredOcrRegions, not on the raw OCR candidates.
+ const regions = buildStructuredOcrRegions(candidates);
2303
+ const match = findOcrTextMatch(regions, description);
1675
2304
  if (!match || match.score < 24) {
1676
2305
  return {
1677
- clicked: false,
1678
2306
  reason: "OCR local nao encontrou texto suficientemente compativel com a descricao.",
1679
2307
  strategy: "local_ocr_no_match",
1680
2308
  };
1681
2309
  }
1682
- const clickX = match.candidate.x + (match.candidate.width / 2);
1683
- const clickY = match.candidate.y + (match.candidate.height / 2);
1684
- await this.clickPoint(clickX, clickY);
1685
2310
  return {
1686
- clicked: true,
2311
+ region: match.region,
1687
2312
  score: match.score,
1688
- candidate: match.candidate,
1689
- strategy: "local_ocr",
2313
+ strategy: match.region.kind === "block" ? "structured_local_ocr_block" : "local_ocr",
2314
+ };
2315
+ }
2316
+ async resolveVisualTargetPoint(jobId, screenshotPath, description, artifacts, purpose) {
2317
+ const ocrAnchor = await this.resolveLocalOcrAnchor(screenshotPath, description);
2318
+ if (ocrAnchor.region) {
2319
+ return {
2320
+ x: ocrAnchor.region.x + (ocrAnchor.region.width / 2),
2321
+ y: ocrAnchor.region.y + (ocrAnchor.region.height / 2),
2322
+ strategy: ocrAnchor.strategy || "local_ocr",
2323
+ matched_text: ocrAnchor.region.text,
2324
+ score: ocrAnchor.score || null,
2325
+ };
2326
+ }
2327
+ const uploadable = await this.buildUploadableImage(screenshotPath);
2328
+ const artifact = await this.uploadArtifactForJob(jobId, uploadable.path, {
2329
+ kind: "screenshot",
2330
+ mimeTypeOverride: uploadable.mimeType,
2331
+ fileNameOverride: uploadable.filename,
2332
+ metadata: {
2333
+ purpose,
2334
+ visible_in_chat: false,
2335
+ target: description,
2336
+ width: uploadable.dimensions?.width || undefined,
2337
+ height: uploadable.dimensions?.height || undefined,
2338
+ original_width: uploadable.originalDimensions?.width || undefined,
2339
+ original_height: uploadable.originalDimensions?.height || undefined,
2340
+ resized_for_upload: uploadable.resized,
2341
+ },
2342
+ });
2343
+ if (!artifact?.storage_path) {
2344
+ return null;
2345
+ }
2346
+ artifacts.push(artifact);
2347
+ const artifactMetadata = artifact.metadata || {};
2348
+ const width = Number(artifactMetadata.width || 0);
2349
+ const height = Number(artifactMetadata.height || 0);
2350
+ const originalWidth = Number(artifactMetadata.original_width || width || 0);
2351
+ const originalHeight = Number(artifactMetadata.original_height || height || 0);
2352
+ const location = await this.locateVisualTarget(jobId, artifact.storage_path, description, width, height, artifact.mime_type);
2353
+ if (!location?.found || typeof location.x !== "number" || typeof location.y !== "number") {
2354
+ return null;
2355
+ }
2356
+ return {
2357
+ x: width > 0 && originalWidth > 0 ? (location.x / width) * originalWidth : location.x,
2358
+ y: height > 0 && originalHeight > 0 ? (location.y / height) * originalHeight : location.y,
2359
+ strategy: "visual_locator",
2360
+ score: typeof location.confidence === "number" ? location.confidence : null,
1690
2361
  };
1691
2362
  }
1692
2363
  async getImageDimensions(filePath) {
@@ -1798,6 +2469,55 @@ if let output = String(data: data, encoding: .utf8) {
1798
2469
  }));
1799
2470
  return items.length > 0 ? items.join("\n") : "(pasta vazia)";
1800
2471
  }
2472
+ async countLocalFiles(directoryPath, extensions, recursive = true) {
2473
+ const resolved = expandUserPath(directoryPath);
2474
+ const normalizedExtensions = Array.from(new Set((extensions || [])
2475
+ .map((extension) => String(extension || "").trim().toLowerCase().replace(/^\./, ""))
2476
+ .filter(Boolean)));
2477
+ const queue = [resolved];
2478
+ let total = 0;
2479
+ while (queue.length > 0) {
2480
+ const current = queue.shift();
2481
+ if (!current)
2482
+ continue;
2483
+ let entries;
2484
+ try {
2485
+ entries = await readdir(current, { withFileTypes: true });
2486
+ }
2487
+ catch {
2488
+ continue;
2489
+ }
2490
+ for (const entry of entries) {
2491
+ const entryPath = path.join(current, entry.name);
2492
+ if (entry.isDirectory()) {
2493
+ if (recursive) {
2494
+ queue.push(entryPath);
2495
+ }
2496
+ continue;
2497
+ }
2498
+ if (!entry.isFile()) {
2499
+ continue;
2500
+ }
2501
+ if (normalizedExtensions.length > 0) {
2502
+ const entryExtension = path.extname(entry.name).toLowerCase().replace(/^\./, "");
2503
+ if (!normalizedExtensions.includes(entryExtension)) {
2504
+ continue;
2505
+ }
2506
+ }
2507
+ total += 1;
2508
+ }
2509
+ }
2510
+ const extensionsLabel = normalizedExtensions.length > 0
2511
+ ? normalizedExtensions.map((extension) => `.${extension}`).join(", ")
2512
+ : "do tipo solicitado";
2513
+ return {
2514
+ total,
2515
+ path: directoryPath,
2516
+ extensions: normalizedExtensions,
2517
+ recursive,
2518
+ extensionsLabel,
2519
+ };
2520
+ }
1801
2521
  async runShellCommand(command, cwd) {
1802
2522
  if (!isSafeShellCommand(command)) {
1803
2523
  throw new Error("Nenhum comando shell foi informado para execucao local.");
@@ -1849,15 +2569,30 @@ if let output = String(data: data, encoding: .utf8) {
1849
2569
  if (action.type === "list_files") {
1850
2570
  return `Arquivos listados em ${action.path}`;
1851
2571
  }
2572
+ if (action.type === "count_files") {
2573
+ return `Arquivos contados em ${action.path}`;
2574
+ }
1852
2575
  if (action.type === "run_shell") {
1853
2576
  return `Comando ${action.command} executado no macOS`;
1854
2577
  }
1855
2578
  if (action.type === "set_volume") {
1856
2579
  return `Volume ajustado para ${action.level}% no macOS`;
1857
2580
  }
2581
+ if (action.type === "scroll_view") {
2582
+ return `Tela rolada para ${action.direction === "up" ? "cima" : "baixo"} no macOS`;
2583
+ }
2584
+ if (action.type === "whatsapp_send_message") {
2585
+ return `Mensagem enviada no WhatsApp para ${action.contact}`;
2586
+ }
2587
+ if (action.type === "whatsapp_read_chat") {
2588
+ return `Conversa do WhatsApp lida com ${action.contact}`;
2589
+ }
1858
2590
  if (action.type === "click_visual_target") {
1859
2591
  return `Clique guiado executado para ${action.description}`;
1860
2592
  }
2593
+ if (action.type === "drag_visual_target") {
2594
+ return `Arraste guiado executado de ${action.source_description} para ${action.target_description}`;
2595
+ }
1861
2596
  const target = humanizeUrl(action.url);
1862
2597
  return `${target} foi aberto${action.app ? ` em ${action.app}` : ""}`;
1863
2598
  }
package/dist/types.js CHANGED
@@ -1,5 +1,5 @@
1
1
  // Exported package identity and default values.
  // BRIDGE_VERSION matches the "version" field of package.json in this release (0.5.8).
  // DEFAULT_POLL_INTERVAL_MS is in milliseconds, per its name.
  export const BRIDGE_CONFIG_VERSION = 1;
2
- export const BRIDGE_VERSION = "0.5.5";
2
+ export const BRIDGE_VERSION = "0.5.8";
3
3
  export const BRIDGE_PACKAGE_NAME = "@leg3ndy/otto-bridge";
4
4
  export const DEFAULT_API_BASE_URL = "http://localhost:8000";
5
5
  export const DEFAULT_POLL_INTERVAL_MS = 3000;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@leg3ndy/otto-bridge",
3
- "version": "0.5.5",
3
+ "version": "0.5.8",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "description": "Local companion for Otto Bridge device pairing and WebSocket runtime.",