@leg3ndy/otto-bridge 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,6 +30,61 @@ const KNOWN_SITES = [
30
30
  { label: "WhatsApp Web", url: "https://web.whatsapp.com", patterns: [/\bwhatsapp\b/i] },
31
31
  { label: "X", url: "https://x.com", patterns: [/\bx\.com\b/i, /\btwitter\b/i, /\bxis\b/i] },
32
32
  ];
33
/**
 * Words (already lowercase and accent-free) that are ignored when a visual
 * target description is split into tokens: they describe generic UI concepts
 * or grammar rather than the text of the element being looked for.
 */
const GENERIC_VISUAL_STOP_WORDS = new Set([
    // Articles
    "o", "a", "os", "as", "um", "uma", "uns", "umas",
    // Prepositions, contractions and connectives
    "de", "da", "do", "das", "dos", "em", "no", "na", "nos", "nas",
    "por", "para", "com", "sem", "que",
    // Generic UI vocabulary
    "visivel", "visiveis", "tela", "pagina", "site", "app", "janela", "aba",
    "botao", "botoes", "link", "item", "resultado", "resultados",
    // Ordinals
    "primeiro", "primeira", "segundo", "segunda", "terceiro", "terceira",
    // Media nouns
    "video", "videos", "musica", "faixa",
    // Action verbs
    "clicar", "clique", "seleciona", "selecionar", "abre", "abrir",
]);
33
88
  function asRecord(value) {
34
89
  return value && typeof value === "object" ? value : {};
35
90
  }
@@ -49,6 +104,80 @@ function normalizeText(value) {
49
104
  function escapeAppleScript(value) {
50
105
  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
51
106
  }
107
/**
 * Produces a URL string suitable for equality comparison: trimmed, parsed by
 * the URL constructor and stripped of its fragment.
 *
 * @param {*} raw - URL-ish value; falsy values are treated as empty.
 * @returns {string} The serialized URL without its hash, the trimmed input
 *   when it cannot be parsed, or "" for empty input.
 */
function normalizeComparableUrl(raw) {
    const candidate = String(raw || "").trim();
    if (candidate === "") {
        return "";
    }
    let url;
    try {
        url = new URL(candidate);
    }
    catch {
        // Not an absolute URL: compare the trimmed text as-is.
        return candidate;
    }
    url.hash = "";
    return url.toString();
}
121
/**
 * Collects the phrases a description places between quotation marks
 * (straight or curly, single or double), 2 to 80 characters long.
 *
 * @param {*} value - Description text; falsy values are treated as empty.
 * @returns {string[]} Each phrase passed through normalizeText and trimmed,
 *   with empty results dropped.
 */
function extractQuotedPhrases(value) {
    const quotedSpan = /["'“”‘’]([^"'“”‘’]{2,80})["'“”‘’]/g;
    const phrases = [];
    for (const match of String(value || "").matchAll(quotedSpan)) {
        const phrase = normalizeText(match[1] || "").trim();
        if (phrase) {
            phrases.push(phrase);
        }
    }
    return phrases;
}
126
/**
 * Splits a description into distinct alphanumeric tokens worth matching
 * against on-screen text.
 *
 * @param {*} value - Description text; falsy values are treated as empty.
 * @returns {string[]} Unique tokens, in first-seen order, that are at least
 *   three characters long and not listed in GENERIC_VISUAL_STOP_WORDS.
 */
function extractMeaningfulDescriptionTokens(value) {
    const unique = new Set();
    for (const piece of normalizeText(value || "").split(/[^a-z0-9]+/)) {
        const token = piece.trim();
        if (token.length < 3 || GENERIC_VISUAL_STOP_WORDS.has(token)) {
            continue;
        }
        unique.add(token);
    }
    return [...unique];
}
132
/**
 * Tells whether a description contains any text that could be searched for
 * on screen: a quoted phrase or at least one meaningful token.
 *
 * @param {*} description - Target description.
 * @returns {boolean}
 */
function descriptionLikelyHasTextAnchor(description) {
    if (extractQuotedPhrases(description).length > 0) {
        return true;
    }
    return extractMeaningfulDescriptionTokens(description).length > 0;
}
135
/**
 * Picks the OCR candidate whose text best matches a target description.
 *
 * Scoring: +120 for every quoted phrase contained in the candidate text and
 * +18 for every meaningful token contained in it. Candidates with at least
 * one such textual hit additionally receive a position bonus when the
 * description asks for the "first" item (higher on screen / earlier in the
 * list scores more) and a bonus proportional to the OCR confidence.
 *
 * Candidates without any textual hit are discarded: position and confidence
 * alone must never be enough to select a click target, otherwise unrelated
 * text near the top of the screen would win.
 *
 * @param {Array<{text?: string, x: number, y: number, confidence?: number}>} candidates
 *   OCR results.
 * @param {*} description - Target description.
 * @returns {{candidate: object, score: number} | null} Best match (ties are
 *   broken top-to-bottom, then left-to-right), or null when the description
 *   has no text anchor or nothing matched.
 */
function findOcrTextMatch(candidates, description) {
    const phrases = extractQuotedPhrases(description);
    const tokens = extractMeaningfulDescriptionTokens(description);
    if (!phrases.length && !tokens.length) {
        return null;
    }
    const wantsFirst = /\b(primeir[ao]?|first)\b/.test(normalizeText(description || ""));
    const scored = [];
    candidates.forEach((candidate, index) => {
        const normalizedText = normalizeText(candidate.text || "");
        let textScore = 0;
        for (const phrase of phrases) {
            if (normalizedText.includes(phrase)) {
                textScore += 120;
            }
        }
        for (const token of tokens) {
            if (normalizedText.includes(token)) {
                textScore += 18;
            }
        }
        if (textScore === 0) {
            // No textual evidence: do not let bonuses promote this candidate.
            return;
        }
        let score = textScore;
        if (wantsFirst) {
            score += Math.max(0, 24 - Math.round(candidate.y / 60));
            score += Math.max(0, 12 - index);
        }
        if (candidate.confidence) {
            score += Math.round(candidate.confidence * 20);
        }
        scored.push({ candidate, score });
    });
    scored.sort((left, right) => {
        if (right.score !== left.score) {
            return right.score - left.score;
        }
        if (left.candidate.y !== right.candidate.y) {
            return left.candidate.y - right.candidate.y;
        }
        return left.candidate.x - right.candidate.x;
    });
    return scored[0] || null;
}
52
181
  function extractTaskText(job) {
53
182
  const payload = asRecord(job.payload);
54
183
  const candidates = [
@@ -159,6 +288,38 @@ function clipText(value, maxLength) {
159
288
  }
160
289
  return `${value.slice(0, maxLength)}...`;
161
290
  }
291
/**
 * Lowercase file extensions (with leading dot) of document formats that are
 * converted to plain text with the `textutil` command instead of being read
 * as raw UTF-8.
 */
const TEXTUTIL_READABLE_EXTENSIONS = new Set([
    ".doc", ".docx", ".odt", ".pages", ".rtf", ".rtfd", ".webarchive",
]);
300
/**
 * Cleans text before it is sent onwards: CRLF pairs become LF, control
 * characters other than tab, LF and CR are removed (DEL included), and the
 * result is trimmed.
 *
 * @param {string} value - Raw text.
 * @returns {string} Cleaned text.
 */
function sanitizeTextForJsonTransport(value) {
    const disallowedControls = /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]/g;
    const unixNewlines = value.split("\r\n").join("\n");
    const printable = unixNewlines.replace(disallowedControls, "");
    return printable.trim();
}
306
/**
 * Heuristically decides whether a byte buffer holds binary data by looking
 * at its first 4096 bytes (or fewer).
 *
 * @param {ArrayLike<number>} buffer - Bytes to inspect.
 * @returns {boolean} true as soon as a NUL byte is seen, or when more than
 *   10% of the sampled bytes are control bytes outside 7..13 (DEL counts as
 *   one); false for an empty buffer.
 */
function isLikelyBinaryBuffer(buffer) {
    const sampleSize = Math.min(buffer.length, 4096);
    if (buffer.length === 0) {
        return false;
    }
    const isSuspicious = (value) => value < 7 || (value > 13 && value < 32) || value === 127;
    let suspiciousCount = 0;
    let offset = 0;
    while (offset < sampleSize) {
        const value = buffer[offset];
        if (value === 0) {
            return true;
        }
        if (isSuspicious(value)) {
            suspiciousCount += 1;
        }
        offset += 1;
    }
    return (suspiciousCount / sampleSize) > 0.1;
}
162
323
  function delay(ms) {
163
324
  return new Promise((resolve) => setTimeout(resolve, ms));
164
325
  }
@@ -451,6 +612,7 @@ export class NativeMacOSJobExecutor {
451
612
  bridgeConfig;
452
613
  cancelledJobs = new Set();
453
614
  activeChild = null;
615
+ lastActiveApp = null;
454
616
  constructor(bridgeConfig) {
455
617
  this.bridgeConfig = bridgeConfig;
456
618
  }
@@ -597,7 +759,12 @@ export class NativeMacOSJobExecutor {
597
759
  continue;
598
760
  }
599
761
  if (action.type === "click_visual_target") {
600
- if (action.app) {
762
+ const browserApp = await this.resolveLikelyBrowserApp(action.app);
763
+ if (browserApp) {
764
+ await reporter.progress(progressPercent, `Trazendo ${browserApp} para frente antes do clique`);
765
+ await this.focusApp(browserApp);
766
+ }
767
+ else if (action.app) {
601
768
  await reporter.progress(progressPercent, `Trazendo ${action.app} para frente antes do clique`);
602
769
  await this.focusApp(action.app);
603
770
  }
@@ -606,8 +773,87 @@ export class NativeMacOSJobExecutor {
606
773
  let lastFailureReason = "";
607
774
  for (let attempt = 0; attempt < targetDescriptions.length; attempt += 1) {
608
775
  const targetDescription = targetDescriptions[attempt];
776
+ const initialBrowserState = browserApp
777
+ ? await this.captureBrowserPageState(browserApp).catch(() => null)
778
+ : null;
779
+ if (browserApp === "Safari") {
780
+ await reporter.progress(progressPercent, `Tentando localizar ${targetDescription} diretamente no Safari`);
781
+ const domClick = await this.trySafariDomClick(targetDescription);
782
+ if (domClick?.clicked) {
783
+ let validated = false;
784
+ let validationReason = "";
785
+ if (action.verification_prompt) {
786
+ const verification = await this.validateVisualClickWithVision(job.job_id, targetDescription, action.verification_prompt, progressPercent, reporter, artifacts, "dom_click_result");
787
+ validated = verification.ok;
788
+ validationReason = verification.reason;
789
+ }
790
+ else {
791
+ const browserValidation = await this.confirmBrowserClick(browserApp, initialBrowserState, targetDescription, domClick.matchedHref || null);
792
+ validated = browserValidation.ok;
793
+ validationReason = browserValidation.reason;
794
+ }
795
+ if (validated) {
796
+ resultPayload.last_click = {
797
+ strategy: domClick.strategy || "safari_dom",
798
+ matched_text: domClick.matchedText || null,
799
+ matched_href: domClick.matchedHref || null,
800
+ score: domClick.score || null,
801
+ total_candidates: domClick.totalCandidates || null,
802
+ };
803
+ completionNotes.push(`Localizei e cliquei em ${targetDescription} diretamente no navegador.`);
804
+ clickSucceeded = true;
805
+ break;
806
+ }
807
+ lastFailureReason = validationReason || `Clique DOM em ${targetDescription} nao alterou a pagina como esperado.`;
808
+ }
809
+ else if (domClick?.reason) {
810
+ lastFailureReason = domClick.reason;
811
+ }
812
+ }
813
+ const visualBeforeState = browserApp
814
+ ? await this.captureBrowserPageState(browserApp).catch(() => initialBrowserState)
815
+ : initialBrowserState;
609
816
  await reporter.progress(progressPercent, `Capturando a tela para localizar ${targetDescription}`);
610
- const screenshotPath = await this.takeScreenshot();
817
+ let screenshotPath = await this.takeScreenshot();
818
+ const ocrClick = await this.tryLocalOcrClick(screenshotPath, targetDescription);
819
+ if (ocrClick.clicked) {
820
+ let validated = false;
821
+ let validationReason = "";
822
+ if (action.verification_prompt) {
823
+ const verification = await this.validateVisualClickWithVision(job.job_id, targetDescription, action.verification_prompt, progressPercent, reporter, artifacts, "local_ocr_click_result");
824
+ validated = verification.ok;
825
+ validationReason = verification.reason;
826
+ }
827
+ else if (browserApp) {
828
+ const browserValidation = await this.confirmBrowserClick(browserApp, visualBeforeState, targetDescription, null);
829
+ validated = browserValidation.ok;
830
+ validationReason = browserValidation.reason;
831
+ }
832
+ else {
833
+ validated = true;
834
+ }
835
+ if (validated) {
836
+ const candidate = ocrClick.candidate || null;
837
+ resultPayload.last_click = {
838
+ strategy: ocrClick.strategy || "local_ocr",
839
+ score: ocrClick.score || null,
840
+ matched_text: candidate?.text || null,
841
+ x: candidate ? candidate.x + (candidate.width / 2) : null,
842
+ y: candidate ? candidate.y + (candidate.height / 2) : null,
843
+ width: candidate?.width || null,
844
+ height: candidate?.height || null,
845
+ };
846
+ completionNotes.push(`Localizei e cliquei em ${targetDescription} por OCR local.`);
847
+ clickSucceeded = true;
848
+ break;
849
+ }
850
+ lastFailureReason = validationReason || `O clique por OCR local em ${targetDescription} nao teve efeito confirmavel.`;
851
+ await reporter.progress(progressPercent, "OCR local nao confirmou o clique; vou tentar visão remota");
852
+ screenshotPath = await this.takeScreenshot();
853
+ }
854
+ else if (ocrClick.reason) {
855
+ lastFailureReason = ocrClick.reason;
856
+ }
611
857
  const uploadable = await this.buildUploadableImage(screenshotPath);
612
858
  const artifact = await this.uploadArtifactForJob(job.job_id, uploadable.path, {
613
859
  kind: "screenshot",
@@ -646,34 +892,20 @@ export class NativeMacOSJobExecutor {
646
892
  ...location,
647
893
  x: scaledX,
648
894
  y: scaledY,
895
+ strategy: "visual_locator",
649
896
  };
650
897
  if (action.verification_prompt) {
651
- await delay(1600);
652
- await reporter.progress(progressPercent, "Validando visualmente se a ação funcionou");
653
- const afterClickPath = await this.takeScreenshot();
654
- const afterClickUpload = await this.buildUploadableImage(afterClickPath);
655
- const afterClickArtifact = await this.uploadArtifactForJob(job.job_id, afterClickUpload.path, {
656
- kind: "screenshot",
657
- mimeTypeOverride: afterClickUpload.mimeType,
658
- fileNameOverride: afterClickUpload.filename,
659
- metadata: {
660
- purpose: "visual_click_result",
661
- visible_in_chat: true,
662
- target: targetDescription,
663
- width: afterClickUpload.dimensions?.width || undefined,
664
- height: afterClickUpload.dimensions?.height || undefined,
665
- original_width: afterClickUpload.originalDimensions?.width || undefined,
666
- original_height: afterClickUpload.originalDimensions?.height || undefined,
667
- resized_for_upload: afterClickUpload.resized,
668
- },
669
- });
670
- if (afterClickArtifact?.storage_path) {
671
- artifacts.push(afterClickArtifact);
672
- const verificationAnswer = await this.analyzeUploadedArtifact(job.job_id, afterClickArtifact.storage_path, action.verification_prompt, afterClickArtifact.mime_type);
673
- if (!looksLikeAffirmativeVisualVerification(verificationAnswer)) {
674
- lastFailureReason = verificationAnswer || `Nao consegui validar visualmente se ${targetDescription} foi acionado.`;
675
- continue;
676
- }
898
+ const verification = await this.validateVisualClickWithVision(job.job_id, targetDescription, action.verification_prompt, progressPercent, reporter, artifacts, "visual_click_result");
899
+ if (!verification.ok) {
900
+ lastFailureReason = verification.reason || `Nao consegui validar visualmente se ${targetDescription} foi acionado.`;
901
+ continue;
902
+ }
903
+ }
904
+ else if (browserApp) {
905
+ const browserValidation = await this.confirmBrowserClick(browserApp, visualBeforeState, targetDescription, null);
906
+ if (!browserValidation.ok) {
907
+ lastFailureReason = browserValidation.reason || `O clique em ${targetDescription} nao alterou a pagina como esperado.`;
908
+ continue;
677
909
  }
678
910
  }
679
911
  completionNotes.push(`Localizei e cliquei em ${targetDescription}.`);
@@ -722,6 +954,7 @@ export class NativeMacOSJobExecutor {
722
954
  if (app) {
723
955
  await this.runCommand("open", ["-a", app, url]);
724
956
  await this.focusApp(app);
957
+ this.lastActiveApp = app;
725
958
  return;
726
959
  }
727
960
  await this.runCommand("open", [url]);
@@ -749,6 +982,118 @@ end tell
749
982
  }
750
983
  async focusApp(app) {
751
984
  await this.runCommand("osascript", ["-e", `tell application "${escapeAppleScript(app)}" to activate`]);
985
+ this.lastActiveApp = app;
986
+ }
987
+ async getFrontmostAppName() {
988
+ try {
989
+ const { stdout } = await this.runCommandCapture("osascript", [
990
+ "-e",
991
+ 'tell application "System Events" to get name of first application process whose frontmost is true',
992
+ ]);
993
+ const app = String(stdout || "").trim();
994
+ return app || null;
995
+ }
996
+ catch {
997
+ return null;
998
+ }
999
+ }
1000
+ async resolveLikelyBrowserApp(preferredApp) {
1001
+ const candidates = [
1002
+ preferredApp || null,
1003
+ this.lastActiveApp,
1004
+ await this.getFrontmostAppName(),
1005
+ ];
1006
+ for (const candidate of candidates) {
1007
+ if (candidate === "Safari") {
1008
+ return candidate;
1009
+ }
1010
+ }
1011
+ return null;
1012
+ }
1013
+ async captureBrowserPageState(app) {
1014
+ if (app !== "Safari") {
1015
+ return null;
1016
+ }
1017
+ const page = await this.readFrontmostPage(app);
1018
+ return {
1019
+ app,
1020
+ title: page.title,
1021
+ url: page.url,
1022
+ text: page.text,
1023
+ };
1024
+ }
1025
+ resolveExpectedBrowserHref(rawHref, baseUrl) {
1026
+ const href = String(rawHref || "").trim();
1027
+ if (!href) {
1028
+ return null;
1029
+ }
1030
+ try {
1031
+ const absolute = baseUrl ? new URL(href, baseUrl).toString() : new URL(href).toString();
1032
+ return normalizeComparableUrl(absolute);
1033
+ }
1034
+ catch {
1035
+ return normalizeComparableUrl(href);
1036
+ }
1037
+ }
1038
+ didBrowserPageStateChange(before, after, targetDescription, matchedHref) {
1039
+ if (!after) {
1040
+ return false;
1041
+ }
1042
+ const beforeUrl = normalizeComparableUrl(before?.url || "");
1043
+ const afterUrl = normalizeComparableUrl(after.url || "");
1044
+ const expectedHref = this.resolveExpectedBrowserHref(matchedHref || null, before?.url || after.url);
1045
+ if (expectedHref && afterUrl) {
1046
+ if (afterUrl === expectedHref || afterUrl.startsWith(expectedHref) || expectedHref.startsWith(afterUrl)) {
1047
+ return true;
1048
+ }
1049
+ }
1050
+ if (beforeUrl && afterUrl && beforeUrl !== afterUrl) {
1051
+ return true;
1052
+ }
1053
+ const normalizedDescription = normalizeText(targetDescription || "");
1054
+ if (normalizedDescription.includes("youtube")
1055
+ || normalizedDescription.includes("video")
1056
+ || normalizedDescription.includes("musica")) {
1057
+ if (afterUrl.includes("youtube.com/watch") || afterUrl.includes("youtube.com/shorts/")) {
1058
+ return true;
1059
+ }
1060
+ }
1061
+ const beforeTitle = normalizeText(before?.title || "");
1062
+ const afterTitle = normalizeText(after.title || "");
1063
+ if (beforeTitle && afterTitle && beforeTitle !== afterTitle) {
1064
+ return true;
1065
+ }
1066
+ const beforeText = normalizeText((before?.text || "").slice(0, 320));
1067
+ const afterText = normalizeText((after.text || "").slice(0, 320));
1068
+ if (beforeText && afterText && beforeText !== afterText) {
1069
+ return true;
1070
+ }
1071
+ return false;
1072
+ }
1073
+ async confirmBrowserClick(app, before, targetDescription, matchedHref) {
1074
+ if (app !== "Safari") {
1075
+ return {
1076
+ ok: true,
1077
+ reason: "",
1078
+ afterState: null,
1079
+ };
1080
+ }
1081
+ for (let attempt = 0; attempt < 4; attempt += 1) {
1082
+ await delay(attempt === 0 ? 900 : 700);
1083
+ const afterState = await this.captureBrowserPageState(app).catch(() => null);
1084
+ if (this.didBrowserPageStateChange(before, afterState, targetDescription, matchedHref)) {
1085
+ return {
1086
+ ok: true,
1087
+ reason: "",
1088
+ afterState,
1089
+ };
1090
+ }
1091
+ }
1092
+ return {
1093
+ ok: false,
1094
+ reason: `O clique em ${targetDescription} nao mudou a pagina do navegador de forma verificavel.`,
1095
+ afterState: null,
1096
+ };
752
1097
  }
753
1098
  async pressShortcut(shortcut) {
754
1099
  const { key, modifiers } = parseShortcut(shortcut);
@@ -833,6 +1178,263 @@ end tell
833
1178
  });
834
1179
  return String(response.answer || "").trim();
835
1180
  }
1181
+ async validateVisualClickWithVision(jobId, targetDescription, verificationPrompt, progressPercent, reporter, artifacts, purpose) {
1182
+ await delay(1600);
1183
+ await reporter.progress(progressPercent, "Validando visualmente se a ação funcionou");
1184
+ const afterClickPath = await this.takeScreenshot();
1185
+ const afterClickUpload = await this.buildUploadableImage(afterClickPath);
1186
+ const afterClickArtifact = await this.uploadArtifactForJob(jobId, afterClickUpload.path, {
1187
+ kind: "screenshot",
1188
+ mimeTypeOverride: afterClickUpload.mimeType,
1189
+ fileNameOverride: afterClickUpload.filename,
1190
+ metadata: {
1191
+ purpose,
1192
+ visible_in_chat: true,
1193
+ target: targetDescription,
1194
+ width: afterClickUpload.dimensions?.width || undefined,
1195
+ height: afterClickUpload.dimensions?.height || undefined,
1196
+ original_width: afterClickUpload.originalDimensions?.width || undefined,
1197
+ original_height: afterClickUpload.originalDimensions?.height || undefined,
1198
+ resized_for_upload: afterClickUpload.resized,
1199
+ },
1200
+ });
1201
+ if (!afterClickArtifact?.storage_path) {
1202
+ return {
1203
+ ok: false,
1204
+ reason: `Nao consegui registrar a tela apos tentar clicar em ${targetDescription}.`,
1205
+ };
1206
+ }
1207
+ artifacts.push(afterClickArtifact);
1208
+ const verificationAnswer = await this.analyzeUploadedArtifact(jobId, afterClickArtifact.storage_path, verificationPrompt, afterClickArtifact.mime_type);
1209
+ if (!looksLikeAffirmativeVisualVerification(verificationAnswer)) {
1210
+ return {
1211
+ ok: false,
1212
+ reason: verificationAnswer || `Nao consegui validar visualmente se ${targetDescription} foi acionado.`,
1213
+ };
1214
+ }
1215
+ return {
1216
+ ok: true,
1217
+ reason: verificationAnswer,
1218
+ };
1219
+ }
1220
+ async runSafariJsonScript(scriptBody, input) {
1221
+ const wrappedScript = `
1222
+ (function(){
1223
+ const __input = ${JSON.stringify(input || null)};
1224
+ try {
1225
+ const __result = (() => {
1226
+ ${scriptBody}
1227
+ })();
1228
+ return JSON.stringify({ ok: true, result: __result === undefined ? null : __result });
1229
+ } catch (error) {
1230
+ return JSON.stringify({
1231
+ ok: false,
1232
+ error: String(error && error.message ? error.message : error)
1233
+ });
1234
+ }
1235
+ })()
1236
+ `;
1237
+ const script = `
1238
+ tell application "Safari"
1239
+ activate
1240
+ if (count of windows) = 0 then error "Safari nao possui janelas abertas."
1241
+ delay 0.2
1242
+ set scriptResult to do JavaScript "${escapeAppleScript(wrappedScript)}" in current tab of front window
1243
+ end tell
1244
+ return scriptResult
1245
+ `;
1246
+ const { stdout } = await this.runCommandCapture("osascript", ["-e", script]);
1247
+ const parsed = JSON.parse(stdout.trim() || "{}");
1248
+ if (parsed.ok !== true) {
1249
+ throw new Error(asString(parsed.error) || "Safari JavaScript execution failed");
1250
+ }
1251
+ return parsed.result;
1252
+ }
1253
+ async trySafariDomClick(description) {
1254
+ try {
1255
+ return await this.runSafariJsonScript(`
1256
+ const rawDescription = String(__input?.description || "");
1257
+ const normalize = (value) => String(value || "")
1258
+ .normalize("NFD")
1259
+ .replace(/[\\u0300-\\u036f]/g, "")
1260
+ .toLowerCase();
1261
+ const normalizedDescription = normalize(rawDescription);
1262
+ const wantsFirst = /\\b(primeir[ao]?|first)\\b/.test(normalizedDescription);
1263
+ const wantsVideo = /\\b(video|videos|musica|faixa|youtube|resultado|watch)\\b/.test(normalizedDescription) || location.hostname.includes("youtube");
1264
+ const stopWords = new Set([
1265
+ "o", "a", "os", "as", "um", "uma", "uns", "umas", "de", "da", "do", "das", "dos",
1266
+ "em", "no", "na", "nos", "nas", "para", "por", "com", "que", "visivel", "visiveis",
1267
+ "visivel", "tela", "pagina", "page", "site", "link", "botao", "botao", "clicar",
1268
+ "clique", "seleciona", "selecionar", "resultado", "resultados"
1269
+ ]);
1270
+ const quotedPhrases = Array.from(rawDescription.matchAll(/["'“”‘’]([^"'“”‘’]{2,80})["'“”‘’]/g))
1271
+ .map((match) => normalize(match[1]));
1272
+ const tokens = Array.from(new Set(
1273
+ normalizedDescription
1274
+ .split(/[^a-z0-9]+/)
1275
+ .filter((token) => token.length >= 3 && !stopWords.has(token))
1276
+ ));
1277
+
1278
+ const candidateSelectors = location.hostname.includes("youtube")
1279
+ ? [
1280
+ "ytd-video-renderer a#video-title",
1281
+ "ytd-video-renderer ytd-thumbnail a",
1282
+ "ytd-video-renderer a#thumbnail",
1283
+ "ytd-rich-item-renderer a#video-title-link",
1284
+ "ytd-rich-item-renderer a#video-title",
1285
+ "ytd-rich-grid-media a#video-title-link",
1286
+ "a#video-title",
1287
+ "a[href*='/watch']",
1288
+ "button",
1289
+ "[role='button']",
1290
+ "[role='link']"
1291
+ ]
1292
+ : [
1293
+ "a[href]",
1294
+ "button",
1295
+ "[role='button']",
1296
+ "[role='link']",
1297
+ "input[type='button']",
1298
+ "input[type='submit']"
1299
+ ];
1300
+
1301
+ const seen = new Set();
1302
+ const candidates = [];
1303
+
1304
+ function isVisible(element) {
1305
+ if (!(element instanceof Element)) return false;
1306
+ const rect = element.getBoundingClientRect();
1307
+ if (rect.width < 4 || rect.height < 4) return false;
1308
+ const style = window.getComputedStyle(element);
1309
+ if (style.visibility === "hidden" || style.display === "none" || Number(style.opacity || "1") === 0) return false;
1310
+ return rect.bottom >= 0 && rect.right >= 0 && rect.top <= window.innerHeight && rect.left <= window.innerWidth;
1311
+ }
1312
+
1313
+ function deriveText(element) {
1314
+ const ownText = [
1315
+ element.innerText,
1316
+ element.textContent,
1317
+ element.getAttribute("aria-label"),
1318
+ element.getAttribute("title"),
1319
+ element.getAttribute("alt"),
1320
+ ].find((value) => typeof value === "string" && value.trim());
1321
+ if (ownText && ownText.trim()) return ownText.trim();
1322
+
1323
+ const richVideo = element.closest("ytd-video-renderer, ytd-rich-item-renderer, ytd-rich-grid-media, ytmusic-responsive-list-item-renderer");
1324
+ if (richVideo && richVideo instanceof HTMLElement && richVideo.innerText.trim()) {
1325
+ return richVideo.innerText.trim();
1326
+ }
1327
+
1328
+ return "";
1329
+ }
1330
+
1331
+ function scoreCandidate(element, rank) {
1332
+ const text = deriveText(element);
1333
+ const href = element instanceof HTMLAnchorElement
1334
+ ? (element.href || "")
1335
+ : (element.getAttribute("href") || "");
1336
+ const normalizedText = normalize(text);
1337
+ const normalizedHref = normalize(href);
1338
+ let score = 0;
1339
+
1340
+ if (wantsFirst) score += Math.max(0, 40 - rank);
1341
+ if (wantsVideo && normalizedHref.includes("/watch")) score += 30;
1342
+ if (location.hostname.includes("youtube") && element.closest("ytd-video-renderer, ytd-rich-item-renderer, ytd-rich-grid-media")) score += 20;
1343
+ if (element.id === "video-title") score += 12;
1344
+ if (!normalizedText && normalizedHref.includes("/watch")) score += 8;
1345
+
1346
+ for (const phrase of quotedPhrases) {
1347
+ if (!phrase) continue;
1348
+ if (normalizedText.includes(phrase)) score += 120;
1349
+ if (normalizedHref.includes(phrase)) score += 40;
1350
+ }
1351
+
1352
+ for (const token of tokens) {
1353
+ if (normalizedText.includes(token)) score += 18;
1354
+ if (normalizedHref.includes(token)) score += 8;
1355
+ }
1356
+
1357
+ return {
1358
+ element,
1359
+ text,
1360
+ href,
1361
+ score,
1362
+ rank,
1363
+ };
1364
+ }
1365
+
1366
+ for (const selector of candidateSelectors) {
1367
+ const nodes = document.querySelectorAll(selector);
1368
+ for (const node of nodes) {
1369
+ if (!(node instanceof HTMLElement || node instanceof HTMLAnchorElement)) continue;
1370
+ if (!isVisible(node)) continue;
1371
+ const key = [
1372
+ node.tagName,
1373
+ node.id || "",
1374
+ node.getAttribute("href") || "",
1375
+ deriveText(node).slice(0, 120),
1376
+ ].join("|");
1377
+ if (seen.has(key)) continue;
1378
+ seen.add(key);
1379
+ candidates.push(scoreCandidate(node, candidates.length));
1380
+ }
1381
+ }
1382
+
1383
+ const ranked = candidates
1384
+ .filter((candidate) => candidate.score > 0 || (wantsFirst && normalize(candidate.href).includes("/watch")))
1385
+ .sort((left, right) => right.score - left.score || left.rank - right.rank);
1386
+
1387
+ if (!ranked.length) {
1388
+ return {
1389
+ clicked: false,
1390
+ reason: "Nenhum elemento clicavel no DOM combinou com a descricao atual.",
1391
+ totalCandidates: candidates.length,
1392
+ strategy: "safari_dom",
1393
+ };
1394
+ }
1395
+
1396
+ const winner = ranked[0];
1397
+ winner.element.scrollIntoView({ block: "center", inline: "center", behavior: "auto" });
1398
+ const rect = winner.element.getBoundingClientRect();
1399
+ for (const eventName of ["mouseover", "mousedown", "mouseup", "click"]) {
1400
+ winner.element.dispatchEvent(new MouseEvent(eventName, {
1401
+ bubbles: true,
1402
+ cancelable: true,
1403
+ view: window,
1404
+ clientX: rect.left + (rect.width / 2),
1405
+ clientY: rect.top + (rect.height / 2),
1406
+ }));
1407
+ }
1408
+ if (typeof winner.element.click === "function") {
1409
+ winner.element.click();
1410
+ }
1411
+
1412
+ return {
1413
+ clicked: true,
1414
+ matchedText: String(winner.text || "").slice(0, 180),
1415
+ matchedHref: winner.href || "",
1416
+ score: winner.score,
1417
+ totalCandidates: candidates.length,
1418
+ strategy: "safari_dom",
1419
+ };
1420
+ `, { description });
1421
+ }
1422
+ catch (error) {
1423
+ const detail = error instanceof Error ? error.message : String(error);
1424
+ if (detail.toLowerCase().includes("allow javascript from apple events")) {
1425
+ return {
1426
+ clicked: false,
1427
+ reason: "Safari ainda bloqueia JavaScript por Apple Events, entao o Otto Bridge caiu para o modo visual.",
1428
+ strategy: "safari_dom_blocked",
1429
+ };
1430
+ }
1431
+ return {
1432
+ clicked: false,
1433
+ reason: detail || "Falha ao tentar clicar via DOM no Safari.",
1434
+ strategy: "safari_dom_failed",
1435
+ };
1436
+ }
1437
+ }
836
1438
  async readFrontmostPage(app) {
837
1439
  const targetApp = app || "Safari";
838
1440
  if (targetApp !== "Safari") {
@@ -921,6 +1523,128 @@ post(.leftMouseUp)
921
1523
  `;
922
1524
  await this.runCommand("swift", ["-e", script, String(Math.round(x)), String(Math.round(y))]);
923
1525
  }
1526
+ async runLocalOcr(filePath) {
1527
+ const script = `
1528
+ import Foundation
1529
+ import Vision
1530
+ import ImageIO
1531
+ import CoreGraphics
1532
+
1533
+ let fileURL = URL(fileURLWithPath: CommandLine.arguments[1])
1534
+ guard let source = CGImageSourceCreateWithURL(fileURL as CFURL, nil),
1535
+ let image = CGImageSourceCreateImageAtIndex(source, 0, nil) else {
1536
+ fputs("failed to load image\\n", stderr)
1537
+ exit(1)
1538
+ }
1539
+
1540
+ let width = CGFloat(image.width)
1541
+ let height = CGFloat(image.height)
1542
+ var items: [[String: Any]] = []
1543
+
1544
+ let request = VNRecognizeTextRequest()
1545
+ request.recognitionLevel = .accurate
1546
+ request.usesLanguageCorrection = true
1547
+ request.recognitionLanguages = ["pt-BR", "en-US"]
1548
+ request.minimumTextHeight = 0.012
1549
+
1550
+ let handler = VNImageRequestHandler(cgImage: image, options: [:])
1551
+ try handler.perform([request])
1552
+
1553
+ let observations = request.results ?? []
1554
+ for observation in observations {
1555
+ guard let candidate = observation.topCandidates(1).first else { continue }
1556
+ let text = candidate.string.trimmingCharacters(in: .whitespacesAndNewlines)
1557
+ if text.isEmpty { continue }
1558
+
1559
+ let box = observation.boundingBox
1560
+ let x = box.origin.x * width
1561
+ let y = (1.0 - box.origin.y - box.size.height) * height
1562
+ let w = box.size.width * width
1563
+ let h = box.size.height * height
1564
+
1565
+ items.append([
1566
+ "text": text,
1567
+ "x": Int(round(x)),
1568
+ "y": Int(round(y)),
1569
+ "width": Int(round(w)),
1570
+ "height": Int(round(h)),
1571
+ "confidence": candidate.confidence
1572
+ ])
1573
+ }
1574
+
1575
+ let payload: [String: Any] = ["items": items]
1576
+ let data = try JSONSerialization.data(withJSONObject: payload, options: [])
1577
+ if let output = String(data: data, encoding: .utf8) {
1578
+ print(output)
1579
+ }
1580
+ `;
1581
+ try {
1582
+ const { stdout } = await this.runCommandCapture("swift", ["-e", script, filePath]);
1583
+ const parsed = JSON.parse(stdout.trim() || "{}");
1584
+ const items = Array.isArray(parsed.items) ? parsed.items : [];
1585
+ return items
1586
+ .map((item) => {
1587
+ const row = asRecord(item);
1588
+ const text = asString(row.text);
1589
+ const x = Number(row.x);
1590
+ const y = Number(row.y);
1591
+ const width = Number(row.width);
1592
+ const height = Number(row.height);
1593
+ const confidence = Number(row.confidence);
1594
+ if (!text || !Number.isFinite(x) || !Number.isFinite(y) || !Number.isFinite(width) || !Number.isFinite(height)) {
1595
+ return null;
1596
+ }
1597
+ return {
1598
+ text,
1599
+ x,
1600
+ y,
1601
+ width,
1602
+ height,
1603
+ confidence: Number.isFinite(confidence) ? confidence : undefined,
1604
+ };
1605
+ })
1606
+ .filter(Boolean);
1607
+ }
1608
+ catch (error) {
1609
+ const detail = error instanceof Error ? error.message : String(error);
1610
+ console.warn(`[otto-bridge] local ocr failed=${detail}`);
1611
+ return [];
1612
+ }
1613
+ }
1614
+ async tryLocalOcrClick(screenshotPath, description) {
1615
+ if (!descriptionLikelyHasTextAnchor(description)) {
1616
+ return {
1617
+ clicked: false,
1618
+ reason: "A descricao nao traz ancora textual forte para OCR local.",
1619
+ strategy: "local_ocr_skipped",
1620
+ };
1621
+ }
1622
+ const candidates = await this.runLocalOcr(screenshotPath);
1623
+ if (!candidates.length) {
1624
+ return {
1625
+ clicked: false,
1626
+ reason: "OCR local nao encontrou texto utilizavel na tela.",
1627
+ strategy: "local_ocr_empty",
1628
+ };
1629
+ }
1630
+ const match = findOcrTextMatch(candidates, description);
1631
+ if (!match || match.score < 24) {
1632
+ return {
1633
+ clicked: false,
1634
+ reason: "OCR local nao encontrou texto suficientemente compativel com a descricao.",
1635
+ strategy: "local_ocr_no_match",
1636
+ };
1637
+ }
1638
+ const clickX = match.candidate.x + (match.candidate.width / 2);
1639
+ const clickY = match.candidate.y + (match.candidate.height / 2);
1640
+ await this.clickPoint(clickX, clickY);
1641
+ return {
1642
+ clicked: true,
1643
+ score: match.score,
1644
+ candidate: match.candidate,
1645
+ strategy: "local_ocr",
1646
+ };
1647
+ }
924
1648
  async getImageDimensions(filePath) {
925
1649
  try {
926
1650
  const { stdout } = await this.runCommandCapture("sips", ["-g", "pixelWidth", "-g", "pixelHeight", filePath]);
@@ -993,8 +1717,25 @@ post(.leftMouseUp)
993
1717
  }
994
1718
  async readLocalFile(filePath, maxChars = 4000) {
995
1719
  const resolved = expandUserPath(filePath);
996
- const content = await readFile(resolved, "utf8");
997
- return clipText(content.trim() || "(arquivo vazio)", maxChars);
1720
+ const extension = path.extname(resolved).toLowerCase();
1721
+ if (TEXTUTIL_READABLE_EXTENSIONS.has(extension)) {
1722
+ const { stdout } = await this.runCommandCapture("textutil", [
1723
+ "-convert",
1724
+ "txt",
1725
+ "-stdout",
1726
+ resolved,
1727
+ ]);
1728
+ const content = sanitizeTextForJsonTransport(stdout);
1729
+ return clipText(content || "(arquivo sem texto legivel)", maxChars);
1730
+ }
1731
+ const raw = await readFile(resolved);
1732
+ if (isLikelyBinaryBuffer(raw)) {
1733
+ const filename = path.basename(resolved);
1734
+ const detectedType = extension || "binario";
1735
+ return clipText(`O arquivo ${filename} parece ser binario (${detectedType}) e nao pode ser lido como texto puro pelo Otto Bridge ainda.`, maxChars);
1736
+ }
1737
+ const content = sanitizeTextForJsonTransport(raw.toString("utf8"));
1738
+ return clipText(content || "(arquivo vazio)", maxChars);
998
1739
  }
999
1740
  async listLocalFiles(directoryPath, limit = 40) {
1000
1741
  const resolved = expandUserPath(directoryPath);
package/dist/types.js CHANGED
@@ -1,5 +1,5 @@
1
1
  export const BRIDGE_CONFIG_VERSION = 1;
2
- export const BRIDGE_VERSION = "0.5.2";
2
+ export const BRIDGE_VERSION = "0.5.3";
3
3
  export const BRIDGE_PACKAGE_NAME = "@leg3ndy/otto-bridge";
4
4
  export const DEFAULT_API_BASE_URL = "http://localhost:8000";
5
5
  export const DEFAULT_POLL_INTERVAL_MS = 3000;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@leg3ndy/otto-bridge",
3
- "version": "0.5.2",
3
+ "version": "0.5.3",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "description": "Local companion for Otto Bridge device pairing and WebSocket runtime.",