@leg3ndy/otto-bridge 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,6 +11,8 @@ Companion local do Otto para:
11
11
 
12
12
  Para um passo a passo de instalacao, pareamento, uso, desconexao e desinstalacao, veja [USER_GUIDE.md](https://github.com/LGCYYL/ottoai/blob/main/otto-bridge/USER_GUIDE.md).
13
13
 
14
+ Para o estado atual da arquitetura, capacidades entregues, limitacoes e roadmap do Otto Bridge, veja [`leg3ndy-ai-backend/docs/OTTO_BRIDGE_ARCHITECTURE.md`](../leg3ndy-ai-backend/docs/OTTO_BRIDGE_ARCHITECTURE.md).
15
+
14
16
  ## Distribuicao
15
17
 
16
18
  Fluxo recomendado agora:
@@ -30,6 +30,61 @@ const KNOWN_SITES = [
30
30
  { label: "WhatsApp Web", url: "https://web.whatsapp.com", patterns: [/\bwhatsapp\b/i] },
31
31
  { label: "X", url: "https://x.com", patterns: [/\bx\.com\b/i, /\btwitter\b/i, /\bxis\b/i] },
32
32
  ];
33
/**
 * Accent-free, lower-case Portuguese filler words that
 * `extractMeaningfulDescriptionTokens` discards when it tokenises the
 * description of a visual target.
 */
const GENERIC_VISUAL_STOP_WORDS = new Set([
    // Articles.
    "o", "a", "os", "as", "um", "uma", "uns", "umas",
    // Prepositions, contractions and connectors.
    "de", "da", "do", "das", "dos",
    "em", "no", "na", "nos", "nas",
    "por", "para", "com", "sem", "que",
    // Generic words about the screen and its widgets.
    "visivel", "visiveis",
    "tela", "pagina", "site", "app", "janela", "aba",
    "botao", "botoes", "link", "item",
    "resultado", "resultados",
    // Ordinals.
    "primeiro", "primeira", "segundo", "segunda", "terceiro", "terceira",
    // Generic media nouns.
    "video", "videos", "musica", "faixa",
    // Action verbs.
    "clicar", "clique", "seleciona", "selecionar", "abre", "abrir",
]);
33
88
  function asRecord(value) {
34
89
  return value && typeof value === "object" ? value : {};
35
90
  }
@@ -49,6 +104,80 @@ function normalizeText(value) {
49
104
  function escapeAppleScript(value) {
50
105
  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
51
106
  }
107
/**
 * Produces a canonical form of a URL for equality checks: the input is
 * trimmed, parsed with the WHATWG `URL` class and serialised without its
 * fragment. Text that does not parse as an absolute URL is returned trimmed
 * but otherwise untouched; empty or falsy input yields "".
 *
 * @param {unknown} raw
 * @returns {string}
 */
function normalizeComparableUrl(raw) {
    const candidate = String(raw || "").trim();
    if (candidate === "") {
        return "";
    }
    let url;
    try {
        url = new URL(candidate);
    }
    catch {
        return candidate;
    }
    url.hash = "";
    return url.toString();
}
121
/**
 * Extracts the phrases (2 to 80 characters) that appear between quotes in
 * `value` and returns them passed through `normalizeText` and trimmed.
 *
 * Double quotes (straight or curly) and single quotes (straight or curly)
 * are treated as two separate families, so a phrase opened by a double quote
 * may contain apostrophes ("Rock 'n' Roll"). A single quote only counts as a
 * delimiter when it is not glued to a letter or digit, which keeps
 * apostrophes inside words (don't, d'agua) from being read as quotes.
 *
 * @param {unknown} value - Free-form description text.
 * @returns {string[]} Normalised, non-empty phrases in order of appearance.
 */
function extractQuotedPhrases(value) {
    const quotedPhrasePattern = /["“”]([^"“”]{2,80})["“”]|(?<![\p{L}\p{N}])['‘’]([^"“”'‘’]{2,80})['‘’](?![\p{L}\p{N}])/gu;
    return Array.from(String(value || "").matchAll(quotedPhrasePattern))
        // Exactly one of the two capture groups is set, depending on the family.
        .map((match) => normalizeText(match[1] ?? match[2] ?? "").trim())
        .filter(Boolean);
}
126
/**
 * Splits the normalised description on every run of characters outside
 * `a-z0-9` and keeps, in first-seen order and without duplicates, the tokens
 * that have at least three characters and are not listed in
 * `GENERIC_VISUAL_STOP_WORDS`.
 *
 * @param {unknown} value - Free-form description text.
 * @returns {string[]}
 */
function extractMeaningfulDescriptionTokens(value) {
    const uniqueTokens = new Set();
    for (const piece of normalizeText(value || "").split(/[^a-z0-9]+/)) {
        const token = piece.trim();
        if (token.length < 3 || GENERIC_VISUAL_STOP_WORDS.has(token)) {
            continue;
        }
        uniqueTokens.add(token);
    }
    return [...uniqueTokens];
}
132
/**
 * Tells whether the description carries any text that could be searched for:
 * at least one quoted phrase or at least one meaningful token.
 *
 * @param {unknown} description
 * @returns {boolean}
 */
function descriptionLikelyHasTextAnchor(description) {
    if (extractQuotedPhrases(description).length > 0) {
        return true;
    }
    return extractMeaningfulDescriptionTokens(description).length > 0;
}
135
/**
 * Picks the OCR candidate whose text best matches the description.
 *
 * Scoring per candidate:
 *   - +120 for every quoted phrase contained in the candidate text;
 *   - +18 for every meaningful token contained in the candidate text;
 *   - when the description asks for the "first" item, a bonus for being near
 *     the top of the image and early in the candidate list;
 *   - a bonus proportional to the OCR confidence.
 *
 * The positional and confidence bonuses are applied only to candidates that
 * already matched a phrase or token. Without that gate every candidate with
 * a non-zero confidence gets a positive score, and the function would return
 * text unrelated to the description.
 *
 * Ties are broken by smaller `y`, then smaller `x`.
 *
 * @param {Array<{text?: string, x?: number, y?: number, confidence?: number}>} candidates
 * @param {unknown} description - Free-form description of the target.
 * @returns {{candidate: object, score: number} | null} Best match, or `null`
 *   when the description has no searchable text or nothing matches it.
 */
function findOcrTextMatch(candidates, description) {
    const phrases = extractQuotedPhrases(description);
    const tokens = extractMeaningfulDescriptionTokens(description);
    if (!phrases.length && !tokens.length) {
        return null;
    }
    const wantsFirst = /\b(primeir[ao]?|first)\b/.test(normalizeText(description || ""));
    const scored = [];
    (Array.isArray(candidates) ? candidates : []).forEach((candidate, index) => {
        const normalizedText = normalizeText(candidate?.text || "");
        let score = 0;
        for (const phrase of phrases) {
            if (normalizedText.includes(phrase)) {
                score += 120;
            }
        }
        for (const token of tokens) {
            if (normalizedText.includes(token)) {
                score += 18;
            }
        }
        if (score === 0) {
            // No textual evidence: bonuses alone must never select a target.
            return;
        }
        if (wantsFirst) {
            // A missing/non-numeric `y` simply earns no vertical bonus.
            if (Number.isFinite(candidate.y)) {
                score += Math.max(0, 24 - Math.round(candidate.y / 60));
            }
            score += Math.max(0, 12 - index);
        }
        if (candidate.confidence) {
            score += Math.round(candidate.confidence * 20);
        }
        scored.push({ candidate, score });
    });
    scored.sort((left, right) => {
        if (right.score !== left.score) {
            return right.score - left.score;
        }
        if (left.candidate.y !== right.candidate.y) {
            return left.candidate.y - right.candidate.y;
        }
        return left.candidate.x - right.candidate.x;
    });
    return scored[0] || null;
}
52
181
  function extractTaskText(job) {
53
182
  const payload = asRecord(job.payload);
54
183
  const candidates = [
@@ -88,6 +217,39 @@ function humanizeUrl(url) {
88
217
  return normalized;
89
218
  }
90
219
  }
220
/**
 * Trims every entry, drops the empty ones and removes duplicates, where two
 * entries are duplicates when `normalizeText` maps them to the same key. The
 * first spelling seen is the one kept, and input order is preserved.
 *
 * @param {Iterable<unknown>} values
 * @returns {string[]}
 */
function uniqueStrings(values) {
    const firstSpellingByKey = new Map();
    for (const entry of values) {
        const text = String(entry || "").trim();
        if (text === "") {
            continue;
        }
        const key = normalizeText(text);
        if (!firstSpellingByKey.has(key)) {
            firstSpellingByKey.set(key, text);
        }
    }
    return Array.from(firstSpellingByKey.values());
}
235
/**
 * Decides whether a free-text answer from the vision check says that the
 * click worked.
 *
 * Rules, in order:
 *   1. An answer starting with the word "sim" is affirmative.
 *   2. An answer starting with the word "nao"/"não" is negative.
 *   3. Otherwise the answer is affirmative when some sentence contains one of
 *      the known success markers and no negation word ("nao", "nenhum",
 *      "nenhuma", "sem") comes before the marker in that sentence.
 *
 * "sim" and "nao" are matched as whole words so that answers such as
 * "Similar ao estado anterior" are not read as "yes". The negation guard
 * keeps "a musica nao esta tocando" or "o botao nao foi acionado" from
 * counting as success.
 *
 * @param {unknown} answer
 * @returns {boolean}
 */
function looksLikeAffirmativeVisualVerification(answer) {
    const normalized = normalizeText(answer || "").trim();
    if (!normalized) {
        return false;
    }
    if (/^sim(?![a-z0-9])/.test(normalized)) {
        return true;
    }
    if (/^n[aã]o(?![a-z0-9])/.test(normalized)) {
        return false;
    }
    // Accented spellings are kept in case normalizeText preserves diacritics.
    const successMarkers = [
        "tocando",
        "reproduzindo",
        "em reproducao",
        "em reprodução",
        "botao de pausa",
        "botão de pausa",
        "faixa ativa",
        "resultado selecionado",
        "foi acionado",
    ];
    const negationBeforeMarker = /(?:^|[^a-z0-9])(?:n[aã]o|nenhuma?|sem)(?![a-z0-9])/;
    return normalized.split(/[.!?;\n]+/).some((sentence) => successMarkers.some((marker) => {
        const position = sentence.indexOf(marker);
        return position >= 0 && !negationBeforeMarker.test(sentence.slice(0, position));
    }));
}
91
253
  function mimeTypeFromPath(filePath) {
92
254
  const ext = path.extname(filePath).toLowerCase();
93
255
  if (ext === ".png")
@@ -126,6 +288,38 @@ function clipText(value, maxLength) {
126
288
  }
127
289
  return `${value.slice(0, maxLength)}...`;
128
290
  }
291
/**
 * Lower-case file extensions (leading dot included) grouped under the
 * "textutil readable" label.
 * NOTE(review): presumably these are the formats handed to macOS `textutil`
 * for text extraction; the consumer is not visible in this excerpt.
 */
const TEXTUTIL_READABLE_EXTENSIONS = new Set([
    ".doc", ".docx", ".odt", ".pages",
    ".rtf", ".rtfd", ".webarchive",
]);
300
/**
 * Cleans text before it is shipped inside a JSON payload:
 *   - CRLF and lone CR line endings both become LF (lone CR used to survive,
 *     leaving mixed line endings in the output);
 *   - C0 control characters other than TAB and LF, plus DEL, are removed;
 *   - surrounding whitespace is trimmed.
 *
 * `null`/`undefined` are treated as an empty string and other non-string
 * values are coerced with `String()` instead of throwing.
 *
 * @param {unknown} value
 * @returns {string}
 */
function sanitizeTextForJsonTransport(value) {
    return String(value ?? "")
        .replace(/\r\n?/g, "\n")
        .replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F]/g, "")
        .trim();
}
306
/**
 * Heuristic binary detector that inspects at most the first 4096 bytes.
 *
 * An empty buffer is never binary. Any NUL byte makes the buffer binary
 * immediately. Otherwise the buffer is binary when more than 10% of the
 * inspected bytes are control bytes outside the 7..13 range (BEL through CR)
 * or are DEL (127).
 *
 * @param {{length: number, [index: number]: number}} buffer - Byte sequence.
 * @returns {boolean}
 */
function isLikelyBinaryBuffer(buffer) {
    if (buffer.length === 0) {
        return false;
    }
    const inspected = Math.min(buffer.length, 4096);
    const isControlByte = (byte) => byte < 7 || (byte > 13 && byte < 32) || byte === 127;
    let flagged = 0;
    let offset = 0;
    while (offset < inspected) {
        const byte = buffer[offset];
        if (byte === 0) {
            return true;
        }
        if (isControlByte(byte)) {
            flagged += 1;
        }
        offset += 1;
    }
    return flagged / inspected > 0.1;
}
129
323
  function delay(ms) {
130
324
  return new Promise((resolve) => setTimeout(resolve, ms));
131
325
  }
@@ -327,6 +521,10 @@ function parseStructuredActions(job) {
327
521
  type: "click_visual_target",
328
522
  description,
329
523
  app: asString(action.app) || undefined,
524
+ verification_prompt: asString(action.verification_prompt) || undefined,
525
+ retry_descriptions: Array.isArray(action.retry_descriptions)
526
+ ? action.retry_descriptions.map((item) => asString(item)).filter(Boolean)
527
+ : undefined,
330
528
  });
331
529
  }
332
530
  continue;
@@ -414,6 +612,7 @@ export class NativeMacOSJobExecutor {
414
612
  bridgeConfig;
415
613
  cancelledJobs = new Set();
416
614
  activeChild = null;
615
+ lastActiveApp = null;
417
616
  constructor(bridgeConfig) {
418
617
  this.bridgeConfig = bridgeConfig;
419
618
  }
@@ -486,6 +685,7 @@ export class NativeMacOSJobExecutor {
486
685
  mimeTypeOverride: uploadable.mimeType,
487
686
  fileNameOverride: uploadable.filename,
488
687
  metadata: {
688
+ visible_in_chat: true,
489
689
  width: uploadable.dimensions?.width || undefined,
490
690
  height: uploadable.dimensions?.height || undefined,
491
691
  original_width: uploadable.originalDimensions?.width || undefined,
@@ -516,6 +716,7 @@ export class NativeMacOSJobExecutor {
516
716
  fileNameOverride: uploadable.filename,
517
717
  metadata: {
518
718
  purpose: "page_read_fallback",
719
+ visible_in_chat: false,
519
720
  width: uploadable.dimensions?.width || undefined,
520
721
  height: uploadable.dimensions?.height || undefined,
521
722
  original_width: uploadable.originalDimensions?.width || undefined,
@@ -558,50 +759,162 @@ export class NativeMacOSJobExecutor {
558
759
  continue;
559
760
  }
560
761
  if (action.type === "click_visual_target") {
561
- if (action.app) {
762
+ const browserApp = await this.resolveLikelyBrowserApp(action.app);
763
+ if (browserApp) {
764
+ await reporter.progress(progressPercent, `Trazendo ${browserApp} para frente antes do clique`);
765
+ await this.focusApp(browserApp);
766
+ }
767
+ else if (action.app) {
562
768
  await reporter.progress(progressPercent, `Trazendo ${action.app} para frente antes do clique`);
563
769
  await this.focusApp(action.app);
564
770
  }
565
- await reporter.progress(progressPercent, `Capturando a tela para localizar ${action.description}`);
566
- const screenshotPath = await this.takeScreenshot();
567
- const uploadable = await this.buildUploadableImage(screenshotPath);
568
- const artifact = await this.uploadArtifactForJob(job.job_id, uploadable.path, {
569
- kind: "screenshot",
570
- mimeTypeOverride: uploadable.mimeType,
571
- fileNameOverride: uploadable.filename,
572
- metadata: {
573
- purpose: "visual_click",
574
- target: action.description,
575
- width: uploadable.dimensions?.width || undefined,
576
- height: uploadable.dimensions?.height || undefined,
577
- original_width: uploadable.originalDimensions?.width || undefined,
578
- original_height: uploadable.originalDimensions?.height || undefined,
579
- resized_for_upload: uploadable.resized,
580
- },
581
- });
582
- if (!artifact?.storage_path) {
583
- throw new Error("Otto Bridge nao conseguiu enviar a screenshot necessaria para localizar o alvo visual.");
771
+ const targetDescriptions = uniqueStrings([action.description, ...(action.retry_descriptions || [])]);
772
+ let clickSucceeded = false;
773
+ let lastFailureReason = "";
774
+ for (let attempt = 0; attempt < targetDescriptions.length; attempt += 1) {
775
+ const targetDescription = targetDescriptions[attempt];
776
+ const initialBrowserState = browserApp
777
+ ? await this.captureBrowserPageState(browserApp).catch(() => null)
778
+ : null;
779
+ if (browserApp === "Safari") {
780
+ await reporter.progress(progressPercent, `Tentando localizar ${targetDescription} diretamente no Safari`);
781
+ const domClick = await this.trySafariDomClick(targetDescription);
782
+ if (domClick?.clicked) {
783
+ let validated = false;
784
+ let validationReason = "";
785
+ if (action.verification_prompt) {
786
+ const verification = await this.validateVisualClickWithVision(job.job_id, targetDescription, action.verification_prompt, progressPercent, reporter, artifacts, "dom_click_result");
787
+ validated = verification.ok;
788
+ validationReason = verification.reason;
789
+ }
790
+ else {
791
+ const browserValidation = await this.confirmBrowserClick(browserApp, initialBrowserState, targetDescription, domClick.matchedHref || null);
792
+ validated = browserValidation.ok;
793
+ validationReason = browserValidation.reason;
794
+ }
795
+ if (validated) {
796
+ resultPayload.last_click = {
797
+ strategy: domClick.strategy || "safari_dom",
798
+ matched_text: domClick.matchedText || null,
799
+ matched_href: domClick.matchedHref || null,
800
+ score: domClick.score || null,
801
+ total_candidates: domClick.totalCandidates || null,
802
+ };
803
+ completionNotes.push(`Localizei e cliquei em ${targetDescription} diretamente no navegador.`);
804
+ clickSucceeded = true;
805
+ break;
806
+ }
807
+ lastFailureReason = validationReason || `Clique DOM em ${targetDescription} nao alterou a pagina como esperado.`;
808
+ }
809
+ else if (domClick?.reason) {
810
+ lastFailureReason = domClick.reason;
811
+ }
812
+ }
813
+ const visualBeforeState = browserApp
814
+ ? await this.captureBrowserPageState(browserApp).catch(() => initialBrowserState)
815
+ : initialBrowserState;
816
+ await reporter.progress(progressPercent, `Capturando a tela para localizar ${targetDescription}`);
817
+ let screenshotPath = await this.takeScreenshot();
818
+ const ocrClick = await this.tryLocalOcrClick(screenshotPath, targetDescription);
819
+ if (ocrClick.clicked) {
820
+ let validated = false;
821
+ let validationReason = "";
822
+ if (action.verification_prompt) {
823
+ const verification = await this.validateVisualClickWithVision(job.job_id, targetDescription, action.verification_prompt, progressPercent, reporter, artifacts, "local_ocr_click_result");
824
+ validated = verification.ok;
825
+ validationReason = verification.reason;
826
+ }
827
+ else if (browserApp) {
828
+ const browserValidation = await this.confirmBrowserClick(browserApp, visualBeforeState, targetDescription, null);
829
+ validated = browserValidation.ok;
830
+ validationReason = browserValidation.reason;
831
+ }
832
+ else {
833
+ validated = true;
834
+ }
835
+ if (validated) {
836
+ const candidate = ocrClick.candidate || null;
837
+ resultPayload.last_click = {
838
+ strategy: ocrClick.strategy || "local_ocr",
839
+ score: ocrClick.score || null,
840
+ matched_text: candidate?.text || null,
841
+ x: candidate ? candidate.x + (candidate.width / 2) : null,
842
+ y: candidate ? candidate.y + (candidate.height / 2) : null,
843
+ width: candidate?.width || null,
844
+ height: candidate?.height || null,
845
+ };
846
+ completionNotes.push(`Localizei e cliquei em ${targetDescription} por OCR local.`);
847
+ clickSucceeded = true;
848
+ break;
849
+ }
850
+ lastFailureReason = validationReason || `O clique por OCR local em ${targetDescription} nao teve efeito confirmavel.`;
851
+ await reporter.progress(progressPercent, "OCR local nao confirmou o clique; vou tentar visão remota");
852
+ screenshotPath = await this.takeScreenshot();
853
+ }
854
+ else if (ocrClick.reason) {
855
+ lastFailureReason = ocrClick.reason;
856
+ }
857
+ const uploadable = await this.buildUploadableImage(screenshotPath);
858
+ const artifact = await this.uploadArtifactForJob(job.job_id, uploadable.path, {
859
+ kind: "screenshot",
860
+ mimeTypeOverride: uploadable.mimeType,
861
+ fileNameOverride: uploadable.filename,
862
+ metadata: {
863
+ purpose: "visual_click",
864
+ visible_in_chat: false,
865
+ target: targetDescription,
866
+ width: uploadable.dimensions?.width || undefined,
867
+ height: uploadable.dimensions?.height || undefined,
868
+ original_width: uploadable.originalDimensions?.width || undefined,
869
+ original_height: uploadable.originalDimensions?.height || undefined,
870
+ resized_for_upload: uploadable.resized,
871
+ },
872
+ });
873
+ if (!artifact?.storage_path) {
874
+ throw new Error("Otto Bridge nao conseguiu enviar a screenshot necessaria para localizar o alvo visual.");
875
+ }
876
+ artifacts.push(artifact);
877
+ const artifactMetadata = artifact.metadata || {};
878
+ const width = Number(artifactMetadata.width || 0);
879
+ const height = Number(artifactMetadata.height || 0);
880
+ const originalWidth = Number(artifactMetadata.original_width || width || 0);
881
+ const originalHeight = Number(artifactMetadata.original_height || height || 0);
882
+ const location = await this.locateVisualTarget(job.job_id, artifact.storage_path, targetDescription, width, height, artifact.mime_type);
883
+ if (!location?.found || typeof location.x !== "number" || typeof location.y !== "number") {
884
+ lastFailureReason = `Nao consegui localizar ${targetDescription} com confianca suficiente na tela.`;
885
+ continue;
886
+ }
887
+ await reporter.progress(progressPercent, `Clicando em ${targetDescription}`);
888
+ const scaledX = width > 0 && originalWidth > 0 ? (location.x / width) * originalWidth : location.x;
889
+ const scaledY = height > 0 && originalHeight > 0 ? (location.y / height) * originalHeight : location.y;
890
+ await this.clickPoint(scaledX, scaledY);
891
+ resultPayload.last_click = {
892
+ ...location,
893
+ x: scaledX,
894
+ y: scaledY,
895
+ strategy: "visual_locator",
896
+ };
897
+ if (action.verification_prompt) {
898
+ const verification = await this.validateVisualClickWithVision(job.job_id, targetDescription, action.verification_prompt, progressPercent, reporter, artifacts, "visual_click_result");
899
+ if (!verification.ok) {
900
+ lastFailureReason = verification.reason || `Nao consegui validar visualmente se ${targetDescription} foi acionado.`;
901
+ continue;
902
+ }
903
+ }
904
+ else if (browserApp) {
905
+ const browserValidation = await this.confirmBrowserClick(browserApp, visualBeforeState, targetDescription, null);
906
+ if (!browserValidation.ok) {
907
+ lastFailureReason = browserValidation.reason || `O clique em ${targetDescription} nao alterou a pagina como esperado.`;
908
+ continue;
909
+ }
910
+ }
911
+ completionNotes.push(`Localizei e cliquei em ${targetDescription}.`);
912
+ clickSucceeded = true;
913
+ break;
584
914
  }
585
- artifacts.push(artifact);
586
- const artifactMetadata = artifact.metadata || {};
587
- const width = Number(artifactMetadata.width || 0);
588
- const height = Number(artifactMetadata.height || 0);
589
- const originalWidth = Number(artifactMetadata.original_width || width || 0);
590
- const originalHeight = Number(artifactMetadata.original_height || height || 0);
591
- const location = await this.locateVisualTarget(job.job_id, artifact.storage_path, action.description, width, height, artifact.mime_type);
592
- if (!location?.found || typeof location.x !== "number" || typeof location.y !== "number") {
593
- throw new Error(`Nao consegui localizar ${action.description} com confianca suficiente na tela.`);
915
+ if (!clickSucceeded) {
916
+ throw new Error(lastFailureReason || `Nao consegui concluir o clique visual para ${action.description}.`);
594
917
  }
595
- await reporter.progress(progressPercent, `Clicando em ${action.description}`);
596
- const scaledX = width > 0 && originalWidth > 0 ? (location.x / width) * originalWidth : location.x;
597
- const scaledY = height > 0 && originalHeight > 0 ? (location.y / height) * originalHeight : location.y;
598
- await this.clickPoint(scaledX, scaledY);
599
- completionNotes.push(`Localizei e cliquei em ${action.description}.`);
600
- resultPayload.last_click = {
601
- ...location,
602
- x: scaledX,
603
- y: scaledY,
604
- };
605
918
  continue;
606
919
  }
607
920
  await reporter.progress(progressPercent, `Abrindo ${action.url}${action.app ? ` em ${action.app}` : ""}`);
@@ -641,6 +954,7 @@ export class NativeMacOSJobExecutor {
641
954
  if (app) {
642
955
  await this.runCommand("open", ["-a", app, url]);
643
956
  await this.focusApp(app);
957
+ this.lastActiveApp = app;
644
958
  return;
645
959
  }
646
960
  await this.runCommand("open", [url]);
@@ -668,6 +982,118 @@ end tell
668
982
  }
669
983
  async focusApp(app) {
670
984
  await this.runCommand("osascript", ["-e", `tell application "${escapeAppleScript(app)}" to activate`]);
985
+ this.lastActiveApp = app;
986
+ }
987
+ async getFrontmostAppName() {
988
+ try {
989
+ const { stdout } = await this.runCommandCapture("osascript", [
990
+ "-e",
991
+ 'tell application "System Events" to get name of first application process whose frontmost is true',
992
+ ]);
993
+ const app = String(stdout || "").trim();
994
+ return app || null;
995
+ }
996
+ catch {
997
+ return null;
998
+ }
999
+ }
1000
+ async resolveLikelyBrowserApp(preferredApp) {
1001
+ const candidates = [
1002
+ preferredApp || null,
1003
+ this.lastActiveApp,
1004
+ await this.getFrontmostAppName(),
1005
+ ];
1006
+ for (const candidate of candidates) {
1007
+ if (candidate === "Safari") {
1008
+ return candidate;
1009
+ }
1010
+ }
1011
+ return null;
1012
+ }
1013
+ async captureBrowserPageState(app) {
1014
+ if (app !== "Safari") {
1015
+ return null;
1016
+ }
1017
+ const page = await this.readFrontmostPage(app);
1018
+ return {
1019
+ app,
1020
+ title: page.title,
1021
+ url: page.url,
1022
+ text: page.text,
1023
+ };
1024
+ }
1025
+ resolveExpectedBrowserHref(rawHref, baseUrl) {
1026
+ const href = String(rawHref || "").trim();
1027
+ if (!href) {
1028
+ return null;
1029
+ }
1030
+ try {
1031
+ const absolute = baseUrl ? new URL(href, baseUrl).toString() : new URL(href).toString();
1032
+ return normalizeComparableUrl(absolute);
1033
+ }
1034
+ catch {
1035
+ return normalizeComparableUrl(href);
1036
+ }
1037
+ }
1038
+ didBrowserPageStateChange(before, after, targetDescription, matchedHref) {
1039
+ if (!after) {
1040
+ return false;
1041
+ }
1042
+ const beforeUrl = normalizeComparableUrl(before?.url || "");
1043
+ const afterUrl = normalizeComparableUrl(after.url || "");
1044
+ const expectedHref = this.resolveExpectedBrowserHref(matchedHref || null, before?.url || after.url);
1045
+ if (expectedHref && afterUrl) {
1046
+ if (afterUrl === expectedHref || afterUrl.startsWith(expectedHref) || expectedHref.startsWith(afterUrl)) {
1047
+ return true;
1048
+ }
1049
+ }
1050
+ if (beforeUrl && afterUrl && beforeUrl !== afterUrl) {
1051
+ return true;
1052
+ }
1053
+ const normalizedDescription = normalizeText(targetDescription || "");
1054
+ if (normalizedDescription.includes("youtube")
1055
+ || normalizedDescription.includes("video")
1056
+ || normalizedDescription.includes("musica")) {
1057
+ if (afterUrl.includes("youtube.com/watch") || afterUrl.includes("youtube.com/shorts/")) {
1058
+ return true;
1059
+ }
1060
+ }
1061
+ const beforeTitle = normalizeText(before?.title || "");
1062
+ const afterTitle = normalizeText(after.title || "");
1063
+ if (beforeTitle && afterTitle && beforeTitle !== afterTitle) {
1064
+ return true;
1065
+ }
1066
+ const beforeText = normalizeText((before?.text || "").slice(0, 320));
1067
+ const afterText = normalizeText((after.text || "").slice(0, 320));
1068
+ if (beforeText && afterText && beforeText !== afterText) {
1069
+ return true;
1070
+ }
1071
+ return false;
1072
+ }
1073
+ async confirmBrowserClick(app, before, targetDescription, matchedHref) {
1074
+ if (app !== "Safari") {
1075
+ return {
1076
+ ok: true,
1077
+ reason: "",
1078
+ afterState: null,
1079
+ };
1080
+ }
1081
+ for (let attempt = 0; attempt < 4; attempt += 1) {
1082
+ await delay(attempt === 0 ? 900 : 700);
1083
+ const afterState = await this.captureBrowserPageState(app).catch(() => null);
1084
+ if (this.didBrowserPageStateChange(before, afterState, targetDescription, matchedHref)) {
1085
+ return {
1086
+ ok: true,
1087
+ reason: "",
1088
+ afterState,
1089
+ };
1090
+ }
1091
+ }
1092
+ return {
1093
+ ok: false,
1094
+ reason: `O clique em ${targetDescription} nao mudou a pagina do navegador de forma verificavel.`,
1095
+ afterState: null,
1096
+ };
671
1097
  }
672
1098
  async pressShortcut(shortcut) {
673
1099
  const { key, modifiers } = parseShortcut(shortcut);
@@ -752,6 +1178,263 @@ end tell
752
1178
  });
753
1179
  return String(response.answer || "").trim();
754
1180
  }
1181
+ async validateVisualClickWithVision(jobId, targetDescription, verificationPrompt, progressPercent, reporter, artifacts, purpose) {
1182
+ await delay(1600);
1183
+ await reporter.progress(progressPercent, "Validando visualmente se a ação funcionou");
1184
+ const afterClickPath = await this.takeScreenshot();
1185
+ const afterClickUpload = await this.buildUploadableImage(afterClickPath);
1186
+ const afterClickArtifact = await this.uploadArtifactForJob(jobId, afterClickUpload.path, {
1187
+ kind: "screenshot",
1188
+ mimeTypeOverride: afterClickUpload.mimeType,
1189
+ fileNameOverride: afterClickUpload.filename,
1190
+ metadata: {
1191
+ purpose,
1192
+ visible_in_chat: true,
1193
+ target: targetDescription,
1194
+ width: afterClickUpload.dimensions?.width || undefined,
1195
+ height: afterClickUpload.dimensions?.height || undefined,
1196
+ original_width: afterClickUpload.originalDimensions?.width || undefined,
1197
+ original_height: afterClickUpload.originalDimensions?.height || undefined,
1198
+ resized_for_upload: afterClickUpload.resized,
1199
+ },
1200
+ });
1201
+ if (!afterClickArtifact?.storage_path) {
1202
+ return {
1203
+ ok: false,
1204
+ reason: `Nao consegui registrar a tela apos tentar clicar em ${targetDescription}.`,
1205
+ };
1206
+ }
1207
+ artifacts.push(afterClickArtifact);
1208
+ const verificationAnswer = await this.analyzeUploadedArtifact(jobId, afterClickArtifact.storage_path, verificationPrompt, afterClickArtifact.mime_type);
1209
+ if (!looksLikeAffirmativeVisualVerification(verificationAnswer)) {
1210
+ return {
1211
+ ok: false,
1212
+ reason: verificationAnswer || `Nao consegui validar visualmente se ${targetDescription} foi acionado.`,
1213
+ };
1214
+ }
1215
+ return {
1216
+ ok: true,
1217
+ reason: verificationAnswer,
1218
+ };
1219
+ }
1220
+ async runSafariJsonScript(scriptBody, input) {
1221
+ const wrappedScript = `
1222
+ (function(){
1223
+ const __input = ${JSON.stringify(input || null)};
1224
+ try {
1225
+ const __result = (() => {
1226
+ ${scriptBody}
1227
+ })();
1228
+ return JSON.stringify({ ok: true, result: __result === undefined ? null : __result });
1229
+ } catch (error) {
1230
+ return JSON.stringify({
1231
+ ok: false,
1232
+ error: String(error && error.message ? error.message : error)
1233
+ });
1234
+ }
1235
+ })()
1236
+ `;
1237
+ const script = `
1238
+ tell application "Safari"
1239
+ activate
1240
+ if (count of windows) = 0 then error "Safari nao possui janelas abertas."
1241
+ delay 0.2
1242
+ set scriptResult to do JavaScript "${escapeAppleScript(wrappedScript)}" in current tab of front window
1243
+ end tell
1244
+ return scriptResult
1245
+ `;
1246
+ const { stdout } = await this.runCommandCapture("osascript", ["-e", script]);
1247
+ const parsed = JSON.parse(stdout.trim() || "{}");
1248
+ if (parsed.ok !== true) {
1249
+ throw new Error(asString(parsed.error) || "Safari JavaScript execution failed");
1250
+ }
1251
+ return parsed.result;
1252
+ }
1253
+ async trySafariDomClick(description) {
1254
+ try {
1255
+ return await this.runSafariJsonScript(`
1256
+ const rawDescription = String(__input?.description || "");
1257
+ const normalize = (value) => String(value || "")
1258
+ .normalize("NFD")
1259
+ .replace(/[\\u0300-\\u036f]/g, "")
1260
+ .toLowerCase();
1261
+ const normalizedDescription = normalize(rawDescription);
1262
+ const wantsFirst = /\\b(primeir[ao]?|first)\\b/.test(normalizedDescription);
1263
+ const wantsVideo = /\\b(video|videos|musica|faixa|youtube|resultado|watch)\\b/.test(normalizedDescription) || location.hostname.includes("youtube");
1264
+ const stopWords = new Set([
1265
+ "o", "a", "os", "as", "um", "uma", "uns", "umas", "de", "da", "do", "das", "dos",
1266
+ "em", "no", "na", "nos", "nas", "para", "por", "com", "que", "visivel", "visiveis",
1267
+ "visivel", "tela", "pagina", "page", "site", "link", "botao", "botao", "clicar",
1268
+ "clique", "seleciona", "selecionar", "resultado", "resultados"
1269
+ ]);
1270
+ const quotedPhrases = Array.from(rawDescription.matchAll(/["'“”‘’]([^"'“”‘’]{2,80})["'“”‘’]/g))
1271
+ .map((match) => normalize(match[1]));
1272
+ const tokens = Array.from(new Set(
1273
+ normalizedDescription
1274
+ .split(/[^a-z0-9]+/)
1275
+ .filter((token) => token.length >= 3 && !stopWords.has(token))
1276
+ ));
1277
+
1278
+ const candidateSelectors = location.hostname.includes("youtube")
1279
+ ? [
1280
+ "ytd-video-renderer a#video-title",
1281
+ "ytd-video-renderer ytd-thumbnail a",
1282
+ "ytd-video-renderer a#thumbnail",
1283
+ "ytd-rich-item-renderer a#video-title-link",
1284
+ "ytd-rich-item-renderer a#video-title",
1285
+ "ytd-rich-grid-media a#video-title-link",
1286
+ "a#video-title",
1287
+ "a[href*='/watch']",
1288
+ "button",
1289
+ "[role='button']",
1290
+ "[role='link']"
1291
+ ]
1292
+ : [
1293
+ "a[href]",
1294
+ "button",
1295
+ "[role='button']",
1296
+ "[role='link']",
1297
+ "input[type='button']",
1298
+ "input[type='submit']"
1299
+ ];
1300
+
1301
+ const seen = new Set();
1302
+ const candidates = [];
1303
+
1304
+ function isVisible(element) {
1305
+ if (!(element instanceof Element)) return false;
1306
+ const rect = element.getBoundingClientRect();
1307
+ if (rect.width < 4 || rect.height < 4) return false;
1308
+ const style = window.getComputedStyle(element);
1309
+ if (style.visibility === "hidden" || style.display === "none" || Number(style.opacity || "1") === 0) return false;
1310
+ return rect.bottom >= 0 && rect.right >= 0 && rect.top <= window.innerHeight && rect.left <= window.innerWidth;
1311
+ }
1312
+
1313
+ function deriveText(element) {
1314
+ const ownText = [
1315
+ element.innerText,
1316
+ element.textContent,
1317
+ element.getAttribute("aria-label"),
1318
+ element.getAttribute("title"),
1319
+ element.getAttribute("alt"),
1320
+ ].find((value) => typeof value === "string" && value.trim());
1321
+ if (ownText && ownText.trim()) return ownText.trim();
1322
+
1323
+ const richVideo = element.closest("ytd-video-renderer, ytd-rich-item-renderer, ytd-rich-grid-media, ytmusic-responsive-list-item-renderer");
1324
+ if (richVideo && richVideo instanceof HTMLElement && richVideo.innerText.trim()) {
1325
+ return richVideo.innerText.trim();
1326
+ }
1327
+
1328
+ return "";
1329
+ }
1330
+
1331
+ function scoreCandidate(element, rank) {
1332
+ const text = deriveText(element);
1333
+ const href = element instanceof HTMLAnchorElement
1334
+ ? (element.href || "")
1335
+ : (element.getAttribute("href") || "");
1336
+ const normalizedText = normalize(text);
1337
+ const normalizedHref = normalize(href);
1338
+ let score = 0;
1339
+
1340
+ if (wantsFirst) score += Math.max(0, 40 - rank);
1341
+ if (wantsVideo && normalizedHref.includes("/watch")) score += 30;
1342
+ if (location.hostname.includes("youtube") && element.closest("ytd-video-renderer, ytd-rich-item-renderer, ytd-rich-grid-media")) score += 20;
1343
+ if (element.id === "video-title") score += 12;
1344
+ if (!normalizedText && normalizedHref.includes("/watch")) score += 8;
1345
+
1346
+ for (const phrase of quotedPhrases) {
1347
+ if (!phrase) continue;
1348
+ if (normalizedText.includes(phrase)) score += 120;
1349
+ if (normalizedHref.includes(phrase)) score += 40;
1350
+ }
1351
+
1352
+ for (const token of tokens) {
1353
+ if (normalizedText.includes(token)) score += 18;
1354
+ if (normalizedHref.includes(token)) score += 8;
1355
+ }
1356
+
1357
+ return {
1358
+ element,
1359
+ text,
1360
+ href,
1361
+ score,
1362
+ rank,
1363
+ };
1364
+ }
1365
+
1366
+ for (const selector of candidateSelectors) {
1367
+ const nodes = document.querySelectorAll(selector);
1368
+ for (const node of nodes) {
1369
+ if (!(node instanceof HTMLElement || node instanceof HTMLAnchorElement)) continue;
1370
+ if (!isVisible(node)) continue;
1371
+ const key = [
1372
+ node.tagName,
1373
+ node.id || "",
1374
+ node.getAttribute("href") || "",
1375
+ deriveText(node).slice(0, 120),
1376
+ ].join("|");
1377
+ if (seen.has(key)) continue;
1378
+ seen.add(key);
1379
+ candidates.push(scoreCandidate(node, candidates.length));
1380
+ }
1381
+ }
1382
+
1383
+ const ranked = candidates
1384
+ .filter((candidate) => candidate.score > 0 || (wantsFirst && normalize(candidate.href).includes("/watch")))
1385
+ .sort((left, right) => right.score - left.score || left.rank - right.rank);
1386
+
1387
+ if (!ranked.length) {
1388
+ return {
1389
+ clicked: false,
1390
+ reason: "Nenhum elemento clicavel no DOM combinou com a descricao atual.",
1391
+ totalCandidates: candidates.length,
1392
+ strategy: "safari_dom",
1393
+ };
1394
+ }
1395
+
1396
+ const winner = ranked[0];
1397
+ winner.element.scrollIntoView({ block: "center", inline: "center", behavior: "auto" });
1398
+ const rect = winner.element.getBoundingClientRect();
1399
+ for (const eventName of ["mouseover", "mousedown", "mouseup", "click"]) {
1400
+ winner.element.dispatchEvent(new MouseEvent(eventName, {
1401
+ bubbles: true,
1402
+ cancelable: true,
1403
+ view: window,
1404
+ clientX: rect.left + (rect.width / 2),
1405
+ clientY: rect.top + (rect.height / 2),
1406
+ }));
1407
+ }
1408
+ if (typeof winner.element.click === "function") {
1409
+ winner.element.click();
1410
+ }
1411
+
1412
+ return {
1413
+ clicked: true,
1414
+ matchedText: String(winner.text || "").slice(0, 180),
1415
+ matchedHref: winner.href || "",
1416
+ score: winner.score,
1417
+ totalCandidates: candidates.length,
1418
+ strategy: "safari_dom",
1419
+ };
1420
+ `, { description });
1421
+ }
1422
+ catch (error) {
1423
+ const detail = error instanceof Error ? error.message : String(error);
1424
+ if (detail.toLowerCase().includes("allow javascript from apple events")) {
1425
+ return {
1426
+ clicked: false,
1427
+ reason: "Safari ainda bloqueia JavaScript por Apple Events, entao o Otto Bridge caiu para o modo visual.",
1428
+ strategy: "safari_dom_blocked",
1429
+ };
1430
+ }
1431
+ return {
1432
+ clicked: false,
1433
+ reason: detail || "Falha ao tentar clicar via DOM no Safari.",
1434
+ strategy: "safari_dom_failed",
1435
+ };
1436
+ }
1437
+ }
755
1438
  async readFrontmostPage(app) {
756
1439
  const targetApp = app || "Safari";
757
1440
  if (targetApp !== "Safari") {
@@ -840,6 +1523,128 @@ post(.leftMouseUp)
840
1523
  `;
841
1524
  await this.runCommand("swift", ["-e", script, String(Math.round(x)), String(Math.round(y))]);
842
1525
  }
1526
+ async runLocalOcr(filePath) {
1527
+ const script = `
1528
+ import Foundation
1529
+ import Vision
1530
+ import ImageIO
1531
+ import CoreGraphics
1532
+
1533
+ let fileURL = URL(fileURLWithPath: CommandLine.arguments[1])
1534
+ guard let source = CGImageSourceCreateWithURL(fileURL as CFURL, nil),
1535
+ let image = CGImageSourceCreateImageAtIndex(source, 0, nil) else {
1536
+ fputs("failed to load image\\n", stderr)
1537
+ exit(1)
1538
+ }
1539
+
1540
+ let width = CGFloat(image.width)
1541
+ let height = CGFloat(image.height)
1542
+ var items: [[String: Any]] = []
1543
+
1544
+ let request = VNRecognizeTextRequest()
1545
+ request.recognitionLevel = .accurate
1546
+ request.usesLanguageCorrection = true
1547
+ request.recognitionLanguages = ["pt-BR", "en-US"]
1548
+ request.minimumTextHeight = 0.012
1549
+
1550
+ let handler = VNImageRequestHandler(cgImage: image, options: [:])
1551
+ try handler.perform([request])
1552
+
1553
+ let observations = request.results ?? []
1554
+ for observation in observations {
1555
+ guard let candidate = observation.topCandidates(1).first else { continue }
1556
+ let text = candidate.string.trimmingCharacters(in: .whitespacesAndNewlines)
1557
+ if text.isEmpty { continue }
1558
+
1559
+ let box = observation.boundingBox
1560
+ let x = box.origin.x * width
1561
+ let y = (1.0 - box.origin.y - box.size.height) * height
1562
+ let w = box.size.width * width
1563
+ let h = box.size.height * height
1564
+
1565
+ items.append([
1566
+ "text": text,
1567
+ "x": Int(round(x)),
1568
+ "y": Int(round(y)),
1569
+ "width": Int(round(w)),
1570
+ "height": Int(round(h)),
1571
+ "confidence": candidate.confidence
1572
+ ])
1573
+ }
1574
+
1575
+ let payload: [String: Any] = ["items": items]
1576
+ let data = try JSONSerialization.data(withJSONObject: payload, options: [])
1577
+ if let output = String(data: data, encoding: .utf8) {
1578
+ print(output)
1579
+ }
1580
+ `;
1581
+ try {
1582
+ const { stdout } = await this.runCommandCapture("swift", ["-e", script, filePath]);
1583
+ const parsed = JSON.parse(stdout.trim() || "{}");
1584
+ const items = Array.isArray(parsed.items) ? parsed.items : [];
1585
+ return items
1586
+ .map((item) => {
1587
+ const row = asRecord(item);
1588
+ const text = asString(row.text);
1589
+ const x = Number(row.x);
1590
+ const y = Number(row.y);
1591
+ const width = Number(row.width);
1592
+ const height = Number(row.height);
1593
+ const confidence = Number(row.confidence);
1594
+ if (!text || !Number.isFinite(x) || !Number.isFinite(y) || !Number.isFinite(width) || !Number.isFinite(height)) {
1595
+ return null;
1596
+ }
1597
+ return {
1598
+ text,
1599
+ x,
1600
+ y,
1601
+ width,
1602
+ height,
1603
+ confidence: Number.isFinite(confidence) ? confidence : undefined,
1604
+ };
1605
+ })
1606
+ .filter(Boolean);
1607
+ }
1608
+ catch (error) {
1609
+ const detail = error instanceof Error ? error.message : String(error);
1610
+ console.warn(`[otto-bridge] local ocr failed=${detail}`);
1611
+ return [];
1612
+ }
1613
+ }
1614
+ async tryLocalOcrClick(screenshotPath, description) {
1615
+ if (!descriptionLikelyHasTextAnchor(description)) {
1616
+ return {
1617
+ clicked: false,
1618
+ reason: "A descricao nao traz ancora textual forte para OCR local.",
1619
+ strategy: "local_ocr_skipped",
1620
+ };
1621
+ }
1622
+ const candidates = await this.runLocalOcr(screenshotPath);
1623
+ if (!candidates.length) {
1624
+ return {
1625
+ clicked: false,
1626
+ reason: "OCR local nao encontrou texto utilizavel na tela.",
1627
+ strategy: "local_ocr_empty",
1628
+ };
1629
+ }
1630
+ const match = findOcrTextMatch(candidates, description);
1631
+ if (!match || match.score < 24) {
1632
+ return {
1633
+ clicked: false,
1634
+ reason: "OCR local nao encontrou texto suficientemente compativel com a descricao.",
1635
+ strategy: "local_ocr_no_match",
1636
+ };
1637
+ }
1638
+ const clickX = match.candidate.x + (match.candidate.width / 2);
1639
+ const clickY = match.candidate.y + (match.candidate.height / 2);
1640
+ await this.clickPoint(clickX, clickY);
1641
+ return {
1642
+ clicked: true,
1643
+ score: match.score,
1644
+ candidate: match.candidate,
1645
+ strategy: "local_ocr",
1646
+ };
1647
+ }
843
1648
  async getImageDimensions(filePath) {
844
1649
  try {
845
1650
  const { stdout } = await this.runCommandCapture("sips", ["-g", "pixelWidth", "-g", "pixelHeight", filePath]);
@@ -873,6 +1678,9 @@ post(.leftMouseUp)
873
1678
  { width: 640, quality: 22 },
874
1679
  { width: 540, quality: 18 },
875
1680
  { width: 480, quality: 16 },
1681
+ { width: 420, quality: 14 },
1682
+ { width: 360, quality: 12 },
1683
+ { width: 320, quality: 10 },
876
1684
  ];
877
1685
  for (const step of conversionSteps) {
878
1686
  const candidatePath = path.join(artifactsDir, `${path.basename(localPath, path.extname(localPath))}-${step.width}w-q${step.quality}.jpg`);
@@ -894,7 +1702,7 @@ post(.leftMouseUp)
894
1702
  mimeType = "image/jpeg";
895
1703
  filename = path.basename(candidatePath);
896
1704
  resized = true;
897
- if (candidateStat.size <= 220_000) {
1705
+ if (candidateStat.size <= 120_000) {
898
1706
  break;
899
1707
  }
900
1708
  }
@@ -909,8 +1717,25 @@ post(.leftMouseUp)
909
1717
  }
910
1718
  async readLocalFile(filePath, maxChars = 4000) {
911
1719
  const resolved = expandUserPath(filePath);
912
- const content = await readFile(resolved, "utf8");
913
- return clipText(content.trim() || "(arquivo vazio)", maxChars);
1720
+ const extension = path.extname(resolved).toLowerCase();
1721
+ if (TEXTUTIL_READABLE_EXTENSIONS.has(extension)) {
1722
+ const { stdout } = await this.runCommandCapture("textutil", [
1723
+ "-convert",
1724
+ "txt",
1725
+ "-stdout",
1726
+ resolved,
1727
+ ]);
1728
+ const content = sanitizeTextForJsonTransport(stdout);
1729
+ return clipText(content || "(arquivo sem texto legivel)", maxChars);
1730
+ }
1731
+ const raw = await readFile(resolved);
1732
+ if (isLikelyBinaryBuffer(raw)) {
1733
+ const filename = path.basename(resolved);
1734
+ const detectedType = extension || "binario";
1735
+ return clipText(`O arquivo ${filename} parece ser binario (${detectedType}) e nao pode ser lido como texto puro pelo Otto Bridge ainda.`, maxChars);
1736
+ }
1737
+ const content = sanitizeTextForJsonTransport(raw.toString("utf8"));
1738
+ return clipText(content || "(arquivo vazio)", maxChars);
914
1739
  }
915
1740
  async listLocalFiles(directoryPath, limit = 40) {
916
1741
  const resolved = expandUserPath(directoryPath);
package/dist/types.js CHANGED
@@ -1,5 +1,5 @@
1
1
  export const BRIDGE_CONFIG_VERSION = 1;
2
- export const BRIDGE_VERSION = "0.5.1";
2
+ export const BRIDGE_VERSION = "0.5.3";
3
3
  export const BRIDGE_PACKAGE_NAME = "@leg3ndy/otto-bridge";
4
4
  export const DEFAULT_API_BASE_URL = "http://localhost:8000";
5
5
  export const DEFAULT_POLL_INTERVAL_MS = 3000;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@leg3ndy/otto-bridge",
3
- "version": "0.5.1",
3
+ "version": "0.5.3",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "description": "Local companion for Otto Bridge device pairing and WebSocket runtime.",