@leg3ndy/otto-bridge 0.5.4 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/config.js CHANGED
@@ -33,6 +33,26 @@ function sanitizePollIntervalMs(value, fallback = DEFAULT_CLAWD_CURSOR_POLL_INTE
33
33
  }
34
34
  return Math.max(250, Math.floor(parsed));
35
35
  }
36
/**
 * Normalizes a raw list of installed-extension identifiers into unique slugs.
 *
 * Each string entry is trimmed, lower-cased, has every run of characters
 * outside `a-z`, `0-9`, `.`, `_` and `-` collapsed into a single `-`, and has
 * leading/trailing dashes stripped. Entries that end up empty and duplicates
 * are dropped; first-seen order is preserved.
 *
 * @param {unknown} values - Raw value, e.g. `installedExtensions` from the parsed config.
 * @returns {string[]} Normalized, de-duplicated slugs; `[]` when `values` is not an array.
 */
export function normalizeInstalledExtensions(values) {
  if (!Array.isArray(values)) {
    return [];
  }
  // A Set keeps insertion order, so it doubles as the de-duplicated result list.
  const slugs = new Set();
  for (const item of values) {
    // Only strings can be slugs. Stringifying other values would persist junk
    // such as "object-object" (from `{}`) or "true" as an installed extension.
    if (typeof item !== "string") {
      continue;
    }
    const slug = item
      .trim()
      .toLowerCase()
      .replace(/[^a-z0-9._-]+/g, "-")
      .replace(/^-+|-+$/g, "");
    if (slug) {
      slugs.add(slug);
    }
  }
  return [...slugs];
}
36
56
  function migrateLegacyExecutor(current) {
37
57
  if (platform() === "darwin"
38
58
  && current?.type === "clawd-cursor"
@@ -71,6 +91,7 @@ export async function loadBridgeConfig() {
71
91
  // The runtime must always report the currently installed package version.
72
92
  bridgeVersion: BRIDGE_VERSION,
73
93
  executor: resolveExecutorConfig(undefined, migrateLegacyExecutor(parsed.executor)),
94
+ installedExtensions: normalizeInstalledExtensions(parsed.installedExtensions),
74
95
  };
75
96
  }
76
97
  catch {
@@ -147,6 +168,7 @@ export function buildBridgeConfig(params) {
147
168
  approvalMode: params.approvalMode || "preview",
148
169
  capabilities: Array.isArray(params.capabilities) ? [...params.capabilities] : [],
149
170
  metadata: params.metadata || {},
171
+ installedExtensions: [],
150
172
  pairedAt: new Date().toISOString(),
151
173
  executor: resolveExecutorConfig(undefined, params.executor),
152
174
  };
@@ -132,7 +132,132 @@ function extractMeaningfulDescriptionTokens(value) {
132
132
  function descriptionLikelyHasTextAnchor(description) {
133
133
  return extractQuotedPhrases(description).length > 0 || extractMeaningfulDescriptionTokens(description).length > 0;
134
134
  }
135
- function findOcrTextMatch(candidates, description) {
135
/**
 * Merges a group of OCR items into a single bounding region.
 *
 * Items are ordered top-to-bottom, then left-to-right (by `y`, then `x`); the
 * region text is the items' text joined with single spaces in that order.
 * The bounding box is the union of all item boxes, with width and height
 * clamped to at least 1. `confidence` is the mean of the finite confidence
 * values, or `undefined` when no item carries one.
 *
 * @param {Array<{text: string, x: number, y: number, width: number, height: number, confidence?: number}>} items
 * @param {string} kind - Label stored on the resulting region (e.g. "line").
 * @returns {{text: string, x: number, y: number, width: number, height: number, confidence: (number|undefined), kind: string}|null}
 *   The merged region, or `null` when there are no items.
 */
function regionFromOcrItems(items, kind) {
  if (!items?.length) {
    return null;
  }
  const ordered = [...items].sort((left, right) => (left.y !== right.y ? left.y - right.y : left.x - right.x));
  // Accumulate bounds in one pass instead of spreading mapped arrays into
  // Math.min/Math.max, which throws a RangeError once the item count exceeds
  // the engine's argument limit.
  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  let confidenceSum = 0;
  let confidenceCount = 0;
  for (const item of ordered) {
    minX = Math.min(minX, item.x);
    minY = Math.min(minY, item.y);
    maxX = Math.max(maxX, item.x + item.width);
    maxY = Math.max(maxY, item.y + item.height);
    const value = Number(item.confidence);
    if (Number.isFinite(value)) {
      confidenceSum += value;
      confidenceCount += 1;
    }
  }
  return {
    text: ordered.map((item) => item.text).join(" ").replace(/\s+/g, " ").trim(),
    x: minX,
    y: minY,
    width: Math.max(1, maxX - minX),
    height: Math.max(1, maxY - minY),
    confidence: confidenceCount > 0 ? confidenceSum / confidenceCount : undefined,
    kind,
  };
}
164
/**
 * Builds a combined list of block, line and word regions from raw OCR candidates.
 *
 * Steps:
 *  1. Candidates are ordered by vertical centre (then `x`) and grouped into
 *     lines: a candidate joins the current line when its centre is within
 *     `max(16, 0.75 * average line height)` of the line's average centre.
 *  2. Consecutive lines are grouped into blocks when the vertical gap is at
 *     most `max(22, 1.6 * previous line height)` and the lines either overlap
 *     horizontally by at least 18% or have left edges within 120 units.
 *  3. Blocks, lines and the original words are concatenated (in that order)
 *     and de-duplicated by normalized text, rounded position and kind.
 *
 * @param {Array<{text: string, x: number, y: number, width: number, height: number, confidence?: number}>} candidates
 * @returns {Array<{text: string, x: number, y: number, width: number, height: number, confidence: (number|undefined), kind: string}>}
 *   Regions with `kind` of "block", "line" or "word"; `[]` for empty input.
 */
function buildStructuredOcrRegions(candidates) {
  if (!candidates.length) {
    return [];
  }
  const centerY = (item) => item.y + (item.height / 2);
  const byReadingOrder = [...candidates].sort((left, right) => {
    const leftCenterY = centerY(left);
    const rightCenterY = centerY(right);
    if (leftCenterY !== rightCenterY) {
      return leftCenterY - rightCenterY;
    }
    return left.x - right.x;
  });

  // Group words into lines by proximity of their vertical centres.
  const lines = [];
  for (const candidate of byReadingOrder) {
    const lastLine = lines[lines.length - 1];
    if (!lastLine) {
      lines.push([candidate]);
      continue;
    }
    const referenceCenterY = lastLine.reduce((sum, item) => sum + centerY(item), 0) / lastLine.length;
    const avgHeight = lastLine.reduce((sum, item) => sum + item.height, 0) / lastLine.length;
    const maxDistance = Math.max(16, avgHeight * 0.75);
    if (Math.abs(centerY(candidate) - referenceCenterY) <= maxDistance) {
      lastLine.push(candidate);
    }
    else {
      lines.push([candidate]);
    }
  }

  const lineRegions = lines
    .map((line) => {
      const leftToRight = [...line].sort((left, right) => left.x - right.x);
      const region = regionFromOcrItems(leftToRight, "line");
      if (!region) {
        return null;
      }
      // regionFromOcrItems re-sorts by exact `y` before `x`, which scrambles the
      // word order of a line whose words sit a pixel or two apart vertically.
      // Rebuild the text from the left-to-right order so phrases stay readable.
      return {
        ...region,
        text: leftToRight.map((item) => item.text).join(" ").replace(/\s+/g, " ").trim(),
      };
    })
    .filter(Boolean);

  // Group consecutive lines into blocks.
  const blocks = [];
  for (const line of lineRegions) {
    const lastBlock = blocks[blocks.length - 1];
    if (!lastBlock) {
      blocks.push([line]);
      continue;
    }
    const previous = lastBlock[lastBlock.length - 1];
    const verticalGap = line.y - (previous.y + previous.height);
    const horizontalOverlap = Math.max(0, Math.min(previous.x + previous.width, line.x + line.width) - Math.max(previous.x, line.x));
    const overlapRatio = horizontalOverlap / Math.max(previous.width, line.width, 1);
    const leftAlignmentDelta = Math.abs(previous.x - line.x);
    if (verticalGap <= Math.max(22, previous.height * 1.6) && (overlapRatio >= 0.18 || leftAlignmentDelta <= 120)) {
      lastBlock.push(line);
    }
    else {
      blocks.push([line]);
    }
  }

  const blockRegions = blocks
    .map((block) => {
      const minX = Math.min(...block.map((line) => line.x));
      const minY = Math.min(...block.map((line) => line.y));
      const maxX = Math.max(...block.map((line) => line.x + line.width));
      const maxY = Math.max(...block.map((line) => line.y + line.height));
      const confidenceValues = block
        .map((line) => Number(line.confidence))
        .filter((value) => Number.isFinite(value));
      const confidence = confidenceValues.length
        ? confidenceValues.reduce((sum, value) => sum + value, 0) / confidenceValues.length
        : undefined;
      return {
        text: block.map((line) => line.text).join(" ").replace(/\s+/g, " ").trim(),
        x: minX,
        y: minY,
        width: Math.max(1, maxX - minX),
        height: Math.max(1, maxY - minY),
        confidence,
        kind: "block",
      };
    })
    .filter((region) => region.text);

  const wordRegions = candidates.map((candidate) => ({
    text: candidate.text,
    x: candidate.x,
    y: candidate.y,
    width: candidate.width,
    height: candidate.height,
    confidence: candidate.confidence,
    kind: "word",
  }));

  // Blocks come first, then lines, then words. The kind is part of the key, so
  // the same text at the same position is kept once per kind.
  const seenKeys = new Set();
  const deduped = [];
  for (const region of [...blockRegions, ...lineRegions, ...wordRegions]) {
    const key = `${normalizeText(region.text)}|${Math.round(region.x)}|${Math.round(region.y)}|${region.kind}`;
    if (!region.text || seenKeys.has(key)) {
      continue;
    }
    seenKeys.add(key);
    deduped.push(region);
  }
  return deduped;
}
260
+ function findOcrTextMatch(regions, description) {
136
261
  const phrases = extractQuotedPhrases(description);
137
262
  const tokens = extractMeaningfulDescriptionTokens(description);
138
263
  const normalizedDescription = normalizeText(description || "");
@@ -140,10 +265,11 @@ function findOcrTextMatch(candidates, description) {
140
265
  if (!phrases.length && !tokens.length) {
141
266
  return null;
142
267
  }
143
- const scored = candidates
144
- .map((candidate, index) => {
145
- const normalizedText = normalizeText(candidate.text || "");
268
+ const scored = regions
269
+ .map((region, index) => {
270
+ const normalizedText = normalizeText(region.text || "");
146
271
  let score = 0;
272
+ let matchedTokens = 0;
147
273
  for (const phrase of phrases) {
148
274
  if (normalizedText.includes(phrase)) {
149
275
  score += 120;
@@ -152,17 +278,27 @@ function findOcrTextMatch(candidates, description) {
152
278
  for (const token of tokens) {
153
279
  if (normalizedText.includes(token)) {
154
280
  score += 18;
281
+ matchedTokens += 1;
155
282
  }
156
283
  }
284
+ if (tokens.length > 1 && matchedTokens === tokens.length) {
285
+ score += 36;
286
+ }
287
+ if (region.kind === "line") {
288
+ score += 8;
289
+ }
290
+ else if (region.kind === "block") {
291
+ score += 16;
292
+ }
157
293
  if (wantsFirst) {
158
- score += Math.max(0, 24 - Math.round(candidate.y / 60));
294
+ score += Math.max(0, 24 - Math.round(region.y / 60));
159
295
  score += Math.max(0, 12 - index);
160
296
  }
161
- if (candidate.confidence) {
162
- score += Math.round(candidate.confidence * 20);
297
+ if (region.confidence) {
298
+ score += Math.round(region.confidence * 20);
163
299
  }
164
300
  return score > 0 ? {
165
- candidate,
301
+ region,
166
302
  score,
167
303
  } : null;
168
304
  })
@@ -171,10 +307,10 @@ function findOcrTextMatch(candidates, description) {
171
307
  if (right.score !== left.score) {
172
308
  return right.score - left.score;
173
309
  }
174
- if (left.candidate.y !== right.candidate.y) {
175
- return left.candidate.y - right.candidate.y;
310
+ if (left.region.y !== right.region.y) {
311
+ return left.region.y - right.region.y;
176
312
  }
177
- return left.candidate.x - right.candidate.x;
313
+ return left.region.x - right.region.x;
178
314
  });
179
315
  return scored[0] || null;
180
316
  }
@@ -288,6 +424,12 @@ function clipText(value, maxLength) {
288
424
  }
289
425
  return `${value.slice(0, maxLength)}...`;
290
426
  }
427
/**
 * Clips text to a preview and appends a notice explaining the truncation.
 *
 * Text that already fits in `maxLength` UTF-16 code units is returned
 * unchanged. Otherwise the first `maxLength` code units are kept (one fewer
 * when the cut would split a surrogate pair) and a notice with the shown and
 * total lengths is appended.
 *
 * @param {string} value - Text to preview.
 * @param {number} maxLength - Maximum number of code units to keep.
 * @returns {string} The original text, or the clipped text plus the notice.
 */
function clipTextPreview(value, maxLength) {
  if (value.length <= maxLength) {
    return value;
  }
  let shown = value.slice(0, maxLength);
  // Cutting between the two halves of a surrogate pair would leave a lone high
  // surrogate at the end of the preview; drop it so the text stays well-formed.
  const lastUnit = shown.charCodeAt(shown.length - 1);
  const nextUnit = value.charCodeAt(maxLength);
  const splitsPair = lastUnit >= 0xd800 && lastUnit <= 0xdbff && nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
  if (splitsPair) {
    shown = shown.slice(0, -1);
  }
  return `${shown}\n\n[conteudo truncado: mostrando ${shown.length} de ${value.length} caracteres. Peca um trecho mais especifico se quiser continuar.]`;
}
291
433
  const TEXTUTIL_READABLE_EXTENSIONS = new Set([
292
434
  ".doc",
293
435
  ".docx",
@@ -499,6 +641,15 @@ function parseStructuredActions(job) {
499
641
  actions.push({ type: "list_files", path: filePath, limit });
500
642
  continue;
501
643
  }
644
+ if (type === "count_files") {
645
+ const filePath = asString(action.path) || "~";
646
+ const extensions = Array.isArray(action.extensions)
647
+ ? action.extensions.map((item) => asString(item)?.toLowerCase().replace(/^\./, "")).filter(Boolean)
648
+ : undefined;
649
+ const recursive = typeof action.recursive === "boolean" ? action.recursive : undefined;
650
+ actions.push({ type: "count_files", path: filePath, extensions, recursive });
651
+ continue;
652
+ }
502
653
  if (type === "run_shell" || type === "shell" || type === "terminal") {
503
654
  const command = asString(action.command) || asString(action.cmd);
504
655
  const cwd = asString(action.cwd);
@@ -529,6 +680,19 @@ function parseStructuredActions(job) {
529
680
  }
530
681
  continue;
531
682
  }
683
+ if (type === "drag_visual_target" || type === "drag_target") {
684
+ const sourceDescription = asString(action.source_description) || asString(action.source) || asString(action.from);
685
+ const targetDescription = asString(action.target_description) || asString(action.target) || asString(action.to);
686
+ if (sourceDescription && targetDescription) {
687
+ actions.push({
688
+ type: "drag_visual_target",
689
+ source_description: sourceDescription,
690
+ target_description: targetDescription,
691
+ app: asString(action.app) || undefined,
692
+ });
693
+ }
694
+ continue;
695
+ }
532
696
  }
533
697
  return actions;
534
698
  }
@@ -746,6 +910,18 @@ export class NativeMacOSJobExecutor {
746
910
  completionNotes.push(`Arquivos em ${action.path}:\n${listing}`);
747
911
  continue;
748
912
  }
913
+ if (action.type === "count_files") {
914
+ await reporter.progress(progressPercent, `Contando arquivos em ${action.path}`);
915
+ const counted = await this.countLocalFiles(action.path, action.extensions, action.recursive !== false);
916
+ completionNotes.push(`Encontrei ${counted.total} arquivo${counted.total === 1 ? "" : "s"} ${counted.extensionsLabel} em ${counted.path}.`);
917
+ resultPayload.file_count = {
918
+ total: counted.total,
919
+ path: counted.path,
920
+ extensions: counted.extensions,
921
+ recursive: counted.recursive,
922
+ };
923
+ continue;
924
+ }
749
925
  if (action.type === "run_shell") {
750
926
  await reporter.progress(progressPercent, `Rodando comando local: ${action.command}`);
751
927
  const shellOutput = await this.runShellCommand(action.command, action.cwd);
@@ -833,15 +1009,15 @@ export class NativeMacOSJobExecutor {
833
1009
  validated = true;
834
1010
  }
835
1011
  if (validated) {
836
- const candidate = ocrClick.candidate || null;
1012
+ const region = ocrClick.region || null;
837
1013
  resultPayload.last_click = {
838
1014
  strategy: ocrClick.strategy || "local_ocr",
839
1015
  score: ocrClick.score || null,
840
- matched_text: candidate?.text || null,
841
- x: candidate ? candidate.x + (candidate.width / 2) : null,
842
- y: candidate ? candidate.y + (candidate.height / 2) : null,
843
- width: candidate?.width || null,
844
- height: candidate?.height || null,
1016
+ matched_text: region?.text || null,
1017
+ x: region ? region.x + (region.width / 2) : null,
1018
+ y: region ? region.y + (region.height / 2) : null,
1019
+ width: region?.width || null,
1020
+ height: region?.height || null,
845
1021
  };
846
1022
  completionNotes.push(`Localizei e cliquei em ${targetDescription} por OCR local.`);
847
1023
  clickSucceeded = true;
@@ -917,6 +1093,35 @@ export class NativeMacOSJobExecutor {
917
1093
  }
918
1094
  continue;
919
1095
  }
1096
+ if (action.type === "drag_visual_target") {
1097
+ const dragApp = await this.resolveLikelyBrowserApp(action.app);
1098
+ if (dragApp) {
1099
+ await reporter.progress(progressPercent, `Trazendo ${dragApp} para frente antes do arraste`);
1100
+ await this.focusApp(dragApp);
1101
+ }
1102
+ else if (action.app) {
1103
+ await reporter.progress(progressPercent, `Trazendo ${action.app} para frente antes do arraste`);
1104
+ await this.focusApp(action.app);
1105
+ }
1106
+ await reporter.progress(progressPercent, `Capturando a tela para localizar ${action.source_description} e ${action.target_description}`);
1107
+ const screenshotPath = await this.takeScreenshot();
1108
+ const sourcePoint = await this.resolveVisualTargetPoint(job.job_id, screenshotPath, action.source_description, artifacts, "drag_source");
1109
+ const targetPoint = await this.resolveVisualTargetPoint(job.job_id, screenshotPath, action.target_description, artifacts, "drag_target");
1110
+ if (!sourcePoint) {
1111
+ throw new Error(`Nao consegui localizar ${action.source_description} com confianca suficiente para arrastar.`);
1112
+ }
1113
+ if (!targetPoint) {
1114
+ throw new Error(`Nao consegui localizar ${action.target_description} com confianca suficiente para concluir o arraste.`);
1115
+ }
1116
+ await reporter.progress(progressPercent, `Arrastando ${action.source_description} para ${action.target_description}`);
1117
+ await this.dragPoint(sourcePoint.x, sourcePoint.y, targetPoint.x, targetPoint.y);
1118
+ resultPayload.last_drag = {
1119
+ source: sourcePoint,
1120
+ target: targetPoint,
1121
+ };
1122
+ completionNotes.push(`Arrastei ${action.source_description} para ${action.target_description}.`);
1123
+ continue;
1124
+ }
920
1125
  await reporter.progress(progressPercent, `Abrindo ${action.url}${action.app ? ` em ${action.app}` : ""}`);
921
1126
  await this.openUrl(action.url, action.app);
922
1127
  await delay(1200);
@@ -1020,6 +1225,8 @@ end tell
1020
1225
  title: page.title,
1021
1226
  url: page.url,
1022
1227
  text: page.text,
1228
+ playerTitle: page.playerTitle || "",
1229
+ playerState: page.playerState || "",
1023
1230
  };
1024
1231
  }
1025
1232
  resolveExpectedBrowserHref(rawHref, baseUrl) {
@@ -1058,6 +1265,22 @@ end tell
1058
1265
  return true;
1059
1266
  }
1060
1267
  }
1268
+ const beforePlayerTitle = normalizeText(before?.playerTitle || "");
1269
+ const afterPlayerTitle = normalizeText(after.playerTitle || "");
1270
+ const beforePlayerState = normalizeText(before?.playerState || "");
1271
+ const afterPlayerState = normalizeText(after.playerState || "");
1272
+ const playerLooksActive = afterPlayerState.includes("pause") || afterPlayerState.includes("pausar");
1273
+ if (afterUrl.includes("music.youtube.com")) {
1274
+ if (beforePlayerState && afterPlayerState && beforePlayerState !== afterPlayerState && playerLooksActive) {
1275
+ return true;
1276
+ }
1277
+ if (beforePlayerTitle && afterPlayerTitle && beforePlayerTitle !== afterPlayerTitle) {
1278
+ return true;
1279
+ }
1280
+ if (!beforePlayerTitle && afterPlayerTitle && playerLooksActive) {
1281
+ return true;
1282
+ }
1283
+ }
1061
1284
  const beforeTitle = normalizeText(before?.title || "");
1062
1285
  const afterTitle = normalizeText(after.title || "");
1063
1286
  if (beforeTitle && afterTitle && beforeTitle !== afterTitle) {
@@ -1259,6 +1482,7 @@ const normalize = (value) => String(value || "")
1259
1482
  .replace(/[\\u0300-\\u036f]/g, "")
1260
1483
  .toLowerCase();
1261
1484
  const normalizedDescription = normalize(rawDescription);
1485
+ const isYouTubeMusic = location.hostname.includes("music.youtube.com");
1262
1486
  const wantsFirst = /\\b(primeir[ao]?|first)\\b/.test(normalizedDescription);
1263
1487
  const wantsVideo = /\\b(video|videos|musica|faixa|youtube|resultado|watch)\\b/.test(normalizedDescription) || location.hostname.includes("youtube");
1264
1488
  const stopWords = new Set([
@@ -1275,7 +1499,19 @@ const tokens = Array.from(new Set(
1275
1499
  .filter((token) => token.length >= 3 && !stopWords.has(token))
1276
1500
  ));
1277
1501
 
1278
- const candidateSelectors = location.hostname.includes("youtube")
1502
+ const candidateSelectors = isYouTubeMusic
1503
+ ? [
1504
+ "ytmusic-responsive-list-item-renderer a[href*='watch?v=']",
1505
+ "ytmusic-responsive-list-item-renderer button[aria-label]",
1506
+ "ytmusic-responsive-list-item-renderer tp-yt-paper-icon-button",
1507
+ "ytmusic-responsive-list-item-renderer ytmusic-item-thumbnail-overlay-renderer button",
1508
+ "ytmusic-shelf-renderer ytmusic-responsive-list-item-renderer a[href*='watch?v=']",
1509
+ "a[href*='watch?v=']",
1510
+ "button",
1511
+ "[role='button']",
1512
+ "[role='link']"
1513
+ ]
1514
+ : location.hostname.includes("youtube")
1279
1515
  ? [
1280
1516
  "ytd-video-renderer a#video-title",
1281
1517
  "ytd-video-renderer ytd-thumbnail a",
@@ -1339,8 +1575,11 @@ function scoreCandidate(element, rank) {
1339
1575
 
1340
1576
  if (wantsFirst) score += Math.max(0, 40 - rank);
1341
1577
  if (wantsVideo && normalizedHref.includes("/watch")) score += 30;
1578
+ if (isYouTubeMusic && normalizedHref.includes("watch?v=")) score += 36;
1579
+ if (isYouTubeMusic && element.closest("ytmusic-responsive-list-item-renderer, ytmusic-player-bar")) score += 24;
1342
1580
  if (location.hostname.includes("youtube") && element.closest("ytd-video-renderer, ytd-rich-item-renderer, ytd-rich-grid-media")) score += 20;
1343
1581
  if (element.id === "video-title") score += 12;
1582
+ if (isYouTubeMusic && /\\b(play|pause|reproduzir|tocar)\\b/.test(normalizedText)) score += 12;
1344
1583
  if (!normalizedText && normalizedHref.includes("/watch")) score += 8;
1345
1584
 
1346
1585
  for (const phrase of quotedPhrases) {
@@ -1445,7 +1684,7 @@ tell application "Safari"
1445
1684
  activate
1446
1685
  if (count of windows) = 0 then error "Safari nao possui janelas abertas."
1447
1686
  delay 1
1448
- set pageJson to do JavaScript "(function(){const title=document.title||''; const url=location.href||''; const text=((document.body&&document.body.innerText)||'').trim().slice(0, 12000); return JSON.stringify({title:title,url:url,text:text});})();" in current tab of front window
1687
+ set pageJson to do JavaScript "(function(){const title=document.title||''; const url=location.href||''; const text=((document.body&&document.body.innerText)||'').trim().slice(0, 12000); const playerButton=document.querySelector('ytmusic-player-bar #play-pause-button, ytmusic-player-bar tp-yt-paper-icon-button#play-pause-button, ytmusic-player-bar tp-yt-paper-icon-button.play-pause-button'); const playerTitle=(Array.from(document.querySelectorAll('ytmusic-player-bar .title, ytmusic-player-bar .content-info-wrapper .title, ytmusic-player-bar [slot=\"title\"]')).map((node)=>((node&&node.textContent)||'').trim()).find(Boolean))||''; const playerState=(playerButton&&((playerButton.getAttribute('title')||playerButton.getAttribute('aria-label')||playerButton.textContent)||'').trim())||''; return JSON.stringify({title:title,url:url,text:text,playerTitle:playerTitle,playerState:playerState});})();" in current tab of front window
1449
1688
  end tell
1450
1689
  return pageJson
1451
1690
  `;
@@ -1456,6 +1695,8 @@ return pageJson
1456
1695
  title: asString(parsed.title) || "",
1457
1696
  url: asString(parsed.url) || "",
1458
1697
  text: asString(parsed.text) || "",
1698
+ playerTitle: asString(parsed.playerTitle) || "",
1699
+ playerState: asString(parsed.playerState) || "",
1459
1700
  };
1460
1701
  }
1461
1702
  catch (error) {
@@ -1478,6 +1719,8 @@ return pageTitle & linefeed & pageUrl
1478
1719
  title: String(title || "").trim(),
1479
1720
  url: String(url || "").trim(),
1480
1721
  text: "",
1722
+ playerTitle: "",
1723
+ playerState: "",
1481
1724
  };
1482
1725
  }
1483
1726
  }
@@ -1523,6 +1766,54 @@ post(.leftMouseUp)
1523
1766
  `;
1524
1767
  await this.runCommand("swift", ["-e", script, String(Math.round(x)), String(Math.round(y))]);
1525
1768
  }
1769
+ async dragPoint(fromX, fromY, toX, toY) {
1770
+ const script = `
1771
+ import Cocoa
1772
+ import ApplicationServices
1773
+
1774
+ let fromX = Double(CommandLine.arguments[1]) ?? 0
1775
+ let fromY = Double(CommandLine.arguments[2]) ?? 0
1776
+ let toX = Double(CommandLine.arguments[3]) ?? 0
1777
+ let toY = Double(CommandLine.arguments[4]) ?? 0
1778
+
1779
+ let startPoint = CGPoint(x: fromX, y: fromY)
1780
+ let endPoint = CGPoint(x: toX, y: toY)
1781
+ let steps = max(8, Int(hypot(endPoint.x - startPoint.x, endPoint.y - startPoint.y) / 60.0))
1782
+
1783
+ func post(_ type: CGEventType, at point: CGPoint) {
1784
+ guard let event = CGEvent(mouseEventSource: nil, mouseType: type, mouseCursorPosition: point, mouseButton: .left) else {
1785
+ fputs("failed to create mouse event\\n", stderr)
1786
+ exit(1)
1787
+ }
1788
+ event.post(tap: .cghidEventTap)
1789
+ }
1790
+
1791
+ post(.mouseMoved, at: startPoint)
1792
+ usleep(100000)
1793
+ post(.leftMouseDown, at: startPoint)
1794
+ usleep(90000)
1795
+
1796
+ for step in 1...steps {
1797
+ let progress = Double(step) / Double(steps)
1798
+ let point = CGPoint(
1799
+ x: startPoint.x + ((endPoint.x - startPoint.x) * progress),
1800
+ y: startPoint.y + ((endPoint.y - startPoint.y) * progress)
1801
+ )
1802
+ post(.leftMouseDragged, at: point)
1803
+ usleep(35000)
1804
+ }
1805
+
1806
+ post(.leftMouseUp, at: endPoint)
1807
+ `;
1808
+ await this.runCommand("swift", [
1809
+ "-e",
1810
+ script,
1811
+ String(Math.round(fromX)),
1812
+ String(Math.round(fromY)),
1813
+ String(Math.round(toX)),
1814
+ String(Math.round(toY)),
1815
+ ]);
1816
+ }
1526
1817
  async runLocalOcr(filePath) {
1527
1818
  const script = `
1528
1819
  import Foundation
@@ -1612,9 +1903,27 @@ if let output = String(data: data, encoding: .utf8) {
1612
1903
  }
1613
1904
  }
1614
1905
  async tryLocalOcrClick(screenshotPath, description) {
1615
- if (!descriptionLikelyHasTextAnchor(description)) {
1906
+ const anchor = await this.resolveLocalOcrAnchor(screenshotPath, description);
1907
+ if (!anchor.region) {
1616
1908
  return {
1617
1909
  clicked: false,
1910
+ reason: anchor.reason,
1911
+ strategy: anchor.strategy,
1912
+ };
1913
+ }
1914
+ const clickX = anchor.region.x + (anchor.region.width / 2);
1915
+ const clickY = anchor.region.y + (anchor.region.height / 2);
1916
+ await this.clickPoint(clickX, clickY);
1917
+ return {
1918
+ clicked: true,
1919
+ score: anchor.score,
1920
+ region: anchor.region,
1921
+ strategy: anchor.strategy,
1922
+ };
1923
+ }
1924
+ async resolveLocalOcrAnchor(screenshotPath, description) {
1925
+ if (!descriptionLikelyHasTextAnchor(description)) {
1926
+ return {
1618
1927
  reason: "A descricao nao traz ancora textual forte para OCR local.",
1619
1928
  strategy: "local_ocr_skipped",
1620
1929
  };
@@ -1622,27 +1931,69 @@ if let output = String(data: data, encoding: .utf8) {
1622
1931
  const candidates = await this.runLocalOcr(screenshotPath);
1623
1932
  if (!candidates.length) {
1624
1933
  return {
1625
- clicked: false,
1626
1934
  reason: "OCR local nao encontrou texto utilizavel na tela.",
1627
1935
  strategy: "local_ocr_empty",
1628
1936
  };
1629
1937
  }
1630
- const match = findOcrTextMatch(candidates, description);
1938
+ const regions = buildStructuredOcrRegions(candidates);
1939
+ const match = findOcrTextMatch(regions, description);
1631
1940
  if (!match || match.score < 24) {
1632
1941
  return {
1633
- clicked: false,
1634
1942
  reason: "OCR local nao encontrou texto suficientemente compativel com a descricao.",
1635
1943
  strategy: "local_ocr_no_match",
1636
1944
  };
1637
1945
  }
1638
- const clickX = match.candidate.x + (match.candidate.width / 2);
1639
- const clickY = match.candidate.y + (match.candidate.height / 2);
1640
- await this.clickPoint(clickX, clickY);
1641
1946
  return {
1642
- clicked: true,
1947
+ region: match.region,
1643
1948
  score: match.score,
1644
- candidate: match.candidate,
1645
- strategy: "local_ocr",
1949
+ strategy: match.region.kind === "block" ? "structured_local_ocr_block" : "local_ocr",
1950
+ };
1951
+ }
1952
+ async resolveVisualTargetPoint(jobId, screenshotPath, description, artifacts, purpose) {
1953
+ const ocrAnchor = await this.resolveLocalOcrAnchor(screenshotPath, description);
1954
+ if (ocrAnchor.region) {
1955
+ return {
1956
+ x: ocrAnchor.region.x + (ocrAnchor.region.width / 2),
1957
+ y: ocrAnchor.region.y + (ocrAnchor.region.height / 2),
1958
+ strategy: ocrAnchor.strategy || "local_ocr",
1959
+ matched_text: ocrAnchor.region.text,
1960
+ score: ocrAnchor.score || null,
1961
+ };
1962
+ }
1963
+ const uploadable = await this.buildUploadableImage(screenshotPath);
1964
+ const artifact = await this.uploadArtifactForJob(jobId, uploadable.path, {
1965
+ kind: "screenshot",
1966
+ mimeTypeOverride: uploadable.mimeType,
1967
+ fileNameOverride: uploadable.filename,
1968
+ metadata: {
1969
+ purpose,
1970
+ visible_in_chat: false,
1971
+ target: description,
1972
+ width: uploadable.dimensions?.width || undefined,
1973
+ height: uploadable.dimensions?.height || undefined,
1974
+ original_width: uploadable.originalDimensions?.width || undefined,
1975
+ original_height: uploadable.originalDimensions?.height || undefined,
1976
+ resized_for_upload: uploadable.resized,
1977
+ },
1978
+ });
1979
+ if (!artifact?.storage_path) {
1980
+ return null;
1981
+ }
1982
+ artifacts.push(artifact);
1983
+ const artifactMetadata = artifact.metadata || {};
1984
+ const width = Number(artifactMetadata.width || 0);
1985
+ const height = Number(artifactMetadata.height || 0);
1986
+ const originalWidth = Number(artifactMetadata.original_width || width || 0);
1987
+ const originalHeight = Number(artifactMetadata.original_height || height || 0);
1988
+ const location = await this.locateVisualTarget(jobId, artifact.storage_path, description, width, height, artifact.mime_type);
1989
+ if (!location?.found || typeof location.x !== "number" || typeof location.y !== "number") {
1990
+ return null;
1991
+ }
1992
+ return {
1993
+ x: width > 0 && originalWidth > 0 ? (location.x / width) * originalWidth : location.x,
1994
+ y: height > 0 && originalHeight > 0 ? (location.y / height) * originalHeight : location.y,
1995
+ strategy: "visual_locator",
1996
+ score: typeof location.confidence === "number" ? location.confidence : null,
1646
1997
  };
1647
1998
  }
1648
1999
  async getImageDimensions(filePath) {
@@ -1715,7 +2066,7 @@ if let output = String(data: data, encoding: .utf8) {
1715
2066
  resized,
1716
2067
  };
1717
2068
  }
1718
- async readLocalFile(filePath, maxChars = 4000) {
2069
+ async readLocalFile(filePath, maxChars = 1800) {
1719
2070
  const resolved = expandUserPath(filePath);
1720
2071
  const extension = path.extname(resolved).toLowerCase();
1721
2072
  if (TEXTUTIL_READABLE_EXTENSIONS.has(extension)) {
@@ -1726,7 +2077,7 @@ if let output = String(data: data, encoding: .utf8) {
1726
2077
  resolved,
1727
2078
  ]);
1728
2079
  const content = sanitizeTextForJsonTransport(stdout);
1729
- return clipText(content || "(arquivo sem texto legivel)", maxChars);
2080
+ return clipTextPreview(content || "(arquivo sem texto legivel)", maxChars);
1730
2081
  }
1731
2082
  const raw = await readFile(resolved);
1732
2083
  if (isLikelyBinaryBuffer(raw)) {
@@ -1735,7 +2086,7 @@ if let output = String(data: data, encoding: .utf8) {
1735
2086
  return clipText(`O arquivo ${filename} parece ser binario (${detectedType}) e nao pode ser lido como texto puro pelo Otto Bridge ainda.`, maxChars);
1736
2087
  }
1737
2088
  const content = sanitizeTextForJsonTransport(raw.toString("utf8"));
1738
- return clipText(content || "(arquivo vazio)", maxChars);
2089
+ return clipTextPreview(content || "(arquivo vazio)", maxChars);
1739
2090
  }
1740
2091
  async listLocalFiles(directoryPath, limit = 40) {
1741
2092
  const resolved = expandUserPath(directoryPath);
@@ -1754,6 +2105,55 @@ if let output = String(data: data, encoding: .utf8) {
1754
2105
  }));
1755
2106
  return items.length > 0 ? items.join("\n") : "(pasta vazia)";
1756
2107
  }
2108
+ async countLocalFiles(directoryPath, extensions, recursive = true) {
2109
+ const resolved = expandUserPath(directoryPath);
2110
+ const normalizedExtensions = Array.from(new Set((extensions || [])
2111
+ .map((extension) => String(extension || "").trim().toLowerCase().replace(/^\./, ""))
2112
+ .filter(Boolean)));
2113
+ const queue = [resolved];
2114
+ let total = 0;
2115
+ while (queue.length > 0) {
2116
+ const current = queue.shift();
2117
+ if (!current)
2118
+ continue;
2119
+ let entries;
2120
+ try {
2121
+ entries = await readdir(current, { withFileTypes: true });
2122
+ }
2123
+ catch {
2124
+ continue;
2125
+ }
2126
+ for (const entry of entries) {
2127
+ const entryPath = path.join(current, entry.name);
2128
+ if (entry.isDirectory()) {
2129
+ if (recursive) {
2130
+ queue.push(entryPath);
2131
+ }
2132
+ continue;
2133
+ }
2134
+ if (!entry.isFile()) {
2135
+ continue;
2136
+ }
2137
+ if (normalizedExtensions.length > 0) {
2138
+ const entryExtension = path.extname(entry.name).toLowerCase().replace(/^\./, "");
2139
+ if (!normalizedExtensions.includes(entryExtension)) {
2140
+ continue;
2141
+ }
2142
+ }
2143
+ total += 1;
2144
+ }
2145
+ }
2146
+ const extensionsLabel = normalizedExtensions.length > 0
2147
+ ? normalizedExtensions.map((extension) => `.${extension}`).join(", ")
2148
+ : "do tipo solicitado";
2149
+ return {
2150
+ total,
2151
+ path: directoryPath,
2152
+ extensions: normalizedExtensions,
2153
+ recursive,
2154
+ extensionsLabel,
2155
+ };
2156
+ }
1757
2157
  async runShellCommand(command, cwd) {
1758
2158
  if (!isSafeShellCommand(command)) {
1759
2159
  throw new Error("Nenhum comando shell foi informado para execucao local.");
@@ -1805,6 +2205,9 @@ if let output = String(data: data, encoding: .utf8) {
1805
2205
  if (action.type === "list_files") {
1806
2206
  return `Arquivos listados em ${action.path}`;
1807
2207
  }
2208
+ if (action.type === "count_files") {
2209
+ return `Arquivos contados em ${action.path}`;
2210
+ }
1808
2211
  if (action.type === "run_shell") {
1809
2212
  return `Comando ${action.command} executado no macOS`;
1810
2213
  }
@@ -1814,6 +2217,9 @@ if let output = String(data: data, encoding: .utf8) {
1814
2217
  if (action.type === "click_visual_target") {
1815
2218
  return `Clique guiado executado para ${action.description}`;
1816
2219
  }
2220
+ if (action.type === "drag_visual_target") {
2221
+ return `Arraste guiado executado de ${action.source_description} para ${action.target_description}`;
2222
+ }
1817
2223
  const target = humanizeUrl(action.url);
1818
2224
  return `${target} foi aberto${action.app ? ` em ${action.app}` : ""}`;
1819
2225
  }
package/dist/main.js CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import { spawn } from "node:child_process";
3
3
  import process from "node:process";
4
- import { clearBridgeConfig, getBridgeConfigPath, loadBridgeConfig, resolveApiBaseUrl, resolveExecutorConfig, } from "./config.js";
4
+ import { clearBridgeConfig, getBridgeConfigPath, loadBridgeConfig, normalizeInstalledExtensions, resolveApiBaseUrl, resolveExecutorConfig, saveBridgeConfig, } from "./config.js";
5
5
  import { pairDevice } from "./pairing.js";
6
6
  import { BridgeRuntime } from "./runtime.js";
7
7
  import { BRIDGE_PACKAGE_NAME, BRIDGE_VERSION, DEFAULT_PAIR_TIMEOUT_SECONDS, DEFAULT_POLL_INTERVAL_MS, } from "./types.js";
@@ -56,6 +56,9 @@ function printUsage() {
56
56
  otto-bridge pair --api http://localhost:8000 --code ABC123 [--name "Meu PC"] [--executor native-macos|mock|clawd-cursor]
57
57
  otto-bridge run [--executor native-macos|mock|clawd-cursor] [--clawd-url http://127.0.0.1:3847]
58
58
  otto-bridge status
59
+ otto-bridge extensions --list
60
+ otto-bridge extensions --install github
61
+ otto-bridge extensions --uninstall github
59
62
  otto-bridge version
60
63
  otto-bridge update [--tag latest|next] [--dry-run]
61
64
  otto-bridge unpair
@@ -63,6 +66,8 @@ function printUsage() {
63
66
  Examples:
64
67
  otto-bridge pair --api https://api.leg3ndy.com.br --code ABC123
65
68
  otto-bridge run
69
+ otto-bridge extensions --install github
70
+ otto-bridge extensions --list
66
71
  otto-bridge version
67
72
  otto-bridge update
68
73
  otto-bridge update --dry-run
@@ -108,11 +113,15 @@ async function runPairCommand(args) {
108
113
  console.log(`[otto-bridge] config=${getBridgeConfigPath()}`);
109
114
  console.log("[otto-bridge] next step: run `otto-bridge run` to keep this device online");
110
115
  }
111
- async function runRuntimeCommand(args) {
116
/**
 * Loads the locally stored bridge config and insists that one exists.
 *
 * @returns {Promise<object>} The value resolved by `loadBridgeConfig()` when it is truthy.
 * @throws {Error} When `loadBridgeConfig()` resolves to a falsy value, i.e. no
 *   local pairing is available.
 */
async function loadRequiredBridgeConfig() {
    const loaded = await loadBridgeConfig();
    if (loaded) {
        return loaded;
    }
    // Commands that need a pairing share this single failure message.
    throw new Error("No local pairing found. Run `otto-bridge pair --code <CODE>` first.");
}
123
+ async function runRuntimeCommand(args) {
124
+ const config = await loadRequiredBridgeConfig();
116
125
  const runtimeConfig = {
117
126
  ...config,
118
127
  executor: resolveExecutorOverrides(args, config.executor),
@@ -136,10 +145,61 @@ async function runStatusCommand() {
136
145
  ws_url: config.wsUrl,
137
146
  approval_mode: config.approvalMode,
138
147
  capabilities: config.capabilities,
148
+ installed_extensions: config.installedExtensions,
139
149
  paired_at: config.pairedAt,
140
150
  executor: config.executor,
141
151
  }, null, 2));
142
152
  }
153
/**
 * Implements the `otto-bridge extensions` subcommand.
 *
 * After loading the required bridge config, exactly one of these happens:
 *   - `--install a,b`   : the comma-separated values are merged into the
 *                         installed list (through normalizeInstalledExtensions)
 *                         and the config is saved if anything new was added;
 *   - `--uninstall a,b` : matching slugs are removed and the config is saved
 *                         if anything was actually removed;
 *   - neither option    : the installed extensions are printed.
 *
 * @param {*} args - Forwarded untouched to `option(args, name)`; presumably the
 *   CLI argument list (confirm against the caller).
 * @returns {Promise<void>}
 * @throws {Error} When both `--install` and `--uninstall` are supplied, or when
 *   `loadRequiredBridgeConfig()` rejects.
 */
async function runExtensionsCommand(args) {
    const bridgeConfig = await loadRequiredBridgeConfig();
    const installArg = option(args, "install");
    const uninstallArg = option(args, "uninstall");
    if (installArg && uninstallArg) {
        throw new Error("Use apenas uma acao por vez: --install ou --uninstall.");
    }
    // Printed after every successful change to the stored list.
    const syncHint = "[otto-bridge] rode `otto-bridge run` novamente se quiser sincronizar agora com a web";
    if (installArg) {
        const merged = normalizeInstalledExtensions([
            ...bridgeConfig.installedExtensions,
            ...installArg.split(","),
        ]);
        // Only slugs that were not already installed count as "added".
        const added = [];
        for (const slug of merged) {
            if (!bridgeConfig.installedExtensions.includes(slug)) {
                added.push(slug);
            }
        }
        if (added.length === 0) {
            console.log("[otto-bridge] nenhuma extensao nova para instalar");
            return;
        }
        await saveBridgeConfig({ ...bridgeConfig, installedExtensions: merged });
        console.log(`[otto-bridge] extensoes instaladas: ${added.join(", ")}`);
        console.log(syncHint);
        return;
    }
    if (uninstallArg) {
        const targets = new Set(normalizeInstalledExtensions(uninstallArg.split(",")));
        // Split the current list into what stays and what goes, keeping order.
        const kept = [];
        const removed = [];
        bridgeConfig.installedExtensions.forEach((slug) => {
            if (targets.has(slug)) {
                removed.push(slug);
            }
            else {
                kept.push(slug);
            }
        });
        if (removed.length === 0) {
            console.log("[otto-bridge] nenhuma extensao correspondente estava instalada");
            return;
        }
        await saveBridgeConfig({ ...bridgeConfig, installedExtensions: kept });
        console.log(`[otto-bridge] extensoes removidas: ${removed.join(", ")}`);
        console.log(syncHint);
        return;
    }
    // No action requested: fall back to listing.
    if (!bridgeConfig.installedExtensions.length) {
        console.log("[otto-bridge] nenhuma extensao instalada");
        return;
    }
    console.log("[otto-bridge] extensoes instaladas:");
    for (const slug of bridgeConfig.installedExtensions) {
        console.log(`- ${slug}`);
    }
}
143
203
  async function runUnpairCommand() {
144
204
  await clearBridgeConfig();
145
205
  console.log("[otto-bridge] local pairing cleared");
@@ -173,6 +233,9 @@ async function main() {
173
233
  case "status":
174
234
  await runStatusCommand();
175
235
  return;
236
+ case "extensions":
237
+ await runExtensionsCommand(args);
238
+ return;
176
239
  case "version":
177
240
  printVersion();
178
241
  return;
package/dist/runtime.js CHANGED
@@ -118,7 +118,10 @@ export class BridgeRuntime {
118
118
  device_name: this.config.deviceName,
119
119
  bridge_version: this.config.bridgeVersion,
120
120
  capabilities: this.config.capabilities,
121
- metadata: this.config.metadata,
121
+ metadata: {
122
+ ...(this.config.metadata || {}),
123
+ installed_extensions: this.config.installedExtensions,
124
+ },
122
125
  }));
123
126
  heartbeatTimer = setInterval(() => {
124
127
  if (socket.readyState === WebSocket.OPEN) {
@@ -163,7 +166,6 @@ export class BridgeRuntime {
163
166
  const type = String(message.type || "");
164
167
  switch (type) {
165
168
  case "device.hello":
166
- this.maybeLogBridgeReleaseNotice(message);
167
169
  console.log(`[otto-bridge] server hello device=${String(message.device_id || "")}`);
168
170
  return;
169
171
  case "device.hello_ack":
package/dist/types.js CHANGED
@@ -1,5 +1,5 @@
1
1
  export const BRIDGE_CONFIG_VERSION = 1;
2
- export const BRIDGE_VERSION = "0.5.4";
2
+ export const BRIDGE_VERSION = "0.5.6";
3
3
  export const BRIDGE_PACKAGE_NAME = "@leg3ndy/otto-bridge";
4
4
  export const DEFAULT_API_BASE_URL = "http://localhost:8000";
5
5
  export const DEFAULT_POLL_INTERVAL_MS = 3000;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@leg3ndy/otto-bridge",
3
- "version": "0.5.4",
3
+ "version": "0.5.6",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "description": "Local companion for Otto Bridge device pairing and WebSocket runtime.",