hypha-debugger 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1969,8 +1969,14 @@ async function toJpeg(node, options = {}) {
1969
1969
  /**
1970
1970
  * Screenshot capture service using html-to-image.
1971
1971
  *
1972
- * Images are downscaled before being returned so agents don't receive
1973
- * multi-megabyte base64 payloads that can crash their context window.
1972
+ * Returns image data in a format directly usable by AI agents:
1973
+ * - `base64`: raw base64 (no data: prefix) — what Claude/GPT image
1974
+ * content fields expect.
1975
+ * - `media_type`: e.g. "image/jpeg" — the MIME type to pair with base64.
1976
+ * - `data_url`: full `data:image/jpeg;base64,...` URL for HTML/preview use.
1977
+ *
1978
+ * Images are aggressively downscaled by default (max 800px, JPEG q=0.6)
1979
+ * because most agent context windows can't tolerate multi-MB payloads.
1974
1980
  */
1975
1981
  /** Extract a useful string from an unknown error value. */
1976
1982
  function errorMessage(err) {
@@ -1989,10 +1995,24 @@ function errorMessage(err) {
1989
1995
  return String(err);
1990
1996
  }
1991
1997
  }
1998
/**
 * Split a `data:<mime>;base64,<...>` URL into its parts.
 *
 * @param {string} dataUrl - The data URL to parse.
 * @returns {{ mediaType: string, base64: string }} The MIME type and the raw
 *   payload that follows the first comma (no `data:` prefix).
 * @throws {Error} If the input is not a data: URL, or if its header does not
 *   carry the `;base64` flag.
 */
function splitDataUrl(dataUrl) {
    // Groups: 1 = media type, 2 = `;param` header section (possibly empty),
    // 3 = payload after the first comma.
    const match = /^data:([^;,]+)((?:;[^,]*)?),(.*)$/.exec(dataUrl);
    if (!match) {
        throw new Error("Output is not a valid data: URL");
    }
    const [, mediaType, params, payload] = match;
    // If charset=utf-8 (no base64), html-to-image returned an SVG fallback —
    // which is unusable for agent vision. Reject so the caller knows.
    // Only the header is inspected: a text payload that merely contains the
    // substring ";base64" must not be mistaken for base64-encoded data.
    if (!/;base64(?:;|$)/i.test(params)) {
        throw new Error(`Output is not base64-encoded (got ${mediaType}). Capture probably failed silently.`);
    }
    return { mediaType, base64: payload };
}
1992
2012
  /**
1993
2013
  * Resize an image data URL via a canvas. Returns a new data URL at the
1994
- * requested format/quality. Maintains aspect ratio: fits within
1995
- * (maxWidth × maxHeight) without distortion.
2014
+ * requested format/quality, fitting within (maxWidth × maxHeight) without
2015
+ * distortion.
1996
2016
  */
1997
2017
  async function resizeDataUrl(dataUrl, maxWidth, maxHeight, format, quality) {
1998
2018
  return new Promise((resolve, reject) => {
@@ -2001,6 +2021,10 @@ async function resizeDataUrl(dataUrl, maxWidth, maxHeight, format, quality) {
2001
2021
  try {
2002
2022
  const srcW = img.naturalWidth;
2003
2023
  const srcH = img.naturalHeight;
2024
+ if (!srcW || !srcH) {
2025
+ reject(new Error("Captured image has zero dimensions"));
2026
+ return;
2027
+ }
2004
2028
  const scale = Math.min(maxWidth / srcW, maxHeight / srcH, 1);
2005
2029
  const dstW = Math.max(1, Math.round(srcW * scale));
2006
2030
  const dstH = Math.max(1, Math.round(srcH * scale));
@@ -2030,17 +2054,13 @@ async function resizeDataUrl(dataUrl, maxWidth, maxHeight, format, quality) {
2030
2054
  });
2031
2055
  }
2032
2056
  async function takeScreenshot(selector, format, quality, max_width, max_height, full_page) {
2033
- // Agent-friendly defaults: JPEG, moderate quality, capped at 1024px,
2034
- // viewport-only (not the entire scrollable page).
2057
+ // Agent-friendly defaults: JPEG at q=0.6, capped at 800px.
2058
+ // These are smaller than before because larger images crash some agents.
2035
2059
  const fmt = format ?? "jpeg";
2036
- const qual = quality ?? 0.75;
2037
- const maxW = max_width ?? 1024;
2038
- const maxH = max_height ?? 1024;
2060
+ const qual = quality ?? 0.6;
2061
+ const maxW = max_width ?? 800;
2062
+ const maxH = max_height ?? 800;
2039
2063
  const capturePage = full_page ?? false;
2040
- // Pick target:
2041
- // - explicit selector → that element
2042
- // - full_page=true → document.documentElement (the entire scrollable page)
2043
- // - default → viewport-sized region (clipped to window size)
2044
2064
  let target;
2045
2065
  if (selector) {
2046
2066
  target = document.querySelector(selector);
@@ -2056,29 +2076,23 @@ async function takeScreenshot(selector, format, quality, max_width, max_height,
2056
2076
  }
2057
2077
  try {
2058
2078
  const node = target;
2059
- // For viewport-only captures, limit html-to-image's output size
2060
- // to the viewport dimensions.
2061
2079
  const viewportW = window.innerWidth;
2062
2080
  const viewportH = window.innerHeight;
2063
- // 1x1 transparent PNG — used as placeholder for images that fail
2064
- // to load (CORS-blocked, 404, etc.) so html-to-image doesn't reject.
2065
2081
  const TRANSPARENT_PIXEL = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkAAIAAAoAAv/lxKUAAAAASUVORK5CYII=";
2066
2082
  const captureOptions = {
2067
2083
  quality: qual,
2068
- pixelRatio: 1, // always capture at 1x — we'll resize after
2084
+ pixelRatio: 1,
2069
2085
  cacheBust: true,
2070
2086
  skipAutoScale: true,
2071
- skipFonts: true, // CORS-blocked stylesheets can hang font inlining
2072
- imagePlaceholder: TRANSPARENT_PIXEL, // fallback for broken images
2087
+ skipFonts: true,
2088
+ imagePlaceholder: TRANSPARENT_PIXEL,
2073
2089
  filter: (el) => {
2074
- // Exclude the debugger overlay and cursor from screenshots
2075
2090
  return (el.id !== "hypha-debugger-host" &&
2076
2091
  el.id !== "hypha-debugger-cursor" &&
2077
2092
  el.id !== "playwright-highlight-container");
2078
2093
  },
2079
2094
  };
2080
2095
  if (!selector && !capturePage) {
2081
- // Viewport-only capture: constrain canvas to window size
2082
2096
  captureOptions.width = viewportW;
2083
2097
  captureOptions.height = viewportH;
2084
2098
  }
@@ -2094,8 +2108,7 @@ async function takeScreenshot(selector, format, quality, max_width, max_height,
2094
2108
  dataUrl = await runCapture(captureOptions);
2095
2109
  }
2096
2110
  catch (captureErr) {
2097
- // Fallback: retry without images (filter them out). Some pages have
2098
- // images that html-to-image can't inline even with imagePlaceholder.
2111
+ // Fallback: retry without images
2099
2112
  try {
2100
2113
  const noImagesOpts = {
2101
2114
  ...captureOptions,
@@ -2114,33 +2127,42 @@ async function takeScreenshot(selector, format, quality, max_width, max_height,
2114
2127
  };
2115
2128
  }
2116
2129
  }
2117
- // Resize down to fit within (maxW × maxH) and re-encode. If resize
2118
- // fails (e.g. data URL too large to load back into an Image), fall
2119
- // back to returning the original capture so the caller still gets
2120
- // something useful.
2130
+ // Resize + re-encode through canvas. This both downsizes and ensures
2131
+ // a clean base64 PNG/JPEG (rather than a possibly-broken html-to-image
2132
+ // SVG-via-data-URL that some agent runtimes reject).
2133
+ let resized;
2121
2134
  try {
2122
- const resized = await resizeDataUrl(dataUrl, maxW, maxH, fmt, qual);
2123
- const sizeKb = Math.round((resized.dataUrl.length * 0.75) / 1024);
2135
+ resized = await resizeDataUrl(dataUrl, maxW, maxH, fmt, qual);
2136
+ }
2137
+ catch (resizeErr) {
2124
2138
  return {
2125
- data: resized.dataUrl,
2126
- format: fmt,
2127
- width: resized.width,
2128
- height: resized.height,
2129
- size_kb: sizeKb,
2139
+ error: `Resize failed: ${errorMessage(resizeErr)} (this usually means the captured image was malformed; try lowering max_width or use full_page:false)`,
2130
2140
  };
2131
2141
  }
2132
- catch (resizeErr) {
2133
- const rect = node.getBoundingClientRect();
2134
- const sizeKb = Math.round((dataUrl.length * 0.75) / 1024);
2142
+ // Validate the final data URL — should be data:image/jpeg;base64,...
2143
+ let parts;
2144
+ try {
2145
+ parts = splitDataUrl(resized.dataUrl);
2146
+ }
2147
+ catch (validateErr) {
2148
+ return { error: `Output validation failed: ${errorMessage(validateErr)}` };
2149
+ }
2150
+ // Sanity-check: a valid JPEG/PNG is at least a few hundred bytes.
2151
+ if (parts.base64.length < 200) {
2135
2152
  return {
2136
- data: dataUrl,
2137
- format: fmt,
2138
- width: Math.round(rect.width),
2139
- height: Math.round(rect.height),
2140
- size_kb: sizeKb,
2141
- warning: `Resize failed, returning original: ${errorMessage(resizeErr)}`,
2153
+ error: `Output too small (${parts.base64.length} chars base64) — capture likely failed`,
2142
2154
  };
2143
2155
  }
2156
+ const sizeKb = Math.round((parts.base64.length * 0.75) / 1024);
2157
+ return {
2158
+ base64: parts.base64,
2159
+ media_type: parts.mediaType,
2160
+ data_url: resized.dataUrl,
2161
+ format: fmt,
2162
+ width: resized.width,
2163
+ height: resized.height,
2164
+ size_kb: sizeKb,
2165
+ };
2144
2166
  }
2145
2167
  catch (err) {
2146
2168
  return { error: `Screenshot failed: ${errorMessage(err)}` };
@@ -2149,11 +2171,12 @@ async function takeScreenshot(selector, format, quality, max_width, max_height,
2149
2171
  takeScreenshot.__schema__ = {
2150
2172
  name: "takeScreenshot",
2151
2173
  description: "Capture a screenshot of the current viewport, a specific element, or the full page. " +
2152
- "Downscaled to fit within max_width × max_height (default 1024px) to keep the payload " +
2153
- "small enough for AI agents. Defaults to JPEG at 0.75 quality. " +
2154
- "Returns: { data: 'data:image/jpeg;base64,...', format, width, height, size_kb }. " +
2155
- "Note: the image is in the `data` field as a full data: URL strip the `data:...;base64,` " +
2156
- "prefix before base64-decoding.",
2174
+ "Downscaled to fit within max_width × max_height (default 800px) and JPEG-encoded at " +
2175
+ "quality 0.6 by default for agent-friendly payload sizes. " +
2176
+ "Returns: { base64, media_type, data_url, format, width, height, size_kb }. " +
2177
+ "Use `base64` (raw base64, no prefix) directly with Claude/GPT image content fields. " +
2178
+ "Use `data_url` for HTML <img src=...> previews. " +
2179
+ "On failure returns { error }.",
2157
2180
  parameters: {
2158
2181
  type: "object",
2159
2182
  properties: {
@@ -2164,19 +2187,19 @@ takeScreenshot.__schema__ = {
2164
2187
  format: {
2165
2188
  type: "string",
2166
2189
  enum: ["png", "jpeg"],
2167
- description: 'Image format. Default: "jpeg" (much smaller than PNG). Use "png" for sharp text.',
2190
+ description: 'Image format. Default: "jpeg" (much smaller than PNG). Use "png" only when sharp text really matters.',
2168
2191
  },
2169
2192
  quality: {
2170
2193
  type: "number",
2171
- description: "JPEG quality (0–1). Default: 0.75. Ignored for PNG. Lower = smaller payload.",
2194
+ description: "JPEG quality (0–1). Default: 0.6. Ignored for PNG. Lower = smaller payload.",
2172
2195
  },
2173
2196
  max_width: {
2174
2197
  type: "number",
2175
- description: "Maximum output width in pixels. Default: 1024. Image is scaled down preserving aspect ratio.",
2198
+ description: "Maximum output width in pixels. Default: 800. Image scaled down preserving aspect ratio.",
2176
2199
  },
2177
2200
  max_height: {
2178
2201
  type: "number",
2179
- description: "Maximum output height in pixels. Default: 1024. Image is scaled down preserving aspect ratio.",
2202
+ description: "Maximum output height in pixels. Default: 800. Image scaled down preserving aspect ratio.",
2180
2203
  },
2181
2204
  full_page: {
2182
2205
  type: "boolean",
@@ -2942,7 +2965,7 @@ function generateSkillMd(serviceFunctions, serviceUrl) {
2942
2965
  "",
2943
2966
  "**1. Data-returning functions** (e.g. `take_screenshot`, `get_page_info`, `execute_script`, `get_browser_state`, `get_html`, `get_react_tree`) return function-specific keys:",
2944
2967
  "",
2945
- "- `take_screenshot` → `{data, format, width, height, size_kb}` where `data` is a `data:image/jpeg;base64,...` URL (note: field is `data`, not `screenshot` or `image`)",
2968
+ "- `take_screenshot` → `{base64, media_type, data_url, format, width, height, size_kb}`. Use `base64` (raw, no prefix) for Claude/GPT image content fields. Use `data_url` for HTML `<img src=...>` previews.",
2946
2969
  "- `execute_script` → `{result, type}` (or `{error}` on exception)",
2947
2970
  "- `get_browser_state` → `{url, title, header, content, footer, element_count}`",
2948
2971
  "- `get_page_info` → `{url, title, viewport_width, viewport_height, ...}`",
@@ -3030,7 +3053,7 @@ function generateSkillMd(serviceFunctions, serviceUrl) {
3030
3053
  "- **`execute_script` is the most versatile** — use it for reading state, calling APIs, DOM queries, or anything not covered by other functions. The last expression is auto-returned. Returns `{result, type}`.",
3031
3054
  "- **`get_browser_state` is the best way to see what's on the page** — it detects all interactive elements and shows them as indexed items.",
3032
3055
  "- **After each action, call `get_browser_state` again** — element indices change when the DOM updates.",
3033
- "- **Use `take_screenshot`** to visually verify the page state. The image is returned in the `data` field as a `data:image/jpeg;base64,...` URLstrip the `data:...;base64,` prefix before decoding.",
3056
+ "- **Use `take_screenshot`** to visually verify the page state. The response includes `base64` (raw, ready for Claude/GPT image fields) and `data_url` (for HTML previews). Default size is 800px JPEG q=0.6 bump `max_width` if you need more detail.",
3034
3057
  "- **Use `remove_highlights`** before a screenshot for a clean view.",
3035
3058
  "- **Use `scroll`** with an element index to scroll inside a specific container (e.g. a chat window, sidebar).",
3036
3059
  "- **Use `get_page_info` with `include_logs=true`** to check for JavaScript errors or debug output.",