npm - unbrowse - Versions diffs - 2.0.2 → 2.0.3 - Mend

unbrowse 2.0.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +1 -1
package/runtime-src/auth/browser-cookies.ts +11 -6
package/runtime-src/execution/index.ts +26 -6
package/vendor/kuri/darwin-arm64/kuri +0 -0
package/runtime-src/kuri/vendor/kuri/darwin-arm64/kuri +0 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "unbrowse",
-  "version": "2.0.2",
+  "version": "2.0.3",
   "description": "Reverse-engineer any website into reusable API skills. npm CLI + local engine.",
   "type": "module",
   "bin": {

package/runtime-src/auth/browser-cookies.ts CHANGED Viewed

@@ -136,9 +136,10 @@ function getChromiumDecryptionKey(opts?: ChromiumCookieSourceOptions): Buffer |
   if (platform() !== "darwin") return null; // TODO: Linux/Windows support
   try {
-    const keyOutput = execSync(
-      `security find-generic-password -s "${service.replace(/"/g, '\\"')}" -w 2>/dev/null || echo ""`,
-      { encoding: "utf8" },
+    const keyOutput = execFileSync(
+      "security",
+      ["find-generic-password", "-s", service, "-w"],
+      { encoding: "utf8", stdio: ["pipe", "pipe", "pipe"] },
     ).trim();
     if (!keyOutput) return null;
@@ -241,9 +242,13 @@ function buildDomainWhereClause(domain: string, column: string): string {
     `www.${reg}`,
     `.www.${reg}`,
   ]);
-  const escaped = [...variants].map((d) => `'${d.replace(/'/g, "''")}'`);
-  // Also match any subdomain via LIKE (e.g. .api.example.com, .sg.example.com)
-  const likePattern = `'%.${reg.replace(/'/g, "''")}'`;
+  // Use parameterized-safe quoting: reject any domain containing single quotes
+  for (const d of variants) {
+    if (d.includes("'")) throw new Error(`Invalid domain for cookie query: ${d}`);
+  }
+  const escaped = [...variants].map((d) => `'${d}'`);
+  const likeReg = reg.includes("'") ? reg : reg;
+  const likePattern = `'%.${likeReg}'`;
   return `(${column} IN (${escaped.join(", ")}) OR ${column} LIKE ${likePattern})`;
 }

package/runtime-src/execution/index.ts CHANGED Viewed

@@ -596,20 +596,19 @@ export function buildPageArtifactCapture(
   const extracted = extractFromDOM(html, intent);
   if (!extracted.data || extracted.confidence <= 0.2) return {};
   const quality = validateExtractionQuality(extracted.data, extracted.confidence, intent);
-  if (!quality.valid) {
-    return { quality_note: quality.quality_note ?? "low_quality_dom_extraction" };
-  }
   const semanticAssessment = assessIntentResult(extracted.data, intent);
   if (semanticAssessment.verdict === "fail") {
     return { quality_note: semanticAssessment.reason };
   }
+  // Quality gate: low confidence still returns data to the caller (better than
+  // no_endpoints), but marks it so the caller can decide whether to publish.
   const response_schema = inferSchema([extracted.data]);
   const endpoint: EndpointDescriptor = {
     endpoint_id: nanoid(),
     method: "GET",
     url_template: templatizeQueryParams(url),
     idempotency: "safe" as const,
-    verification_status: "verified" as const,
+    verification_status: quality.valid ? "verified" as const : "unverified" as const,
     reliability_score: extracted.confidence,
     description: `Captured page artifact for ${intent}`,
     response_schema,
@@ -637,8 +636,10 @@ export function buildPageArtifactCapture(
         method: extracted.extraction_method,
         confidence: extracted.confidence,
         source: "dom-fallback",
+        ...(quality.quality_note ? { quality_note: quality.quality_note } : {}),
       },
     },
+    ...(!quality.valid ? { quality_note: quality.quality_note } : {}),
   };
 }
@@ -1163,9 +1164,27 @@ async function executeBrowserCapture(
     cleanEndpoints.push(canonicalDocumentEndpoint);
   }
-  const pageArtifact = captured.html
+  let pageArtifact = captured.html
     ? buildPageArtifactCapture(url, intent, captured.html, authBackedCapture)
     : {};
+  // SSR fallback: if Kuri's headless Chrome was bot-detected and served stripped
+  // HTML, the DOM extraction above will fail or return low quality. Try a plain
+  // HTTP fetch — many sites serve full SSR HTML to normal requests.
+  if (!pageArtifact.endpoint) {
+    const kuriHtmlLen = captured.html?.length ?? 0;
+    const ssrFallback = await tryHttpFetch(url, {}, []).catch(() => null);
+    if (ssrFallback && ssrFallback.html.length > kuriHtmlLen * 1.2) {
+      console.log(`[ssr-fallback] Kuri HTML=${kuriHtmlLen}, fetch HTML=${ssrFallback.html.length} — retrying DOM extraction`);
+      const ssrArtifact = buildPageArtifactCapture(ssrFallback.final_url || url, intent, ssrFallback.html, authBackedCapture);
+      if (ssrArtifact.endpoint) {
+        console.log(`[ssr-fallback] success — extracted structured data via plain HTTP fetch`);
+        pageArtifact = ssrArtifact;
+      } else {
+        console.log(`[ssr-fallback] fetch got larger HTML but extraction still failed${ssrArtifact.quality_note ? `: ${ssrArtifact.quality_note}` : ""}`);
+      }
+    }
+  }
   const domArtifactEndpoint = pageArtifact.endpoint;
   const domArtifactResult = pageArtifact.result;
   const inferredOnlyCapture = cleanEndpoints.length > 0 && cleanEndpoints.every((endpoint) => isBundleInferredEndpoint(endpoint));
@@ -1249,7 +1268,8 @@ async function executeBrowserCapture(
         };
       }
-    if (pageArtifact.quality_note) {
+    if (pageArtifact.quality_note && !pageArtifact.endpoint) {
+      // Quality gate rejected AND no endpoint — nothing useful extracted
       const trace: ExecutionTrace = stampTrace({
         trace_id: traceId,
         skill_id: skill.skill_id,

package/vendor/kuri/darwin-arm64/kuri CHANGED Viewed

Binary file

package/runtime-src/kuri/vendor/kuri/darwin-arm64/kuri DELETED Viewed

Binary file