unbrowse 2.0.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "2.0.2",
3
+ "version": "2.0.3",
4
4
  "description": "Reverse-engineer any website into reusable API skills. npm CLI + local engine.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -136,9 +136,10 @@ function getChromiumDecryptionKey(opts?: ChromiumCookieSourceOptions): Buffer |
136
136
  if (platform() !== "darwin") return null; // TODO: Linux/Windows support
137
137
 
138
138
  try {
139
- const keyOutput = execSync(
140
- `security find-generic-password -s "${service.replace(/"/g, '\\"')}" -w 2>/dev/null || echo ""`,
141
- { encoding: "utf8" },
139
+ const keyOutput = execFileSync(
140
+ "security",
141
+ ["find-generic-password", "-s", service, "-w"],
142
+ { encoding: "utf8", stdio: ["pipe", "pipe", "pipe"] },
142
143
  ).trim();
143
144
  if (!keyOutput) return null;
144
145
 
@@ -241,9 +242,13 @@ function buildDomainWhereClause(domain: string, column: string): string {
241
242
  `www.${reg}`,
242
243
  `.www.${reg}`,
243
244
  ]);
244
- const escaped = [...variants].map((d) => `'${d.replace(/'/g, "''")}'`);
245
- // Also match any subdomain via LIKE (e.g. .api.example.com, .sg.example.com)
246
- const likePattern = `'%.${reg.replace(/'/g, "''")}'`;
245
+ // Use parameterized-safe quoting: reject any domain containing single quotes
246
+ for (const d of variants) {
247
+ if (d.includes("'")) throw new Error(`Invalid domain for cookie query: ${d}`);
248
+ }
249
+ const escaped = [...variants].map((d) => `'${d}'`);
250
+ const likeReg = reg.includes("'") ? reg : reg;
251
+ const likePattern = `'%.${likeReg}'`;
247
252
  return `(${column} IN (${escaped.join(", ")}) OR ${column} LIKE ${likePattern})`;
248
253
  }
249
254
 
@@ -596,20 +596,19 @@ export function buildPageArtifactCapture(
596
596
  const extracted = extractFromDOM(html, intent);
597
597
  if (!extracted.data || extracted.confidence <= 0.2) return {};
598
598
  const quality = validateExtractionQuality(extracted.data, extracted.confidence, intent);
599
- if (!quality.valid) {
600
- return { quality_note: quality.quality_note ?? "low_quality_dom_extraction" };
601
- }
602
599
  const semanticAssessment = assessIntentResult(extracted.data, intent);
603
600
  if (semanticAssessment.verdict === "fail") {
604
601
  return { quality_note: semanticAssessment.reason };
605
602
  }
603
+ // Quality gate: low confidence still returns data to the caller (better than
604
+ // no_endpoints), but marks it so the caller can decide whether to publish.
606
605
  const response_schema = inferSchema([extracted.data]);
607
606
  const endpoint: EndpointDescriptor = {
608
607
  endpoint_id: nanoid(),
609
608
  method: "GET",
610
609
  url_template: templatizeQueryParams(url),
611
610
  idempotency: "safe" as const,
612
- verification_status: "verified" as const,
611
+ verification_status: quality.valid ? "verified" as const : "unverified" as const,
613
612
  reliability_score: extracted.confidence,
614
613
  description: `Captured page artifact for ${intent}`,
615
614
  response_schema,
@@ -637,8 +636,10 @@ export function buildPageArtifactCapture(
637
636
  method: extracted.extraction_method,
638
637
  confidence: extracted.confidence,
639
638
  source: "dom-fallback",
639
+ ...(quality.quality_note ? { quality_note: quality.quality_note } : {}),
640
640
  },
641
641
  },
642
+ ...(!quality.valid ? { quality_note: quality.quality_note } : {}),
642
643
  };
643
644
  }
644
645
 
@@ -1163,9 +1164,27 @@ async function executeBrowserCapture(
1163
1164
  cleanEndpoints.push(canonicalDocumentEndpoint);
1164
1165
  }
1165
1166
 
1166
- const pageArtifact = captured.html
1167
+ let pageArtifact = captured.html
1167
1168
  ? buildPageArtifactCapture(url, intent, captured.html, authBackedCapture)
1168
1169
  : {};
1170
+
1171
+ // SSR fallback: if Kuri's headless Chrome was bot-detected and served stripped
1172
+ // HTML, the DOM extraction above will fail or return low quality. Try a plain
1173
+ // HTTP fetch — many sites serve full SSR HTML to normal requests.
1174
+ if (!pageArtifact.endpoint) {
1175
+ const kuriHtmlLen = captured.html?.length ?? 0;
1176
+ const ssrFallback = await tryHttpFetch(url, {}, []).catch(() => null);
1177
+ if (ssrFallback && ssrFallback.html.length > kuriHtmlLen * 1.2) {
1178
+ console.log(`[ssr-fallback] Kuri HTML=${kuriHtmlLen}, fetch HTML=${ssrFallback.html.length} — retrying DOM extraction`);
1179
+ const ssrArtifact = buildPageArtifactCapture(ssrFallback.final_url || url, intent, ssrFallback.html, authBackedCapture);
1180
+ if (ssrArtifact.endpoint) {
1181
+ console.log(`[ssr-fallback] success — extracted structured data via plain HTTP fetch`);
1182
+ pageArtifact = ssrArtifact;
1183
+ } else {
1184
+ console.log(`[ssr-fallback] fetch got larger HTML but extraction still failed${ssrArtifact.quality_note ? `: ${ssrArtifact.quality_note}` : ""}`);
1185
+ }
1186
+ }
1187
+ }
1169
1188
  const domArtifactEndpoint = pageArtifact.endpoint;
1170
1189
  const domArtifactResult = pageArtifact.result;
1171
1190
  const inferredOnlyCapture = cleanEndpoints.length > 0 && cleanEndpoints.every((endpoint) => isBundleInferredEndpoint(endpoint));
@@ -1249,7 +1268,8 @@ async function executeBrowserCapture(
1249
1268
  };
1250
1269
  }
1251
1270
 
1252
- if (pageArtifact.quality_note) {
1271
+ if (pageArtifact.quality_note && !pageArtifact.endpoint) {
1272
+ // Quality gate rejected AND no endpoint — nothing useful extracted
1253
1273
  const trace: ExecutionTrace = stampTrace({
1254
1274
  trace_id: traceId,
1255
1275
  skill_id: skill.skill_id,
Binary file