unbrowse 2.0.2 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -136,9 +136,10 @@ function getChromiumDecryptionKey(opts?: ChromiumCookieSourceOptions): Buffer |
|
|
|
136
136
|
if (platform() !== "darwin") return null; // TODO: Linux/Windows support
|
|
137
137
|
|
|
138
138
|
try {
|
|
139
|
-
const keyOutput =
|
|
140
|
-
|
|
141
|
-
|
|
139
|
+
const keyOutput = execFileSync(
|
|
140
|
+
"security",
|
|
141
|
+
["find-generic-password", "-s", service, "-w"],
|
|
142
|
+
{ encoding: "utf8", stdio: ["pipe", "pipe", "pipe"] },
|
|
142
143
|
).trim();
|
|
143
144
|
if (!keyOutput) return null;
|
|
144
145
|
|
|
@@ -241,9 +242,13 @@ function buildDomainWhereClause(domain: string, column: string): string {
|
|
|
241
242
|
`www.${reg}`,
|
|
242
243
|
`.www.${reg}`,
|
|
243
244
|
]);
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
245
|
+
// Use parameterized-safe quoting: reject any domain containing single quotes
|
|
246
|
+
for (const d of variants) {
|
|
247
|
+
if (d.includes("'")) throw new Error(`Invalid domain for cookie query: ${d}`);
|
|
248
|
+
}
|
|
249
|
+
const escaped = [...variants].map((d) => `'${d}'`);
|
|
250
|
+
const likeReg = reg.includes("'") ? reg : reg;
|
|
251
|
+
const likePattern = `'%.${likeReg}'`;
|
|
247
252
|
return `(${column} IN (${escaped.join(", ")}) OR ${column} LIKE ${likePattern})`;
|
|
248
253
|
}
|
|
249
254
|
|
|
@@ -596,20 +596,19 @@ export function buildPageArtifactCapture(
|
|
|
596
596
|
const extracted = extractFromDOM(html, intent);
|
|
597
597
|
if (!extracted.data || extracted.confidence <= 0.2) return {};
|
|
598
598
|
const quality = validateExtractionQuality(extracted.data, extracted.confidence, intent);
|
|
599
|
-
if (!quality.valid) {
|
|
600
|
-
return { quality_note: quality.quality_note ?? "low_quality_dom_extraction" };
|
|
601
|
-
}
|
|
602
599
|
const semanticAssessment = assessIntentResult(extracted.data, intent);
|
|
603
600
|
if (semanticAssessment.verdict === "fail") {
|
|
604
601
|
return { quality_note: semanticAssessment.reason };
|
|
605
602
|
}
|
|
603
|
+
// Quality gate: low confidence still returns data to the caller (better than
|
|
604
|
+
// no_endpoints), but marks it so the caller can decide whether to publish.
|
|
606
605
|
const response_schema = inferSchema([extracted.data]);
|
|
607
606
|
const endpoint: EndpointDescriptor = {
|
|
608
607
|
endpoint_id: nanoid(),
|
|
609
608
|
method: "GET",
|
|
610
609
|
url_template: templatizeQueryParams(url),
|
|
611
610
|
idempotency: "safe" as const,
|
|
612
|
-
verification_status: "verified" as const,
|
|
611
|
+
verification_status: quality.valid ? "verified" as const : "unverified" as const,
|
|
613
612
|
reliability_score: extracted.confidence,
|
|
614
613
|
description: `Captured page artifact for ${intent}`,
|
|
615
614
|
response_schema,
|
|
@@ -637,8 +636,10 @@ export function buildPageArtifactCapture(
|
|
|
637
636
|
method: extracted.extraction_method,
|
|
638
637
|
confidence: extracted.confidence,
|
|
639
638
|
source: "dom-fallback",
|
|
639
|
+
...(quality.quality_note ? { quality_note: quality.quality_note } : {}),
|
|
640
640
|
},
|
|
641
641
|
},
|
|
642
|
+
...(!quality.valid ? { quality_note: quality.quality_note } : {}),
|
|
642
643
|
};
|
|
643
644
|
}
|
|
644
645
|
|
|
@@ -1163,9 +1164,27 @@ async function executeBrowserCapture(
|
|
|
1163
1164
|
cleanEndpoints.push(canonicalDocumentEndpoint);
|
|
1164
1165
|
}
|
|
1165
1166
|
|
|
1166
|
-
|
|
1167
|
+
let pageArtifact = captured.html
|
|
1167
1168
|
? buildPageArtifactCapture(url, intent, captured.html, authBackedCapture)
|
|
1168
1169
|
: {};
|
|
1170
|
+
|
|
1171
|
+
// SSR fallback: if Kuri's headless Chrome was bot-detected and served stripped
|
|
1172
|
+
// HTML, the DOM extraction above will fail or return low quality. Try a plain
|
|
1173
|
+
// HTTP fetch — many sites serve full SSR HTML to normal requests.
|
|
1174
|
+
if (!pageArtifact.endpoint) {
|
|
1175
|
+
const kuriHtmlLen = captured.html?.length ?? 0;
|
|
1176
|
+
const ssrFallback = await tryHttpFetch(url, {}, []).catch(() => null);
|
|
1177
|
+
if (ssrFallback && ssrFallback.html.length > kuriHtmlLen * 1.2) {
|
|
1178
|
+
console.log(`[ssr-fallback] Kuri HTML=${kuriHtmlLen}, fetch HTML=${ssrFallback.html.length} — retrying DOM extraction`);
|
|
1179
|
+
const ssrArtifact = buildPageArtifactCapture(ssrFallback.final_url || url, intent, ssrFallback.html, authBackedCapture);
|
|
1180
|
+
if (ssrArtifact.endpoint) {
|
|
1181
|
+
console.log(`[ssr-fallback] success — extracted structured data via plain HTTP fetch`);
|
|
1182
|
+
pageArtifact = ssrArtifact;
|
|
1183
|
+
} else {
|
|
1184
|
+
console.log(`[ssr-fallback] fetch got larger HTML but extraction still failed${ssrArtifact.quality_note ? `: ${ssrArtifact.quality_note}` : ""}`);
|
|
1185
|
+
}
|
|
1186
|
+
}
|
|
1187
|
+
}
|
|
1169
1188
|
const domArtifactEndpoint = pageArtifact.endpoint;
|
|
1170
1189
|
const domArtifactResult = pageArtifact.result;
|
|
1171
1190
|
const inferredOnlyCapture = cleanEndpoints.length > 0 && cleanEndpoints.every((endpoint) => isBundleInferredEndpoint(endpoint));
|
|
@@ -1249,7 +1268,8 @@ async function executeBrowserCapture(
|
|
|
1249
1268
|
};
|
|
1250
1269
|
}
|
|
1251
1270
|
|
|
1252
|
-
if (pageArtifact.quality_note) {
|
|
1271
|
+
if (pageArtifact.quality_note && !pageArtifact.endpoint) {
|
|
1272
|
+
// Quality gate rejected AND no endpoint — nothing useful extracted
|
|
1253
1273
|
const trace: ExecutionTrace = stampTrace({
|
|
1254
1274
|
trace_id: traceId,
|
|
1255
1275
|
skill_id: skill.skill_id,
|
|
Binary file
|
|
Binary file
|