unbrowse 9.7.0 → 9.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/runtime/cli.js +292 -187
- package/runtime/mcp.js +292 -187
- package/vendor/kuri/darwin-arm64/libkuri_ffi.dylib +0 -0
- package/vendor/kuri/darwin-x64/libkuri_ffi.dylib +0 -0
- package/vendor/kuri/linux-arm64/libkuri_ffi.so +0 -0
- package/vendor/kuri/linux-x64/kuri +0 -0
- package/vendor/kuri/linux-x64/libkuri_ffi.so +0 -0
- package/vendor/kuri/manifest.json +7 -7
- package/vendor/kuri/win-x64/kuri.exe +0 -0
package/package.json
CHANGED
package/runtime/cli.js
CHANGED
|
@@ -2350,7 +2350,7 @@ var init_telemetry = __esm(() => {
|
|
|
2350
2350
|
});
|
|
2351
2351
|
|
|
2352
2352
|
// .tmp-runtime-src/build-info.generated.ts
|
|
2353
|
-
var BUILD_RELEASE_VERSION = "9.
|
|
2353
|
+
var BUILD_RELEASE_VERSION = "9.8.0", BUILD_GIT_SHA = "255142bb4c25", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS44LjAiLCJnaXRfc2hhIjoiMjU1MTQyYmI0YzI1IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUAyNTUxNDJiYjRjMjUiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA2OjA4OjA2LjY3MVoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "Tw3ScHlFYGaEtPwKLhcPI_lgQUgjAZmhWKSi4fDFMw4", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
|
|
2354
2354
|
|
|
2355
2355
|
// .tmp-runtime-src/version.ts
|
|
2356
2356
|
import { createHash as createHash7 } from "crypto";
|
|
@@ -46122,6 +46122,42 @@ function urlPathLooksListLike2(contextUrl) {
|
|
|
46122
46122
|
return false;
|
|
46123
46123
|
}
|
|
46124
46124
|
}
|
|
46125
|
+
/**
 * Reduces a link target to a coarse path template in which id-like segments
 * are replaced by "{id}" (e.g. "/Product/12345" -> "product/{id}").
 *
 * A segment counts as id-like when it contains three or more consecutive
 * digits, is longer than 30 characters, consists only of hex digits/dashes
 * (8+ chars), or ends in "-NN".
 *
 * @param {string} href - Absolute or relative link target.
 * @returns {string|null} Up to the first three lower-cased segments joined by
 *   "/", or null when the link has no path segments, no id-like segment, or
 *   does not use http(s).
 */
function entityPointerTemplate(href) {
  let pathname;
  try {
    // The dummy base only exists so relative hrefs can be parsed.
    const parsed = new URL(href, "https://_");
    // mailto:, tel:, javascript:, data: ... parse successfully but never point
    // at a page; "tel:+15551234567" would otherwise look like an entity id.
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:")
      return null;
    pathname = parsed.pathname;
  } catch {
    // Unparseable href: best-effort strip of query string and fragment.
    pathname = href.split(/[?#]/)[0];
  }
  const segments = pathname.split("/").filter(Boolean);
  if (segments.length === 0)
    return null;
  let hasId = false;
  const shape = segments.map((segment) => {
    const low = segment.toLowerCase();
    const looksLikeId = /\d{3,}/.test(low) || low.length > 30 || /^[0-9a-f-]{8,}$/.test(low) || /-\d{2,}$/.test(low);
    if (!looksLikeId)
      return low;
    hasId = true;
    return "{id}";
  });
  // Only the first three segments take part in the template, even when the
  // id-like segment that qualified the link sits deeper in the path.
  return hasId ? shape.slice(0, 3).join("/") : null;
}
|
|
46148
|
+
/**
 * Reports whether a set of links contains a repeated entity-pointer pattern,
 * i.e. at least `min` different hrefs that share one template as computed by
 * entityPointerTemplate().
 *
 * @param {Iterable<string>} hrefs - Link targets harvested from a page.
 * @param {number} [min=4] - Distinct links required for one template.
 * @returns {boolean} True as soon as any template reaches `min` distinct hrefs.
 */
function linksFormEntityCollection(hrefs, min = 4) {
  const distinctByTemplate = new Map();
  for (const href of hrefs) {
    const template = entityPointerTemplate(href);
    if (!template)
      continue;
    let members = distinctByTemplate.get(template);
    if (!members) {
      members = new Set();
      distinctByTemplate.set(template, members);
    }
    // Count each href once: the same link repeated in header, card image and
    // card title is still a single entity, not a collection.
    members.add(href);
    if (members.size >= min)
      return true;
  }
  return false;
}
|
|
46125
46161
|
function cardinalityMatches2(intent, subject, opts) {
|
|
46126
46162
|
const wantsMany = isListLikeIntent2(intent) || urlPathLooksListLike2(opts?.contextUrl);
|
|
46127
46163
|
if (!wantsMany)
|
|
@@ -56330,6 +56366,215 @@ var init_curl_impersonate_fallback = __esm(() => {
|
|
|
56330
56366
|
};
|
|
56331
56367
|
});
|
|
56332
56368
|
|
|
56369
|
+
// .tmp-runtime-src/execution/search-forms.ts
|
|
56370
|
+
var exports_search_forms = {};
|
|
56371
|
+
__export(exports_search_forms, {
|
|
56372
|
+
isStructuredSearchForm: () => isStructuredSearchForm,
|
|
56373
|
+
fillSearchRoute: () => fillSearchRoute,
|
|
56374
|
+
detectSearchForms: () => detectSearchForms,
|
|
56375
|
+
deriveSearchRouteTemplates: () => deriveSearchRouteTemplates
|
|
56376
|
+
});
|
|
56377
|
+
/**
 * Infers path-based "search route" templates from the same-site links of a
 * page. Every slug-like segment of every link is tried as the variable slot;
 * a template is kept when at least `minDistinct` different values were seen in
 * that slot (e.g. "/category/shoes", "/category/hats", ... -> "/category/{query}").
 *
 * @param {string} html - Raw page markup.
 * @param {number} [minDistinct=4] - Distinct slot values required per template.
 * @returns {{template: string, samples: string[], count: number}[]} Templates
 *   ordered by descending distinct-value count; `samples` holds up to five
 *   lower-cased observed values.
 */
function deriveSearchRouteTemplates(html, minDistinct = 4) {
  const hrefs = new Set();
  // Root-relative links without query/fragment only. The (?!\/) lookahead
  // rejects protocol-relative URLs ("//cdn.example.com/x"), which point at
  // another host and must not be turned into same-origin routes.
  for (const match of html.matchAll(/href\s*=\s*["'](\/(?!\/)[^"'?#\s]+)["']/gi))
    hrefs.add(match[1]);
  const valuesByTemplate = new Map();
  for (const href of hrefs) {
    const segments = href.split("/").filter(Boolean);
    if (segments.length < 1 || segments.length > 4)
      continue;
    const trailing = href.endsWith("/") ? "/" : "";
    segments.forEach((value, slot) => {
      // A slot candidate is a short alphanumeric slug without an id-like digit
      // run. The slug pattern already excludes dots, so file names never pass.
      if (!/^[a-z][a-z0-9-]{1,40}$/i.test(value) || /\d{3,}/.test(value))
        return;
      const shape = segments.map((segment, position) => position === slot ? "{query}" : segment).join("/");
      const key = `/${shape}${trailing}`;
      let values = valuesByTemplate.get(key);
      if (!values) {
        values = new Set();
        valuesByTemplate.set(key, values);
      }
      values.add(value.toLowerCase());
    });
  }
  const templates = [];
  for (const [template, values] of valuesByTemplate) {
    if (values.size >= minDistinct)
      templates.push({ template, samples: [...values].slice(0, 5), count: values.size });
  }
  return templates.sort((a, b) => b.count - a.count);
}
|
|
56405
|
+
/**
 * Builds a concrete URL from a route template by substituting the first
 * "{query}" placeholder with the trimmed, lower-cased, URI-encoded query.
 *
 * @param {string} origin - Site origin; trailing slashes are dropped.
 * @param {string} template - Path template such as "/category/{query}".
 * @param {string} query - Search term to insert.
 * @returns {string} Origin followed by the filled-in path.
 */
function fillSearchRoute(origin, template, query) {
  const normalizedQuery = query.trim().toLowerCase();
  const encoded = encodeURIComponent(normalizedQuery);
  const base = origin.replace(/\/+$/, "");
  const path = template.replace("{query}", encoded);
  return `${base}${path}`;
}
|
|
56409
|
+
/**
 * Tells whether a detected form spec is usable: it must list at least one
 * field and carry a non-empty submit selector.
 *
 * @param {{fields: unknown[], submit_selector?: string}} spec
 * @returns {boolean}
 */
function isStructuredSearchForm(spec) {
  if (!(spec.fields.length > 0))
    return false;
  return Boolean(spec.submit_selector);
}
|
|
56412
|
+
/**
 * Derives a CSS selector for a <form> from its parsed attributes, preferring
 * id, then name, then action, and finally its ordinal position.
 *
 * @param {Record<string, string>} attribs - Parsed attributes of the form tag.
 * @param {number} index - Zero-based position of the form among matched forms.
 * @returns {string} CSS selector string.
 */
function formSelectorFromElement(attribs, index) {
  // Attribute values come from arbitrary markup: escape quotes/backslashes so
  // the generated selector stays syntactically valid.
  const quoted = (value) => `"${String(value).replace(/["\\]/g, "\\$&")}"`;
  const { id, name, action } = attribs;
  if (id) {
    // "#id" is only valid for plain identifiers; ids such as "search.form" or
    // "2col" need the attribute form instead.
    return /^[A-Za-z_][\w-]*$/.test(id) ? `form#${id}` : `form[id=${quoted(id)}]`;
  }
  if (name)
    return `form[name=${quoted(name)}]`;
  if (action)
    return `form[action=${quoted(action)}]`;
  // NOTE(review): :nth-of-type counts siblings under one parent, while `index`
  // looks like a document-order counter — confirm this fallback with callers.
  return `form:nth-of-type(${index + 1})`;
}
|
|
56424
|
+
/**
 * Derives a CSS selector for a form control from its parsed attributes,
 * preferring id, then name, and falling back to the bare tag name.
 *
 * @param {Record<string, string>} attribs - Parsed attributes of the control.
 * @param {string} tagName - Lower-cased tag name ("input", "select", "textarea").
 * @returns {string} CSS selector string.
 */
function inputSelectorFromElement(attribs, tagName) {
  // Escape quotes/backslashes so values taken from markup cannot break the
  // attribute selector.
  const quoted = (value) => `"${String(value).replace(/["\\]/g, "\\$&")}"`;
  const { id, name } = attribs;
  if (id) {
    // "#id" only works for plain identifiers; anything else (leading digit,
    // ":" or "." inside) is expressed as an attribute selector.
    return /^[A-Za-z_][\w-]*$/.test(id) ? `#${id}` : `[id=${quoted(id)}]`;
  }
  if (name)
    return `${tagName}[name=${quoted(name)}]`;
  return tagName;
}
|
|
56433
|
+
/**
 * Maps a form control to the field type used in search-form specs.
 *
 * @param {string|undefined} typeAttr - Value of the control's `type` attribute.
 * @param {string} tagName - Lower-cased tag name of the control.
 * @returns {"select"|"text"|"radio"|"checkbox"|"date"|"hidden"|null} The field
 *   type, or null for controls that are never reported as fields (buttons,
 *   password and file inputs).
 */
function mapInputType(typeAttr, tagName) {
  if (tagName === "select")
    return "select";
  if (tagName === "textarea")
    return "text";
  // A missing type means "text"; attribute values are matched
  // case-insensitively and with surrounding whitespace ignored.
  const type = (typeAttr ?? "text").trim().toLowerCase();
  switch (type) {
    case "radio":
    case "checkbox":
    case "date":
    case "hidden":
      return type;
    case "submit":
    case "button":
    case "image":
    case "reset":
    case "password":
    case "file":
      return null;
    default:
      // Every other type (search, email, tel, number, unknown values, ...) is
      // treated as free text. The former SUPPORTED_INPUT_TYPES lookup returned
      // the same value as this fallthrough, so it was dropped.
      return "text";
  }
}
|
|
56455
|
+
/**
 * Parses the attribute portion of an HTML start tag into a plain object.
 * Values may be double-quoted, single-quoted or unquoted; attributes without
 * a value map to the empty string. When a name repeats, the last value wins.
 *
 * @param {string} attrStr - Text between the tag name and the closing ">".
 * @returns {Record<string, string>} Attribute values keyed by lower-cased name.
 */
function parseAttrs(attrStr) {
  const attrs = {};
  const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
  for (const [, rawName, doubleQuoted, singleQuoted, bare] of attrStr.matchAll(attrRegex)) {
    // HTML attribute names are case-insensitive; callers look up lowercase
    // keys (name, type, id, ...), so NAME="q" must land under "name".
    attrs[rawName.toLowerCase()] = doubleQuoted ?? singleQuoted ?? bare ?? "";
  }
  return attrs;
}
|
|
56464
|
+
/**
 * Scans raw HTML for <form> elements that can serve as search/filter forms
 * and describes each one as a spec of selectors and fields.
 *
 * A form is reported when it has no login-style field (a name listed in
 * LOGIN_FIELD_NAMES or a password input), at least one non-hidden field, and
 * some submit control.
 *
 * @param {string} html - Raw page markup.
 * @returns {{form_selector: string, submit_selector: string, fields: object[]}[]}
 *   One spec per accepted form, in document order.
 */
function detectSearchForms(html) {
  const results = [];
  let formIndex = 0;
  for (const formMatch of html.matchAll(/<form([^>]*)>([\s\S]*?)<\/form>/gi)) {
    const formElAttrs = parseAttrs(formMatch[1]);
    const formBody = formMatch[2];
    const fields = [];
    const seenNames = new Set();
    let hasLoginField = false;
    let hasSearchLikeField = false;
    for (const fieldMatch of formBody.matchAll(/<(input|select|textarea)([^>]*)\/?>/gi)) {
      const tagName = fieldMatch[1].toLowerCase();
      const fieldAttrs = parseAttrs(fieldMatch[2]);
      const name = fieldAttrs.name ?? "";
      const typeAttr = fieldAttrs.type;
      const lowerName = name.toLowerCase();
      // Attribute values are case-insensitive, so TYPE="Password" must count.
      if (LOGIN_FIELD_NAMES.has(lowerName) || (typeAttr ?? "").toLowerCase() === "password") {
        hasLoginField = true;
      }
      if (SEARCH_FIELD_NAMES.has(lowerName)) {
        hasSearchLikeField = true;
      }
      const mappedType = mapInputType(typeAttr, tagName);
      if (!mappedType)
        continue;
      // Unnamed controls are only kept when they are free-text inputs.
      if (!name && mappedType !== "text")
        continue;
      // Radio groups legitimately repeat a name; everything else is deduped.
      if (seenNames.has(name) && mappedType !== "radio")
        continue;
      if (name)
        seenNames.add(name);
      let options;
      if (tagName === "select") {
        // Restrict the option scan to THIS select's body; scanning the whole
        // form would hand every select the options of all its siblings.
        const bodyStart = fieldMatch.index + fieldMatch[0].length;
        const closeRegex = /<\/select\s*>/gi;
        closeRegex.lastIndex = bodyStart;
        const closeMatch = closeRegex.exec(formBody);
        const selectBody = formBody.slice(bodyStart, closeMatch ? closeMatch.index : undefined);
        const optRegex = /<option\b[^>]*?\bvalue\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>/gi;
        const values = Array.from(selectBody.matchAll(optRegex), (m) => m[1] ?? m[2]);
        if (values.length > 0)
          options = values;
      }
      fields.push({
        name: name || `unnamed_${fields.length}`,
        type: mappedType,
        selector: inputSelectorFromElement(fieldAttrs, tagName),
        ...options ? { options } : {},
        required: fieldAttrs.required !== undefined
      });
    }
    // Accept type="submit", type='submit' and unquoted type=submit alike.
    let submitSelector = "";
    if (/<button\b[^>]*\btype\s*=\s*["']?submit\b/i.test(formBody)) {
      submitSelector = "button[type=submit]";
    } else if (/<input\b[^>]*\btype\s*=\s*["']?submit\b/i.test(formBody)) {
      submitSelector = 'input[type="submit"]';
    } else if (/<button/i.test(formBody)) {
      submitSelector = "button";
    }
    const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
    // NOTE(review): the last clause is always true once nonHiddenFields is
    // non-empty, so hasSearchLikeField currently has no effect — confirm
    // whether a stricter threshold was intended.
    if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
      results.push({
        form_selector: formSelectorFromElement(formElAttrs, formIndex),
        submit_selector: submitSelector,
        fields
      });
    }
    formIndex++;
  }
  return results;
}
|
|
56539
|
+
// Lookup tables used by the search-form detection code; they are filled in
// lazily when init_search_forms() runs.
var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
var init_search_forms = __esm(() => {
  // Field names treated as search-like.
  SEARCH_FIELD_NAMES = new Set(["q", "query", "search", "keyword", "keywords", "term", "terms", "find", "lookup", "filter", "s", "text", "input"]);
  // Field names that mark a form as a login/credential form.
  LOGIN_FIELD_NAMES = new Set(["password", "passwd", "pass", "pwd", "confirm_password", "username", "email", "login", "user"]);
  // Input `type` values recognised as free-text style inputs.
  SUPPORTED_INPUT_TYPES = new Set(["text", "search", "hidden", "date", "number", "tel", "email"]);
});
|
|
56577
|
+
|
|
56333
56578
|
// node_modules/.bun/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Event.js
|
|
56334
56579
|
var require_Event = __commonJS((exports, module) => {
|
|
56335
56580
|
module.exports = Event2;
|
|
@@ -72981,20 +73226,24 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
|
|
|
72981
73226
|
const hits = intentTokens.filter((tok) => haystack.includes(tok));
|
|
72982
73227
|
const hitRate = hits.length / intentTokens.length;
|
|
72983
73228
|
if (hitRate < 0.34) {
|
|
72984
|
-
|
|
72985
|
-
|
|
72986
|
-
|
|
72987
|
-
|
|
72988
|
-
|
|
72989
|
-
|
|
72990
|
-
|
|
72991
|
-
|
|
72992
|
-
|
|
72993
|
-
|
|
73229
|
+
const isCollection = isListLikeIntent2(intent) && linksFormEntityCollection(Array.from(html.matchAll(/href\s*=\s*["']([^"']+)["']/gi), (m) => m[1]));
|
|
73230
|
+
if (!isCollection) {
|
|
73231
|
+
return {
|
|
73232
|
+
rejected: true,
|
|
73233
|
+
reason: "intent_mismatch",
|
|
73234
|
+
evidence: {
|
|
73235
|
+
intent_tokens: intentTokens,
|
|
73236
|
+
response_token_hits: hits,
|
|
73237
|
+
response_token_hit_rate: hitRate,
|
|
73238
|
+
html_bytes: html.length
|
|
73239
|
+
}
|
|
73240
|
+
};
|
|
73241
|
+
}
|
|
72994
73242
|
}
|
|
72995
73243
|
}
|
|
72996
73244
|
}
|
|
72997
73245
|
const { url_template, input_params, path_params, query } = extractHtmlHoles(url);
|
|
73246
|
+
const routing_candidates = buildSearchRouteCandidates(html, url, intent);
|
|
72998
73247
|
return {
|
|
72999
73248
|
rejected: false,
|
|
73000
73249
|
title,
|
|
@@ -73008,12 +73257,40 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
|
|
|
73008
73257
|
text_excerpt: bodyText.slice(0, MARKDOWN_BUDGET),
|
|
73009
73258
|
markdown: htmlToMarkdownSafe(html, bodyText),
|
|
73010
73259
|
tables: extractTables(html),
|
|
73260
|
+
...routing_candidates.length > 0 ? { routing_candidates } : {},
|
|
73011
73261
|
extraction: {
|
|
73012
73262
|
source: "direct-document",
|
|
73013
73263
|
rejected: false
|
|
73014
73264
|
}
|
|
73015
73265
|
};
|
|
73016
73266
|
}
|
|
73267
|
+
/**
 * Extracts the search term from a free-text intent: lower-cased words of three
 * or more characters (starting with a letter), minus QUERY_STOPWORDS and minus
 * tokens of the URL's hostname, de-duplicated in first-seen order.
 *
 * @param {string} intent - Natural-language description of the request.
 * @param {string} url - Page URL; an unparseable URL simply contributes no
 *   hostname tokens.
 * @returns {string} Remaining words joined by single spaces (may be empty).
 */
function intentQueryTerm(intent, url) {
  let hostTokens;
  try {
    hostTokens = new Set(new URL(url).hostname.toLowerCase().split(/[.-]/));
  } catch {
    // Best effort: without a valid URL there is nothing to exclude.
    hostTokens = new Set();
  }
  const kept = new Set();
  for (const word of intent.toLowerCase().match(/[a-z][a-z0-9]{2,}/g) ?? []) {
    if (QUERY_STOPWORDS.has(word) || hostTokens.has(word))
      continue;
    kept.add(word);
  }
  return Array.from(kept).join(" ").trim();
}
|
|
73275
|
+
/**
 * Proposes up to three concrete search URLs for a list-like intent by filling
 * the page's derived route templates with the intent's query term.
 *
 * @param {string} html - Raw page markup.
 * @param {string} url - URL the markup was fetched from.
 * @param {string} intent - Natural-language description of the request.
 * @returns {{url: string, template: string, query: string, samples: string[]}[]}
 *   Candidates in template order; empty when the intent is missing or not
 *   list-like, no query term remains, or `url` cannot be parsed.
 */
function buildSearchRouteCandidates(html, url, intent) {
  if (!intent || !isListLikeIntent2(intent))
    return [];
  const queryTerm = intentQueryTerm(intent, url);
  if (!queryTerm)
    return [];
  let origin;
  try {
    origin = new URL(url).origin;
  } catch {
    return [];
  }
  const candidates = [];
  for (const { template, samples } of deriveSearchRouteTemplates(html).slice(0, 3)) {
    candidates.push({
      url: fillSearchRoute(origin, template, queryTerm),
      template,
      query: queryTerm,
      samples
    });
  }
  return candidates;
}
|
|
73017
73294
|
async function fetchDirectDocument(url) {
|
|
73018
73295
|
if (!isDirectDocumentEligibleUrl(url))
|
|
73019
73296
|
return null;
|
|
@@ -73201,10 +73478,12 @@ function cellText(html) {
|
|
|
73201
73478
|
/**
 * Decodes the handful of HTML entities that commonly appear in extracted
 * text: non-breaking space, ampersand, angle brackets, double quote and
 * apostrophe.
 *
 * @param {string} input - Text that may contain HTML entities.
 * @returns {string} Text with those entities replaced by their characters.
 */
function decodeHtmlEntityText(input) {
  // NOTE(review): the entity names were lost in this copy of the file (each
  // replace had become a no-op); the apostrophe is assumed to be the numeric
  // form — confirm against the original source.
  const decoded = { nbsp: " ", amp: "&", lt: "<", gt: ">", quot: '"', "#39": "'" };
  // One pass over the input so already-decoded output is never decoded again
  // (an escaped ampersand followed by "lt;" must not become "<").
  return input.replace(/&(nbsp|amp|lt|gt|quot|#39);/g, (_, name) => decoded[name]);
}
|
|
73204
|
-
var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, buildBloombergDirectDocumentResult;
|
|
73481
|
+
var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, QUERY_STOPWORDS, buildBloombergDirectDocumentResult;
|
|
73205
73482
|
var init_direct_document = __esm(() => {
|
|
73206
73483
|
init_curl_impersonate_fallback();
|
|
73207
73484
|
init_proxy_fetch();
|
|
73485
|
+
init_cardinality2();
|
|
73486
|
+
init_search_forms();
|
|
73208
73487
|
HTML_RE = /text\/html|application\/xhtml\+xml/i;
|
|
73209
73488
|
CHALLENGE_RE = /\b(access denied|are you a robot|captcha|just a moment|pardon our interruption|robot check|unusual traffic|verify you are human)\b/i;
|
|
73210
73489
|
INTERSTITIAL_RE = /\b(please wait for verification|just a moment|cf-mitigated|datadome|akamai bot|perimeterx|sign in to continue|log in to (?:continue|access)|javascript is not available)\b/i;
|
|
@@ -73287,6 +73566,7 @@ var init_direct_document = __esm(() => {
|
|
|
73287
73566
|
"look"
|
|
73288
73567
|
]);
|
|
73289
73568
|
MARKDOWN_BUDGET = Math.max(1000, Number(process.env.UNBROWSE_MARKDOWN_BUDGET ?? "12000") || 12000);
|
|
73569
|
+
QUERY_STOPWORDS = new Set(("resolve unbrowse execute run walk go fetch open view want need please " + "find search browse list lookup discover show get me a an the on of for in to " + "with and or all my your this that some good best top new latest cheap near").split(" "));
|
|
73290
73570
|
buildBloombergDirectDocumentResult = buildDirectDocumentResult;
|
|
73291
73571
|
});
|
|
73292
73572
|
|
|
@@ -121959,181 +122239,6 @@ function clampToFloor(score, demotion, floor) {
|
|
|
121959
122239
|
}
|
|
121960
122240
|
var HARD_NEGATIVE_FLOOR = -2000, WEAK_NEGATIVE_FLOOR = -400, PAGE_ARTIFACT_DEMOTION = 800, EMPTY_ENTITY_BAG_DEMOTION = 650, EMPTY_ENTITY_BAG_FLOOR = -700;
|
|
121961
122241
|
|
|
121962
|
-
// .tmp-runtime-src/execution/search-forms.ts
|
|
121963
|
-
var exports_search_forms = {};
|
|
121964
|
-
__export(exports_search_forms, {
|
|
121965
|
-
isStructuredSearchForm: () => isStructuredSearchForm,
|
|
121966
|
-
detectSearchForms: () => detectSearchForms
|
|
121967
|
-
});
|
|
121968
|
-
function isStructuredSearchForm(spec) {
|
|
121969
|
-
return spec.fields.length > 0 && !!spec.submit_selector;
|
|
121970
|
-
}
|
|
121971
|
-
function formSelectorFromElement(attribs, index2) {
|
|
121972
|
-
const id = attribs.id;
|
|
121973
|
-
if (id)
|
|
121974
|
-
return `form#${id}`;
|
|
121975
|
-
const name = attribs.name;
|
|
121976
|
-
if (name)
|
|
121977
|
-
return `form[name="${name}"]`;
|
|
121978
|
-
const action2 = attribs.action;
|
|
121979
|
-
if (action2)
|
|
121980
|
-
return `form[action="${action2}"]`;
|
|
121981
|
-
return `form:nth-of-type(${index2 + 1})`;
|
|
121982
|
-
}
|
|
121983
|
-
function inputSelectorFromElement(attribs, tagName) {
|
|
121984
|
-
const id = attribs.id;
|
|
121985
|
-
if (id)
|
|
121986
|
-
return `#${id}`;
|
|
121987
|
-
const name = attribs.name;
|
|
121988
|
-
if (name)
|
|
121989
|
-
return `${tagName}[name="${name}"]`;
|
|
121990
|
-
return tagName;
|
|
121991
|
-
}
|
|
121992
|
-
function mapInputType(typeAttr, tagName) {
|
|
121993
|
-
if (tagName === "select")
|
|
121994
|
-
return "select";
|
|
121995
|
-
if (tagName === "textarea")
|
|
121996
|
-
return "text";
|
|
121997
|
-
const t = (typeAttr ?? "text").toLowerCase();
|
|
121998
|
-
if (t === "radio")
|
|
121999
|
-
return "radio";
|
|
122000
|
-
if (t === "checkbox")
|
|
122001
|
-
return "checkbox";
|
|
122002
|
-
if (t === "date")
|
|
122003
|
-
return "date";
|
|
122004
|
-
if (t === "hidden")
|
|
122005
|
-
return "hidden";
|
|
122006
|
-
if (t === "submit" || t === "button" || t === "image" || t === "reset")
|
|
122007
|
-
return null;
|
|
122008
|
-
if (t === "password" || t === "file")
|
|
122009
|
-
return null;
|
|
122010
|
-
if (SUPPORTED_INPUT_TYPES.has(t))
|
|
122011
|
-
return "text";
|
|
122012
|
-
return "text";
|
|
122013
|
-
}
|
|
122014
|
-
function parseAttrs(attrStr) {
|
|
122015
|
-
const attrs = {};
|
|
122016
|
-
const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
|
|
122017
|
-
let m;
|
|
122018
|
-
while ((m = attrRegex.exec(attrStr)) !== null) {
|
|
122019
|
-
attrs[m[1]] = m[2] ?? m[3] ?? m[4] ?? "";
|
|
122020
|
-
}
|
|
122021
|
-
return attrs;
|
|
122022
|
-
}
|
|
122023
|
-
function detectSearchForms(html3) {
|
|
122024
|
-
const results = [];
|
|
122025
|
-
const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
|
|
122026
|
-
let formMatch;
|
|
122027
|
-
let formIndex = 0;
|
|
122028
|
-
while ((formMatch = formRegex.exec(html3)) !== null) {
|
|
122029
|
-
const formAttrs = formMatch[1];
|
|
122030
|
-
const formBody = formMatch[2];
|
|
122031
|
-
const formElAttrs = parseAttrs(formAttrs);
|
|
122032
|
-
const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
|
|
122033
|
-
let fieldMatch;
|
|
122034
|
-
const fields = [];
|
|
122035
|
-
const seenNames = new Set;
|
|
122036
|
-
let hasLoginField = false;
|
|
122037
|
-
let hasSearchLikeField = false;
|
|
122038
|
-
while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
|
|
122039
|
-
const tagName = fieldMatch[1].toLowerCase();
|
|
122040
|
-
const fieldAttrs = parseAttrs(fieldMatch[2]);
|
|
122041
|
-
const name = fieldAttrs.name ?? "";
|
|
122042
|
-
const typeAttr = fieldAttrs.type;
|
|
122043
|
-
if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || typeAttr === "password") {
|
|
122044
|
-
hasLoginField = true;
|
|
122045
|
-
}
|
|
122046
|
-
if (SEARCH_FIELD_NAMES.has(name.toLowerCase())) {
|
|
122047
|
-
hasSearchLikeField = true;
|
|
122048
|
-
}
|
|
122049
|
-
const mappedType = mapInputType(typeAttr, tagName);
|
|
122050
|
-
if (!mappedType)
|
|
122051
|
-
continue;
|
|
122052
|
-
if (!name && mappedType !== "text")
|
|
122053
|
-
continue;
|
|
122054
|
-
if (seenNames.has(name) && mappedType !== "radio")
|
|
122055
|
-
continue;
|
|
122056
|
-
if (name)
|
|
122057
|
-
seenNames.add(name);
|
|
122058
|
-
let options;
|
|
122059
|
-
if (tagName === "select") {
|
|
122060
|
-
const optRegex = /<option[^>]*value="([^"]*)"[^>]*>/gi;
|
|
122061
|
-
let optMatch;
|
|
122062
|
-
options = [];
|
|
122063
|
-
while ((optMatch = optRegex.exec(formBody)) !== null) {
|
|
122064
|
-
options.push(optMatch[1]);
|
|
122065
|
-
}
|
|
122066
|
-
if (options.length === 0)
|
|
122067
|
-
options = undefined;
|
|
122068
|
-
}
|
|
122069
|
-
fields.push({
|
|
122070
|
-
name: name || `unnamed_${fields.length}`,
|
|
122071
|
-
type: mappedType,
|
|
122072
|
-
selector: inputSelectorFromElement(fieldAttrs, tagName),
|
|
122073
|
-
...options ? { options } : {},
|
|
122074
|
-
required: fieldAttrs.required !== undefined
|
|
122075
|
-
});
|
|
122076
|
-
}
|
|
122077
|
-
let submitSelector = "";
|
|
122078
|
-
if (/<button[^>]*type\s*=\s*"submit"/i.test(formBody)) {
|
|
122079
|
-
submitSelector = "button[type=submit]";
|
|
122080
|
-
} else if (/<input[^>]*type\s*=\s*"submit"/i.test(formBody)) {
|
|
122081
|
-
submitSelector = 'input[type="submit"]';
|
|
122082
|
-
} else if (/<button/i.test(formBody)) {
|
|
122083
|
-
submitSelector = "button";
|
|
122084
|
-
}
|
|
122085
|
-
const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
|
|
122086
|
-
if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
|
|
122087
|
-
const formSelector = formSelectorFromElement(formElAttrs, formIndex);
|
|
122088
|
-
results.push({
|
|
122089
|
-
form_selector: formSelector,
|
|
122090
|
-
submit_selector: submitSelector,
|
|
122091
|
-
fields
|
|
122092
|
-
});
|
|
122093
|
-
}
|
|
122094
|
-
formIndex++;
|
|
122095
|
-
}
|
|
122096
|
-
return results;
|
|
122097
|
-
}
|
|
122098
|
-
var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
|
|
122099
|
-
var init_search_forms = __esm(() => {
|
|
122100
|
-
SEARCH_FIELD_NAMES = new Set([
|
|
122101
|
-
"q",
|
|
122102
|
-
"query",
|
|
122103
|
-
"search",
|
|
122104
|
-
"keyword",
|
|
122105
|
-
"keywords",
|
|
122106
|
-
"term",
|
|
122107
|
-
"terms",
|
|
122108
|
-
"find",
|
|
122109
|
-
"lookup",
|
|
122110
|
-
"filter",
|
|
122111
|
-
"s",
|
|
122112
|
-
"text",
|
|
122113
|
-
"input"
|
|
122114
|
-
]);
|
|
122115
|
-
LOGIN_FIELD_NAMES = new Set([
|
|
122116
|
-
"password",
|
|
122117
|
-
"passwd",
|
|
122118
|
-
"pass",
|
|
122119
|
-
"pwd",
|
|
122120
|
-
"confirm_password",
|
|
122121
|
-
"username",
|
|
122122
|
-
"email",
|
|
122123
|
-
"login",
|
|
122124
|
-
"user"
|
|
122125
|
-
]);
|
|
122126
|
-
SUPPORTED_INPUT_TYPES = new Set([
|
|
122127
|
-
"text",
|
|
122128
|
-
"search",
|
|
122129
|
-
"hidden",
|
|
122130
|
-
"date",
|
|
122131
|
-
"number",
|
|
122132
|
-
"tel",
|
|
122133
|
-
"email"
|
|
122134
|
-
]);
|
|
122135
|
-
});
|
|
122136
|
-
|
|
122137
122242
|
// .tmp-runtime-src/state/stateless.ts
|
|
122138
122243
|
function isStateless() {
|
|
122139
122244
|
const v = process.env.UNBROWSE_STATELESS;
|
package/runtime/mcp.js
CHANGED
|
@@ -36310,7 +36310,7 @@ var init_cached_resolution = __esm(() => {
|
|
|
36310
36310
|
});
|
|
36311
36311
|
|
|
36312
36312
|
// .tmp-runtime-src/build-info.generated.ts
|
|
36313
|
-
var BUILD_RELEASE_VERSION = "9.
|
|
36313
|
+
var BUILD_RELEASE_VERSION = "9.8.0", BUILD_GIT_SHA = "255142bb4c25", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS44LjAiLCJnaXRfc2hhIjoiMjU1MTQyYmI0YzI1IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUAyNTUxNDJiYjRjMjUiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA2OjA4OjA2LjY3MVoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "Tw3ScHlFYGaEtPwKLhcPI_lgQUgjAZmhWKSi4fDFMw4", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
|
|
36314
36314
|
|
|
36315
36315
|
// .tmp-runtime-src/version.ts
|
|
36316
36316
|
import { createHash as createHash4 } from "crypto";
|
|
@@ -43252,6 +43252,42 @@ function urlPathLooksListLike(contextUrl) {
|
|
|
43252
43252
|
return false;
|
|
43253
43253
|
}
|
|
43254
43254
|
}
|
|
43255
|
+
/**
 * Reduces a link target to a coarse path template in which id-like segments
 * are replaced by "{id}" (e.g. "/Product/12345" -> "product/{id}").
 *
 * A segment counts as id-like when it contains three or more consecutive
 * digits, is longer than 30 characters, consists only of hex digits/dashes
 * (8+ chars), or ends in "-NN".
 *
 * @param {string} href - Absolute or relative link target.
 * @returns {string|null} Up to the first three lower-cased segments joined by
 *   "/", or null when the link has no path segments, no id-like segment, or
 *   does not use http(s).
 */
function entityPointerTemplate(href) {
  let pathname;
  try {
    // The dummy base only exists so relative hrefs can be parsed.
    const parsed = new URL(href, "https://_");
    // mailto:, tel:, javascript:, data: ... parse successfully but never point
    // at a page; "tel:+15551234567" would otherwise look like an entity id.
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:")
      return null;
    pathname = parsed.pathname;
  } catch {
    // Unparseable href: best-effort strip of query string and fragment.
    pathname = href.split(/[?#]/)[0];
  }
  const segments = pathname.split("/").filter(Boolean);
  if (segments.length === 0)
    return null;
  let hasId = false;
  const shape = segments.map((segment) => {
    const low = segment.toLowerCase();
    const looksLikeId = /\d{3,}/.test(low) || low.length > 30 || /^[0-9a-f-]{8,}$/.test(low) || /-\d{2,}$/.test(low);
    if (!looksLikeId)
      return low;
    hasId = true;
    return "{id}";
  });
  // Only the first three segments take part in the template, even when the
  // id-like segment that qualified the link sits deeper in the path.
  return hasId ? shape.slice(0, 3).join("/") : null;
}
|
|
43278
|
+
/**
 * Reports whether a set of links contains a repeated entity-pointer pattern,
 * i.e. at least `min` different hrefs that share one template as computed by
 * entityPointerTemplate().
 *
 * @param {Iterable<string>} hrefs - Link targets harvested from a page.
 * @param {number} [min=4] - Distinct links required for one template.
 * @returns {boolean} True as soon as any template reaches `min` distinct hrefs.
 */
function linksFormEntityCollection(hrefs, min = 4) {
  const distinctByTemplate = new Map();
  for (const href of hrefs) {
    const template = entityPointerTemplate(href);
    if (!template)
      continue;
    let members = distinctByTemplate.get(template);
    if (!members) {
      members = new Set();
      distinctByTemplate.set(template, members);
    }
    // Count each href once: the same link repeated in header, card image and
    // card title is still a single entity, not a collection.
    members.add(href);
    if (members.size >= min)
      return true;
  }
  return false;
}
|
|
43255
43291
|
function cardinalityMatches(intent, subject, opts) {
|
|
43256
43292
|
const wantsMany = isListLikeIntent(intent) || urlPathLooksListLike(opts?.contextUrl);
|
|
43257
43293
|
if (!wantsMany)
|
|
@@ -54387,6 +54423,215 @@ var init_curl_impersonate_fallback = __esm(() => {
|
|
|
54387
54423
|
};
|
|
54388
54424
|
});
|
|
54389
54425
|
|
|
54426
|
+
// .tmp-runtime-src/execution/search-forms.ts
|
|
54427
|
+
var exports_search_forms = {};
|
|
54428
|
+
__export(exports_search_forms, {
|
|
54429
|
+
isStructuredSearchForm: () => isStructuredSearchForm,
|
|
54430
|
+
fillSearchRoute: () => fillSearchRoute,
|
|
54431
|
+
detectSearchForms: () => detectSearchForms,
|
|
54432
|
+
deriveSearchRouteTemplates: () => deriveSearchRouteTemplates
|
|
54433
|
+
});
|
|
54434
|
+
/**
 * Infers path-based "search route" templates from the same-site links of a
 * page. Every slug-like segment of every link is tried as the variable slot;
 * a template is kept when at least `minDistinct` different values were seen in
 * that slot (e.g. "/category/shoes", "/category/hats", ... -> "/category/{query}").
 *
 * @param {string} html - Raw page markup.
 * @param {number} [minDistinct=4] - Distinct slot values required per template.
 * @returns {{template: string, samples: string[], count: number}[]} Templates
 *   ordered by descending distinct-value count; `samples` holds up to five
 *   lower-cased observed values.
 */
function deriveSearchRouteTemplates(html, minDistinct = 4) {
  const hrefs = new Set();
  // Root-relative links without query/fragment only. The (?!\/) lookahead
  // rejects protocol-relative URLs ("//cdn.example.com/x"), which point at
  // another host and must not be turned into same-origin routes.
  for (const match of html.matchAll(/href\s*=\s*["'](\/(?!\/)[^"'?#\s]+)["']/gi))
    hrefs.add(match[1]);
  const valuesByTemplate = new Map();
  for (const href of hrefs) {
    const segments = href.split("/").filter(Boolean);
    if (segments.length < 1 || segments.length > 4)
      continue;
    const trailing = href.endsWith("/") ? "/" : "";
    segments.forEach((value, slot) => {
      // A slot candidate is a short alphanumeric slug without an id-like digit
      // run. The slug pattern already excludes dots, so file names never pass.
      if (!/^[a-z][a-z0-9-]{1,40}$/i.test(value) || /\d{3,}/.test(value))
        return;
      const shape = segments.map((segment, position) => position === slot ? "{query}" : segment).join("/");
      const key = `/${shape}${trailing}`;
      let values = valuesByTemplate.get(key);
      if (!values) {
        values = new Set();
        valuesByTemplate.set(key, values);
      }
      values.add(value.toLowerCase());
    });
  }
  const templates = [];
  for (const [template, values] of valuesByTemplate) {
    if (values.size >= minDistinct)
      templates.push({ template, samples: [...values].slice(0, 5), count: values.size });
  }
  return templates.sort((a, b) => b.count - a.count);
}
|
|
54462
|
+
/**
 * Builds a concrete URL from a route template by substituting the first
 * "{query}" placeholder with the trimmed, lower-cased, URI-encoded query.
 *
 * @param {string} origin - Site origin; trailing slashes are dropped.
 * @param {string} template - Path template such as "/category/{query}".
 * @param {string} query - Search term to insert.
 * @returns {string} Origin followed by the filled-in path.
 */
function fillSearchRoute(origin, template, query) {
  const normalizedQuery = query.trim().toLowerCase();
  const encoded = encodeURIComponent(normalizedQuery);
  const base = origin.replace(/\/+$/, "");
  const path = template.replace("{query}", encoded);
  return `${base}${path}`;
}
|
|
54466
|
+
/**
 * Tells whether a detected form spec is usable: it must list at least one
 * field and carry a non-empty submit selector.
 *
 * @param {{fields: unknown[], submit_selector?: string}} spec
 * @returns {boolean}
 */
function isStructuredSearchForm(spec) {
  if (!(spec.fields.length > 0))
    return false;
  return Boolean(spec.submit_selector);
}
|
|
54469
|
+
/**
 * Derives a CSS selector for a <form> from its parsed attributes, preferring
 * id, then name, then action, and finally its ordinal position.
 *
 * @param {Record<string, string>} attribs - Parsed attributes of the form tag.
 * @param {number} index - Zero-based position of the form among matched forms.
 * @returns {string} CSS selector string.
 */
function formSelectorFromElement(attribs, index) {
  // Attribute values come from arbitrary markup: escape quotes/backslashes so
  // the generated selector stays syntactically valid.
  const quoted = (value) => `"${String(value).replace(/["\\]/g, "\\$&")}"`;
  const { id, name, action } = attribs;
  if (id) {
    // "#id" is only valid for plain identifiers; ids such as "search.form" or
    // "2col" need the attribute form instead.
    return /^[A-Za-z_][\w-]*$/.test(id) ? `form#${id}` : `form[id=${quoted(id)}]`;
  }
  if (name)
    return `form[name=${quoted(name)}]`;
  if (action)
    return `form[action=${quoted(action)}]`;
  // NOTE(review): :nth-of-type counts siblings under one parent, while `index`
  // looks like a document-order counter — confirm this fallback with callers.
  return `form:nth-of-type(${index + 1})`;
}
|
|
54481
|
+
/**
 * Derives a CSS selector for a form control from its parsed attributes,
 * preferring id, then name, and falling back to the bare tag name.
 *
 * @param {Record<string, string>} attribs - Parsed attributes of the control.
 * @param {string} tagName - Lower-cased tag name ("input", "select", "textarea").
 * @returns {string} CSS selector string.
 */
function inputSelectorFromElement(attribs, tagName) {
  // Escape quotes/backslashes so values taken from markup cannot break the
  // attribute selector.
  const quoted = (value) => `"${String(value).replace(/["\\]/g, "\\$&")}"`;
  const { id, name } = attribs;
  if (id) {
    // "#id" only works for plain identifiers; anything else (leading digit,
    // ":" or "." inside) is expressed as an attribute selector.
    return /^[A-Za-z_][\w-]*$/.test(id) ? `#${id}` : `[id=${quoted(id)}]`;
  }
  if (name)
    return `${tagName}[name=${quoted(name)}]`;
  return tagName;
}
|
|
54490
|
+
/**
 * Maps a form control to the field type used in search-form specs.
 *
 * @param {string|undefined} typeAttr - Value of the control's `type` attribute.
 * @param {string} tagName - Lower-cased tag name of the control.
 * @returns {"select"|"text"|"radio"|"checkbox"|"date"|"hidden"|null} The field
 *   type, or null for controls that are never reported as fields (buttons,
 *   password and file inputs).
 */
function mapInputType(typeAttr, tagName) {
  if (tagName === "select")
    return "select";
  if (tagName === "textarea")
    return "text";
  // A missing type means "text"; attribute values are matched
  // case-insensitively and with surrounding whitespace ignored.
  const type = (typeAttr ?? "text").trim().toLowerCase();
  switch (type) {
    case "radio":
    case "checkbox":
    case "date":
    case "hidden":
      return type;
    case "submit":
    case "button":
    case "image":
    case "reset":
    case "password":
    case "file":
      return null;
    default:
      // Every other type (search, email, tel, number, unknown values, ...) is
      // treated as free text. The former SUPPORTED_INPUT_TYPES lookup returned
      // the same value as this fallthrough, so it was dropped.
      return "text";
  }
}
|
|
54512
|
+
/**
 * Parses the attribute portion of an HTML start tag into a plain object.
 * Values may be double-quoted, single-quoted or unquoted; attributes without
 * a value map to the empty string. When a name repeats, the last value wins.
 *
 * @param {string} attrStr - Text between the tag name and the closing ">".
 * @returns {Record<string, string>} Attribute values keyed by lower-cased name.
 */
function parseAttrs(attrStr) {
  const attrs = {};
  const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
  for (const [, rawName, doubleQuoted, singleQuoted, bare] of attrStr.matchAll(attrRegex)) {
    // HTML attribute names are case-insensitive; callers look up lowercase
    // keys (name, type, id, ...), so NAME="q" must land under "name".
    attrs[rawName.toLowerCase()] = doubleQuoted ?? singleQuoted ?? bare ?? "";
  }
  return attrs;
}
|
|
54521
|
+
/**
 * Scan raw HTML with regular expressions and describe each `<form>` that has
 * at least one usable non-hidden field, a submit control, and no login-style
 * field.
 *
 * @param {string} html - HTML source to scan.
 * @returns {Array<{form_selector: string, submit_selector: string, fields: Array<object>}>}
 *   One entry per accepted form. Each field carries `name`, `type`,
 *   `selector`, `required`, and `options` (select controls with at least one
 *   explicit option value only).
 */
function detectSearchForms(html) {
  const results = [];
  const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
  let formMatch;
  let formIndex = 0;

  while ((formMatch = formRegex.exec(html)) !== null) {
    const formElAttrs = parseAttrs(formMatch[1]);
    const formBody = formMatch[2];

    // A fresh /g regex per form: exec() tracks its position in lastIndex.
    // The lookahead stops tags such as <input-group> being read as <input>.
    const fieldRegex = /<(input|select|textarea)(?=[\s/>])([^>]*)\/?>/gi;
    let fieldMatch;
    const fields = [];
    const seenNames = new Set();
    let hasLoginField = false;

    while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
      const tagName = fieldMatch[1].toLowerCase();
      const fieldAttrs = parseAttrs(fieldMatch[2]);
      const name = fieldAttrs.name ?? "";
      const typeAttr = fieldAttrs.type;

      // Compare the type case-insensitively, matching mapInputType, so that
      // TYPE="Password" also marks the form as a login form.
      if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || (typeAttr ?? "").toLowerCase() === "password") {
        hasLoginField = true;
      }

      const mappedType = mapInputType(typeAttr, tagName);
      if (!mappedType) {
        continue;
      }
      // Unnamed controls are only kept when they are text-like.
      if (!name && mappedType !== "text") {
        continue;
      }
      // Radio groups legitimately repeat a name; everything else is deduped.
      if (seenNames.has(name) && mappedType !== "radio") {
        continue;
      }
      if (name) {
        seenNames.add(name);
      }

      let options;
      if (tagName === "select") {
        // Only look at this select's own content (from the end of its start
        // tag up to its closing tag). Scanning the whole form body would give
        // every select the options of every other select in the form.
        const rest = formBody.slice(fieldRegex.lastIndex);
        const closeAt = rest.search(/<\/select\s*>/i);
        const selectBody = closeAt === -1 ? rest : rest.slice(0, closeAt);
        const values = [];
        for (const opt of selectBody.matchAll(/<option(?=[\s/>])([^>]*)>/gi)) {
          // Options without an explicit value attribute are skipped.
          const value = parseAttrs(opt[1]).value;
          if (value !== undefined) {
            values.push(value);
          }
        }
        if (values.length > 0) {
          options = values;
        }
      }

      fields.push({
        name: name || `unnamed_${fields.length}`,
        type: mappedType,
        selector: inputSelectorFromElement(fieldAttrs, tagName),
        ...options ? { options } : {},
        required: fieldAttrs.required !== undefined
      });
    }

    // Accept type="submit", type='submit' and type=submit alike.
    let submitSelector = "";
    if (/<button[^>]*type\s*=\s*["']?submit\b/i.test(formBody)) {
      submitSelector = "button[type=submit]";
    } else if (/<input[^>]*type\s*=\s*["']?submit\b/i.test(formBody)) {
      submitSelector = 'input[type="submit"]';
    } else if (/<button/i.test(formBody)) {
      submitSelector = "button";
    }

    const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
    // NOTE(review): this condition previously ended with
    // `&& (hasSearchLikeField || nonHiddenFields.length >= 1)`, which is always
    // true once `nonHiddenFields.length > 0` holds, so SEARCH_FIELD_NAMES never
    // influenced the outcome. The tautology is dropped here without changing
    // which forms are accepted; reintroduce a stricter gate if one was intended.
    if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector) {
      results.push({
        form_selector: formSelectorFromElement(formElAttrs, formIndex),
        submit_selector: submitSelector,
        fields
      });
    }
    formIndex++;
  }

  return results;
}
|
|
54596
|
+
// Lookup tables for the search-form detector. They are declared here and
// filled in by init_search_forms(); `__esm` is the bundler's module wrapper
// (presumably it runs the callback once on first call - confirm against its
// definition).
var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
var init_search_forms = __esm(() => {
  // Field names (compared lower-cased) that look like a search/filter input.
  SEARCH_FIELD_NAMES = new Set([
    "q", "query", "search",
    "keyword", "keywords",
    "term", "terms",
    "find", "lookup", "filter",
    "s", "text", "input"
  ]);
  // Field names (compared lower-cased) that mark a form as a login form,
  // which excludes it from the detected search forms.
  LOGIN_FIELD_NAMES = new Set([
    "password", "passwd", "pass", "pwd", "confirm_password",
    "username", "email", "login", "user"
  ]);
  // Input `type` values consulted by mapInputType.
  SUPPORTED_INPUT_TYPES = new Set([
    "text", "search", "hidden", "date", "number", "tel", "email"
  ]);
});
|
|
54634
|
+
|
|
54390
54635
|
// node_modules/.bun/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Event.js
|
|
54391
54636
|
var require_Event = __commonJS((exports, module) => {
|
|
54392
54637
|
module.exports = Event2;
|
|
@@ -71038,20 +71283,24 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
|
|
|
71038
71283
|
const hits = intentTokens.filter((tok) => haystack.includes(tok));
|
|
71039
71284
|
const hitRate = hits.length / intentTokens.length;
|
|
71040
71285
|
if (hitRate < 0.34) {
|
|
71041
|
-
|
|
71042
|
-
|
|
71043
|
-
|
|
71044
|
-
|
|
71045
|
-
|
|
71046
|
-
|
|
71047
|
-
|
|
71048
|
-
|
|
71049
|
-
|
|
71050
|
-
|
|
71286
|
+
const isCollection = isListLikeIntent(intent) && linksFormEntityCollection(Array.from(html.matchAll(/href\s*=\s*["']([^"']+)["']/gi), (m) => m[1]));
|
|
71287
|
+
if (!isCollection) {
|
|
71288
|
+
return {
|
|
71289
|
+
rejected: true,
|
|
71290
|
+
reason: "intent_mismatch",
|
|
71291
|
+
evidence: {
|
|
71292
|
+
intent_tokens: intentTokens,
|
|
71293
|
+
response_token_hits: hits,
|
|
71294
|
+
response_token_hit_rate: hitRate,
|
|
71295
|
+
html_bytes: html.length
|
|
71296
|
+
}
|
|
71297
|
+
};
|
|
71298
|
+
}
|
|
71051
71299
|
}
|
|
71052
71300
|
}
|
|
71053
71301
|
}
|
|
71054
71302
|
const { url_template, input_params, path_params, query } = extractHtmlHoles(url);
|
|
71303
|
+
const routing_candidates = buildSearchRouteCandidates(html, url, intent);
|
|
71055
71304
|
return {
|
|
71056
71305
|
rejected: false,
|
|
71057
71306
|
title,
|
|
@@ -71065,12 +71314,40 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
|
|
|
71065
71314
|
text_excerpt: bodyText.slice(0, MARKDOWN_BUDGET),
|
|
71066
71315
|
markdown: htmlToMarkdownSafe(html, bodyText),
|
|
71067
71316
|
tables: extractTables(html),
|
|
71317
|
+
...routing_candidates.length > 0 ? { routing_candidates } : {},
|
|
71068
71318
|
extraction: {
|
|
71069
71319
|
source: "direct-document",
|
|
71070
71320
|
rejected: false
|
|
71071
71321
|
}
|
|
71072
71322
|
};
|
|
71073
71323
|
}
|
|
71324
|
+
/**
 * Reduce a free-text intent to a space-separated query string.
 *
 * Words are lower-cased alphanumeric runs of three or more characters that
 * start with a letter. Words found in QUERY_STOPWORDS, or equal to a label of
 * the URL's hostname (split on "." and "-"), are dropped; duplicates are
 * removed while keeping first-seen order.
 *
 * @param {string} intent - Free-text description of what the caller wants.
 * @param {string} url - URL whose hostname labels are excluded from the query.
 * @returns {string} The remaining words joined by single spaces (may be "").
 */
function intentQueryTerm(intent, url) {
  let hostTokens;
  try {
    hostTokens = new Set(new URL(url).hostname.toLowerCase().split(/[.-]/));
  } catch {
    // Unparseable URL: nothing to exclude on the hostname side.
    hostTokens = new Set();
  }

  const words = intent.toLowerCase().match(/[a-z][a-z0-9]{2,}/g) ?? [];
  const kept = new Set();
  for (const word of words) {
    if (QUERY_STOPWORDS.has(word) || hostTokens.has(word)) {
      continue;
    }
    kept.add(word);
  }
  return Array.from(kept).join(" ").trim();
}
|
|
71332
|
+
/**
 * Propose up to three search-route URLs for a list-like intent.
 *
 * Returns an empty array when there is no intent, the intent is not
 * list-like, no query term survives filtering, or `url` cannot be parsed.
 *
 * @param {string} html - Page HTML passed to deriveSearchRouteTemplates.
 * @param {string} url - URL of the page; its origin is used to build candidates.
 * @param {string} intent - Free-text intent used to derive the query term.
 * @returns {Array<{url: string, template: *, query: string, samples: *}>}
 *   Candidate routes, at most three.
 */
function buildSearchRouteCandidates(html, url, intent) {
  const listLike = Boolean(intent) && isListLikeIntent(intent);
  if (!listLike) {
    return [];
  }

  const queryTerm = intentQueryTerm(intent, url);
  if (!queryTerm) {
    return [];
  }

  let origin;
  try {
    origin = new URL(url).origin;
  } catch {
    return [];
  }

  const candidates = [];
  for (const t of deriveSearchRouteTemplates(html).slice(0, 3)) {
    candidates.push({
      url: fillSearchRoute(origin, t.template, queryTerm),
      template: t.template,
      query: queryTerm,
      samples: t.samples
    });
  }
  return candidates;
}
|
|
71074
71351
|
async function fetchDirectDocument(url) {
|
|
71075
71352
|
if (!isDirectDocumentEligibleUrl(url))
|
|
71076
71353
|
return null;
|
|
@@ -71258,10 +71535,12 @@ function cellText(html) {
|
|
|
71258
71535
|
/**
 * Decode the handful of HTML character references that matter for plain text:
 * non-breaking space, ampersand, angle brackets, double quote and apostrophe.
 *
 * Decoding happens in a single pass, so already-escaped text such as
 * "&amp;lt;" becomes "&lt;" rather than being decoded twice into "<".
 *
 * @param {string} input - Text that may contain HTML character references.
 * @returns {string} The text with the supported references replaced.
 */
function decodeHtmlEntityText(input) {
  // NOTE(review): the apostrophe is accepted as &#39;, &#x27; or &apos; -
  // confirm which spellings upstream producers actually emit.
  const entityText = {
    nbsp: " ",
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    "#39": "'",
    "#x27": "'"
  };
  return input.replace(/&(nbsp|amp|lt|gt|quot|apos|#39|#x27);/g, (_, name) => entityText[name]);
}
|
|
71261
|
-
var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, buildBloombergDirectDocumentResult;
|
|
71538
|
+
var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, QUERY_STOPWORDS, buildBloombergDirectDocumentResult;
|
|
71262
71539
|
var init_direct_document = __esm(() => {
|
|
71263
71540
|
init_curl_impersonate_fallback();
|
|
71264
71541
|
init_proxy_fetch();
|
|
71542
|
+
init_cardinality();
|
|
71543
|
+
init_search_forms();
|
|
71265
71544
|
HTML_RE = /text\/html|application\/xhtml\+xml/i;
|
|
71266
71545
|
CHALLENGE_RE = /\b(access denied|are you a robot|captcha|just a moment|pardon our interruption|robot check|unusual traffic|verify you are human)\b/i;
|
|
71267
71546
|
INTERSTITIAL_RE = /\b(please wait for verification|just a moment|cf-mitigated|datadome|akamai bot|perimeterx|sign in to continue|log in to (?:continue|access)|javascript is not available)\b/i;
|
|
@@ -71344,6 +71623,7 @@ var init_direct_document = __esm(() => {
|
|
|
71344
71623
|
"look"
|
|
71345
71624
|
]);
|
|
71346
71625
|
MARKDOWN_BUDGET = Math.max(1000, Number(process.env.UNBROWSE_MARKDOWN_BUDGET ?? "12000") || 12000);
|
|
71626
|
+
QUERY_STOPWORDS = new Set(("resolve unbrowse execute run walk go fetch open view want need please " + "find search browse list lookup discover show get me a an the on of for in to " + "with and or all my your this that some good best top new latest cheap near").split(" "));
|
|
71347
71627
|
buildBloombergDirectDocumentResult = buildDirectDocumentResult;
|
|
71348
71628
|
});
|
|
71349
71629
|
|
|
@@ -120056,181 +120336,6 @@ function clampToFloor(score, demotion, floor) {
|
|
|
120056
120336
|
}
|
|
120057
120337
|
var HARD_NEGATIVE_FLOOR = -2000, WEAK_NEGATIVE_FLOOR = -400, PAGE_ARTIFACT_DEMOTION = 800, EMPTY_ENTITY_BAG_DEMOTION = 650, EMPTY_ENTITY_BAG_FLOOR = -700;
|
|
120058
120338
|
|
|
120059
|
-
// .tmp-runtime-src/execution/search-forms.ts
|
|
120060
|
-
var exports_search_forms = {};
|
|
120061
|
-
__export(exports_search_forms, {
|
|
120062
|
-
isStructuredSearchForm: () => isStructuredSearchForm,
|
|
120063
|
-
detectSearchForms: () => detectSearchForms
|
|
120064
|
-
});
|
|
120065
|
-
function isStructuredSearchForm(spec) {
|
|
120066
|
-
return spec.fields.length > 0 && !!spec.submit_selector;
|
|
120067
|
-
}
|
|
120068
|
-
function formSelectorFromElement(attribs, index2) {
|
|
120069
|
-
const id = attribs.id;
|
|
120070
|
-
if (id)
|
|
120071
|
-
return `form#${id}`;
|
|
120072
|
-
const name = attribs.name;
|
|
120073
|
-
if (name)
|
|
120074
|
-
return `form[name="${name}"]`;
|
|
120075
|
-
const action2 = attribs.action;
|
|
120076
|
-
if (action2)
|
|
120077
|
-
return `form[action="${action2}"]`;
|
|
120078
|
-
return `form:nth-of-type(${index2 + 1})`;
|
|
120079
|
-
}
|
|
120080
|
-
function inputSelectorFromElement(attribs, tagName) {
|
|
120081
|
-
const id = attribs.id;
|
|
120082
|
-
if (id)
|
|
120083
|
-
return `#${id}`;
|
|
120084
|
-
const name = attribs.name;
|
|
120085
|
-
if (name)
|
|
120086
|
-
return `${tagName}[name="${name}"]`;
|
|
120087
|
-
return tagName;
|
|
120088
|
-
}
|
|
120089
|
-
function mapInputType(typeAttr, tagName) {
|
|
120090
|
-
if (tagName === "select")
|
|
120091
|
-
return "select";
|
|
120092
|
-
if (tagName === "textarea")
|
|
120093
|
-
return "text";
|
|
120094
|
-
const t = (typeAttr ?? "text").toLowerCase();
|
|
120095
|
-
if (t === "radio")
|
|
120096
|
-
return "radio";
|
|
120097
|
-
if (t === "checkbox")
|
|
120098
|
-
return "checkbox";
|
|
120099
|
-
if (t === "date")
|
|
120100
|
-
return "date";
|
|
120101
|
-
if (t === "hidden")
|
|
120102
|
-
return "hidden";
|
|
120103
|
-
if (t === "submit" || t === "button" || t === "image" || t === "reset")
|
|
120104
|
-
return null;
|
|
120105
|
-
if (t === "password" || t === "file")
|
|
120106
|
-
return null;
|
|
120107
|
-
if (SUPPORTED_INPUT_TYPES.has(t))
|
|
120108
|
-
return "text";
|
|
120109
|
-
return "text";
|
|
120110
|
-
}
|
|
120111
|
-
function parseAttrs(attrStr) {
|
|
120112
|
-
const attrs = {};
|
|
120113
|
-
const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
|
|
120114
|
-
let m;
|
|
120115
|
-
while ((m = attrRegex.exec(attrStr)) !== null) {
|
|
120116
|
-
attrs[m[1]] = m[2] ?? m[3] ?? m[4] ?? "";
|
|
120117
|
-
}
|
|
120118
|
-
return attrs;
|
|
120119
|
-
}
|
|
120120
|
-
function detectSearchForms(html3) {
|
|
120121
|
-
const results = [];
|
|
120122
|
-
const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
|
|
120123
|
-
let formMatch;
|
|
120124
|
-
let formIndex = 0;
|
|
120125
|
-
while ((formMatch = formRegex.exec(html3)) !== null) {
|
|
120126
|
-
const formAttrs = formMatch[1];
|
|
120127
|
-
const formBody = formMatch[2];
|
|
120128
|
-
const formElAttrs = parseAttrs(formAttrs);
|
|
120129
|
-
const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
|
|
120130
|
-
let fieldMatch;
|
|
120131
|
-
const fields = [];
|
|
120132
|
-
const seenNames = new Set;
|
|
120133
|
-
let hasLoginField = false;
|
|
120134
|
-
let hasSearchLikeField = false;
|
|
120135
|
-
while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
|
|
120136
|
-
const tagName = fieldMatch[1].toLowerCase();
|
|
120137
|
-
const fieldAttrs = parseAttrs(fieldMatch[2]);
|
|
120138
|
-
const name = fieldAttrs.name ?? "";
|
|
120139
|
-
const typeAttr = fieldAttrs.type;
|
|
120140
|
-
if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || typeAttr === "password") {
|
|
120141
|
-
hasLoginField = true;
|
|
120142
|
-
}
|
|
120143
|
-
if (SEARCH_FIELD_NAMES.has(name.toLowerCase())) {
|
|
120144
|
-
hasSearchLikeField = true;
|
|
120145
|
-
}
|
|
120146
|
-
const mappedType = mapInputType(typeAttr, tagName);
|
|
120147
|
-
if (!mappedType)
|
|
120148
|
-
continue;
|
|
120149
|
-
if (!name && mappedType !== "text")
|
|
120150
|
-
continue;
|
|
120151
|
-
if (seenNames.has(name) && mappedType !== "radio")
|
|
120152
|
-
continue;
|
|
120153
|
-
if (name)
|
|
120154
|
-
seenNames.add(name);
|
|
120155
|
-
let options;
|
|
120156
|
-
if (tagName === "select") {
|
|
120157
|
-
const optRegex = /<option[^>]*value="([^"]*)"[^>]*>/gi;
|
|
120158
|
-
let optMatch;
|
|
120159
|
-
options = [];
|
|
120160
|
-
while ((optMatch = optRegex.exec(formBody)) !== null) {
|
|
120161
|
-
options.push(optMatch[1]);
|
|
120162
|
-
}
|
|
120163
|
-
if (options.length === 0)
|
|
120164
|
-
options = undefined;
|
|
120165
|
-
}
|
|
120166
|
-
fields.push({
|
|
120167
|
-
name: name || `unnamed_${fields.length}`,
|
|
120168
|
-
type: mappedType,
|
|
120169
|
-
selector: inputSelectorFromElement(fieldAttrs, tagName),
|
|
120170
|
-
...options ? { options } : {},
|
|
120171
|
-
required: fieldAttrs.required !== undefined
|
|
120172
|
-
});
|
|
120173
|
-
}
|
|
120174
|
-
let submitSelector = "";
|
|
120175
|
-
if (/<button[^>]*type\s*=\s*"submit"/i.test(formBody)) {
|
|
120176
|
-
submitSelector = "button[type=submit]";
|
|
120177
|
-
} else if (/<input[^>]*type\s*=\s*"submit"/i.test(formBody)) {
|
|
120178
|
-
submitSelector = 'input[type="submit"]';
|
|
120179
|
-
} else if (/<button/i.test(formBody)) {
|
|
120180
|
-
submitSelector = "button";
|
|
120181
|
-
}
|
|
120182
|
-
const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
|
|
120183
|
-
if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
|
|
120184
|
-
const formSelector = formSelectorFromElement(formElAttrs, formIndex);
|
|
120185
|
-
results.push({
|
|
120186
|
-
form_selector: formSelector,
|
|
120187
|
-
submit_selector: submitSelector,
|
|
120188
|
-
fields
|
|
120189
|
-
});
|
|
120190
|
-
}
|
|
120191
|
-
formIndex++;
|
|
120192
|
-
}
|
|
120193
|
-
return results;
|
|
120194
|
-
}
|
|
120195
|
-
var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
|
|
120196
|
-
var init_search_forms = __esm(() => {
|
|
120197
|
-
SEARCH_FIELD_NAMES = new Set([
|
|
120198
|
-
"q",
|
|
120199
|
-
"query",
|
|
120200
|
-
"search",
|
|
120201
|
-
"keyword",
|
|
120202
|
-
"keywords",
|
|
120203
|
-
"term",
|
|
120204
|
-
"terms",
|
|
120205
|
-
"find",
|
|
120206
|
-
"lookup",
|
|
120207
|
-
"filter",
|
|
120208
|
-
"s",
|
|
120209
|
-
"text",
|
|
120210
|
-
"input"
|
|
120211
|
-
]);
|
|
120212
|
-
LOGIN_FIELD_NAMES = new Set([
|
|
120213
|
-
"password",
|
|
120214
|
-
"passwd",
|
|
120215
|
-
"pass",
|
|
120216
|
-
"pwd",
|
|
120217
|
-
"confirm_password",
|
|
120218
|
-
"username",
|
|
120219
|
-
"email",
|
|
120220
|
-
"login",
|
|
120221
|
-
"user"
|
|
120222
|
-
]);
|
|
120223
|
-
SUPPORTED_INPUT_TYPES = new Set([
|
|
120224
|
-
"text",
|
|
120225
|
-
"search",
|
|
120226
|
-
"hidden",
|
|
120227
|
-
"date",
|
|
120228
|
-
"number",
|
|
120229
|
-
"tel",
|
|
120230
|
-
"email"
|
|
120231
|
-
]);
|
|
120232
|
-
});
|
|
120233
|
-
|
|
120234
120339
|
// .tmp-runtime-src/state/stateless.ts
|
|
120235
120340
|
function isStateless() {
|
|
120236
120341
|
const v = process.env.UNBROWSE_STATELESS;
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"repo_url": "https://github.com/justrach/kuri.git",
|
|
3
3
|
"branch": "adding-extensions",
|
|
4
4
|
"source_sha": "149881254046a20778f642b69f20f0c6468f6fb4",
|
|
5
|
-
"built_at": "2026-06-
|
|
5
|
+
"built_at": "2026-06-18T05:53:02.585Z",
|
|
6
6
|
"binaries": {
|
|
7
7
|
"darwin-arm64": {
|
|
8
8
|
"zig_target": "aarch64-macos",
|
|
@@ -21,11 +21,11 @@
|
|
|
21
21
|
},
|
|
22
22
|
"linux-x64": {
|
|
23
23
|
"zig_target": "x86_64-linux",
|
|
24
|
-
"sha256": "
|
|
24
|
+
"sha256": "f39955d73d86150fba2a4bec6393e7745feb42f5152870b6c27fd68a5cff3a6e"
|
|
25
25
|
},
|
|
26
26
|
"win-x64": {
|
|
27
27
|
"zig_target": "x86_64-windows-gnu",
|
|
28
|
-
"sha256": "
|
|
28
|
+
"sha256": "376a34f508ea6a4e140150f9f6ddc00519f8bb0894ee2dd7a60bc0e7613d89b0",
|
|
29
29
|
"source": "pre-staged"
|
|
30
30
|
}
|
|
31
31
|
},
|
|
@@ -33,22 +33,22 @@
|
|
|
33
33
|
"darwin-arm64": {
|
|
34
34
|
"zig_target": "aarch64-macos",
|
|
35
35
|
"lib": "libkuri_ffi.dylib",
|
|
36
|
-
"sha256": "
|
|
36
|
+
"sha256": "2ca1be4d477f28c4a4ab1dd993cfe6766b9f6858c42befda3f503a3e2940bf6f"
|
|
37
37
|
},
|
|
38
38
|
"darwin-x64": {
|
|
39
39
|
"zig_target": "x86_64-macos",
|
|
40
40
|
"lib": "libkuri_ffi.dylib",
|
|
41
|
-
"sha256": "
|
|
41
|
+
"sha256": "80e27865e521b4bc6a79dfcb4fd481535d70e03226125ce10d9625ba02320f47"
|
|
42
42
|
},
|
|
43
43
|
"linux-arm64": {
|
|
44
44
|
"zig_target": "aarch64-linux",
|
|
45
45
|
"lib": "libkuri_ffi.so",
|
|
46
|
-
"sha256": "
|
|
46
|
+
"sha256": "6fa04fc6b505212e5ae9cfdc0d21eb2f9b2eb97dc41850e02ba3f4112e252e9d"
|
|
47
47
|
},
|
|
48
48
|
"linux-x64": {
|
|
49
49
|
"zig_target": "x86_64-linux",
|
|
50
50
|
"lib": "libkuri_ffi.so",
|
|
51
|
-
"sha256": "
|
|
51
|
+
"sha256": "fa520df6cca6eab9260bd43ed7a60d665c746645f12f6fb4e3ecddbb026bdb05"
|
|
52
52
|
}
|
|
53
53
|
}
|
|
54
54
|
}
|
|
Binary file
|