unbrowse 9.6.2 → 9.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "9.6.2",
3
+ "version": "9.8.0",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "git+https://github.com/unbrowse-ai/unbrowse.git"
package/runtime/cli.js CHANGED
@@ -2350,7 +2350,7 @@ var init_telemetry = __esm(() => {
2350
2350
  });
2351
2351
 
2352
2352
  // .tmp-runtime-src/build-info.generated.ts
2353
- var BUILD_RELEASE_VERSION = "9.6.2", BUILD_GIT_SHA = "d2d14a6629a0", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS42LjIiLCJnaXRfc2hhIjoiZDJkMTRhNjYyOWEwIiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUBkMmQxNGE2NjI5YTAiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA0OjE3OjAzLjg1M1oifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "OefvyW2iLVVPQ-0HMg9Mz-bciCmP8LE5u7fVEss_98E", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
2353
+ var BUILD_RELEASE_VERSION = "9.8.0", BUILD_GIT_SHA = "255142bb4c25", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS44LjAiLCJnaXRfc2hhIjoiMjU1MTQyYmI0YzI1IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUAyNTUxNDJiYjRjMjUiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA2OjA4OjA2LjY3MVoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "Tw3ScHlFYGaEtPwKLhcPI_lgQUgjAZmhWKSi4fDFMw4", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
2354
2354
 
2355
2355
  // .tmp-runtime-src/version.ts
2356
2356
  import { createHash as createHash7 } from "crypto";
@@ -46122,6 +46122,42 @@ function urlPathLooksListLike2(contextUrl) {
46122
46122
  return false;
46123
46123
  }
46124
46124
  }
46125
+ function entityPointerTemplate(href) {
46126
+ let path7 = href;
46127
+ try {
46128
+ path7 = new URL(href, "https://_").pathname;
46129
+ } catch {
46130
+ path7 = href.split("?")[0];
46131
+ }
46132
+ const segs = path7.split("/").filter(Boolean);
46133
+ if (segs.length === 0)
46134
+ return null;
46135
+ const shape = [];
46136
+ let hasId = false;
46137
+ for (const s of segs) {
46138
+ const low = s.toLowerCase();
46139
+ if (/\d{3,}/.test(low) || low.length > 30 || /^[0-9a-f-]{8,}$/.test(low) || /-\d{2,}$/.test(low)) {
46140
+ shape.push("{id}");
46141
+ hasId = true;
46142
+ } else {
46143
+ shape.push(low);
46144
+ }
46145
+ }
46146
+ return hasId ? shape.slice(0, 3).join("/") : null;
46147
+ }
46148
+ function linksFormEntityCollection(hrefs, min = 4) {
46149
+ const groups = new Map;
46150
+ for (const href of hrefs) {
46151
+ const t = entityPointerTemplate(href);
46152
+ if (!t)
46153
+ continue;
46154
+ const n = (groups.get(t) ?? 0) + 1;
46155
+ if (n >= min)
46156
+ return true;
46157
+ groups.set(t, n);
46158
+ }
46159
+ return false;
46160
+ }
46125
46161
  function cardinalityMatches2(intent, subject, opts) {
46126
46162
  const wantsMany = isListLikeIntent2(intent) || urlPathLooksListLike2(opts?.contextUrl);
46127
46163
  if (!wantsMany)
@@ -46694,6 +46730,7 @@ __export(exports_capture, {
46694
46730
  tagRequestProvenance: () => tagRequestProvenance,
46695
46731
  shutdownAllBrowsers: () => shutdownAllBrowsers,
46696
46732
  shouldStopHydrationWait: () => shouldStopHydrationWait,
46733
+ shouldScrollStimulate: () => shouldScrollStimulate,
46697
46734
  selectPerformanceReplayCandidates: () => selectPerformanceReplayCandidates,
46698
46735
  registerDocumentStartScript: () => registerDocumentStartScript,
46699
46736
  navigatePageForCapture: () => navigatePageForCapture,
@@ -46956,6 +46993,9 @@ function extractRouteHint(url) {
46956
46993
  } catch {}
46957
46994
  return null;
46958
46995
  }
46996
+ function shouldScrollStimulate(captureUrl, intent) {
46997
+ return isListLikeIntent2(intent) || urlPathLooksListLike2(captureUrl);
46998
+ }
46959
46999
  function deriveIntentHints(captureUrl, intent) {
46960
47000
  const derivedHints = new Set;
46961
47001
  if (captureUrl) {
@@ -47583,8 +47623,7 @@ async function waitForContentReady(tabId, captureUrl, intent, responseBodies) {
47583
47623
  log("capture", `intent-aware wait: already captured API matching one of [${[...derivedHints].join(", ")}], skipping`);
47584
47624
  }
47585
47625
  }
47586
- const lowerIntent = intent?.toLowerCase() ?? "";
47587
- if (captureUrl && responseBodies && (/search|explore|trending|tabs|discover/i.test(captureUrl) || /\b(person|people|profile|profiles|user|users|member|members|company|companies|organization|organisations|business|post|posts|tweet|tweets|status|statuses)\b/.test(lowerIntent))) {
47626
+ if (captureUrl && responseBodies && shouldScrollStimulate(captureUrl, intent)) {
47588
47627
  try {
47589
47628
  const before = responseBodies.size;
47590
47629
  await evaluate(tabId, "window.scrollTo(0, Math.max(window.innerHeight, Math.min(document.body.scrollHeight, window.innerHeight * 2)))");
@@ -48658,6 +48697,7 @@ var init_capture = __esm(async () => {
48658
48697
  init_domain();
48659
48698
  init_logger();
48660
48699
  init_header_classify();
48700
+ init_cardinality2();
48661
48701
  init_browser_access();
48662
48702
  await init_vault();
48663
48703
  waitQueue = [];
@@ -56326,6 +56366,215 @@ var init_curl_impersonate_fallback = __esm(() => {
56326
56366
  };
56327
56367
  });
56328
56368
 
56369
+ // .tmp-runtime-src/execution/search-forms.ts
56370
+ var exports_search_forms = {};
56371
+ __export(exports_search_forms, {
56372
+ isStructuredSearchForm: () => isStructuredSearchForm,
56373
+ fillSearchRoute: () => fillSearchRoute,
56374
+ detectSearchForms: () => detectSearchForms,
56375
+ deriveSearchRouteTemplates: () => deriveSearchRouteTemplates
56376
+ });
56377
+ function deriveSearchRouteTemplates(html, minDistinct = 4) {
56378
+ const hrefs = new Set;
56379
+ for (const m of html.matchAll(/href\s*=\s*["'](\/[^"'?#\s]+)["']/gi))
56380
+ hrefs.add(m[1]);
56381
+ const groups = new Map;
56382
+ for (const h of hrefs) {
56383
+ const segs = h.split("/").filter(Boolean);
56384
+ if (segs.length < 1 || segs.length > 4)
56385
+ continue;
56386
+ for (let i = 0;i < segs.length; i++) {
56387
+ const val = segs[i];
56388
+ if (!/^[a-z][a-z0-9-]{1,40}$/i.test(val) || /\d{3,}/.test(val) || /\.[a-z0-9]{1,5}$/i.test(val))
56389
+ continue;
56390
+ const shape = segs.map((s, j) => j === i ? "{query}" : s).join("/");
56391
+ const trailing = h.endsWith("/") ? "/" : "";
56392
+ const key = `/${shape}${trailing}`;
56393
+ if (!groups.has(key))
56394
+ groups.set(key, new Set);
56395
+ groups.get(key).add(val.toLowerCase());
56396
+ }
56397
+ }
56398
+ const out = [];
56399
+ for (const [template, vals] of groups) {
56400
+ if (vals.size >= minDistinct)
56401
+ out.push({ template, samples: [...vals].slice(0, 5), count: vals.size });
56402
+ }
56403
+ return out.sort((a, b) => b.count - a.count);
56404
+ }
56405
+ function fillSearchRoute(origin, template, query) {
56406
+ const slug = encodeURIComponent(query.trim().toLowerCase());
56407
+ return origin.replace(/\/+$/, "") + template.replace("{query}", slug);
56408
+ }
56409
+ function isStructuredSearchForm(spec) {
56410
+ return spec.fields.length > 0 && !!spec.submit_selector;
56411
+ }
56412
+ function formSelectorFromElement(attribs, index) {
56413
+ const id = attribs.id;
56414
+ if (id)
56415
+ return `form#${id}`;
56416
+ const name = attribs.name;
56417
+ if (name)
56418
+ return `form[name="${name}"]`;
56419
+ const action2 = attribs.action;
56420
+ if (action2)
56421
+ return `form[action="${action2}"]`;
56422
+ return `form:nth-of-type(${index + 1})`;
56423
+ }
56424
+ function inputSelectorFromElement(attribs, tagName) {
56425
+ const id = attribs.id;
56426
+ if (id)
56427
+ return `#${id}`;
56428
+ const name = attribs.name;
56429
+ if (name)
56430
+ return `${tagName}[name="${name}"]`;
56431
+ return tagName;
56432
+ }
56433
+ function mapInputType(typeAttr, tagName) {
56434
+ if (tagName === "select")
56435
+ return "select";
56436
+ if (tagName === "textarea")
56437
+ return "text";
56438
+ const t = (typeAttr ?? "text").toLowerCase();
56439
+ if (t === "radio")
56440
+ return "radio";
56441
+ if (t === "checkbox")
56442
+ return "checkbox";
56443
+ if (t === "date")
56444
+ return "date";
56445
+ if (t === "hidden")
56446
+ return "hidden";
56447
+ if (t === "submit" || t === "button" || t === "image" || t === "reset")
56448
+ return null;
56449
+ if (t === "password" || t === "file")
56450
+ return null;
56451
+ if (SUPPORTED_INPUT_TYPES.has(t))
56452
+ return "text";
56453
+ return "text";
56454
+ }
56455
+ function parseAttrs(attrStr) {
56456
+ const attrs = {};
56457
+ const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
56458
+ let m;
56459
+ while ((m = attrRegex.exec(attrStr)) !== null) {
56460
+ attrs[m[1]] = m[2] ?? m[3] ?? m[4] ?? "";
56461
+ }
56462
+ return attrs;
56463
+ }
56464
+ function detectSearchForms(html) {
56465
+ const results = [];
56466
+ const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
56467
+ let formMatch;
56468
+ let formIndex = 0;
56469
+ while ((formMatch = formRegex.exec(html)) !== null) {
56470
+ const formAttrs = formMatch[1];
56471
+ const formBody = formMatch[2];
56472
+ const formElAttrs = parseAttrs(formAttrs);
56473
+ const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
56474
+ let fieldMatch;
56475
+ const fields = [];
56476
+ const seenNames = new Set;
56477
+ let hasLoginField = false;
56478
+ let hasSearchLikeField = false;
56479
+ while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
56480
+ const tagName = fieldMatch[1].toLowerCase();
56481
+ const fieldAttrs = parseAttrs(fieldMatch[2]);
56482
+ const name = fieldAttrs.name ?? "";
56483
+ const typeAttr = fieldAttrs.type;
56484
+ if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || typeAttr === "password") {
56485
+ hasLoginField = true;
56486
+ }
56487
+ if (SEARCH_FIELD_NAMES.has(name.toLowerCase())) {
56488
+ hasSearchLikeField = true;
56489
+ }
56490
+ const mappedType = mapInputType(typeAttr, tagName);
56491
+ if (!mappedType)
56492
+ continue;
56493
+ if (!name && mappedType !== "text")
56494
+ continue;
56495
+ if (seenNames.has(name) && mappedType !== "radio")
56496
+ continue;
56497
+ if (name)
56498
+ seenNames.add(name);
56499
+ let options;
56500
+ if (tagName === "select") {
56501
+ const optRegex = /<option[^>]*value="([^"]*)"[^>]*>/gi;
56502
+ let optMatch;
56503
+ options = [];
56504
+ while ((optMatch = optRegex.exec(formBody)) !== null) {
56505
+ options.push(optMatch[1]);
56506
+ }
56507
+ if (options.length === 0)
56508
+ options = undefined;
56509
+ }
56510
+ fields.push({
56511
+ name: name || `unnamed_${fields.length}`,
56512
+ type: mappedType,
56513
+ selector: inputSelectorFromElement(fieldAttrs, tagName),
56514
+ ...options ? { options } : {},
56515
+ required: fieldAttrs.required !== undefined
56516
+ });
56517
+ }
56518
+ let submitSelector = "";
56519
+ if (/<button[^>]*type\s*=\s*"submit"/i.test(formBody)) {
56520
+ submitSelector = "button[type=submit]";
56521
+ } else if (/<input[^>]*type\s*=\s*"submit"/i.test(formBody)) {
56522
+ submitSelector = 'input[type="submit"]';
56523
+ } else if (/<button/i.test(formBody)) {
56524
+ submitSelector = "button";
56525
+ }
56526
+ const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
56527
+ if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
56528
+ const formSelector = formSelectorFromElement(formElAttrs, formIndex);
56529
+ results.push({
56530
+ form_selector: formSelector,
56531
+ submit_selector: submitSelector,
56532
+ fields
56533
+ });
56534
+ }
56535
+ formIndex++;
56536
+ }
56537
+ return results;
56538
+ }
56539
+ var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
56540
+ var init_search_forms = __esm(() => {
56541
+ SEARCH_FIELD_NAMES = new Set([
56542
+ "q",
56543
+ "query",
56544
+ "search",
56545
+ "keyword",
56546
+ "keywords",
56547
+ "term",
56548
+ "terms",
56549
+ "find",
56550
+ "lookup",
56551
+ "filter",
56552
+ "s",
56553
+ "text",
56554
+ "input"
56555
+ ]);
56556
+ LOGIN_FIELD_NAMES = new Set([
56557
+ "password",
56558
+ "passwd",
56559
+ "pass",
56560
+ "pwd",
56561
+ "confirm_password",
56562
+ "username",
56563
+ "email",
56564
+ "login",
56565
+ "user"
56566
+ ]);
56567
+ SUPPORTED_INPUT_TYPES = new Set([
56568
+ "text",
56569
+ "search",
56570
+ "hidden",
56571
+ "date",
56572
+ "number",
56573
+ "tel",
56574
+ "email"
56575
+ ]);
56576
+ });
56577
+
56329
56578
  // node_modules/.bun/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Event.js
56330
56579
  var require_Event = __commonJS((exports, module) => {
56331
56580
  module.exports = Event2;
@@ -72977,20 +73226,24 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
72977
73226
  const hits = intentTokens.filter((tok) => haystack.includes(tok));
72978
73227
  const hitRate = hits.length / intentTokens.length;
72979
73228
  if (hitRate < 0.34) {
72980
- return {
72981
- rejected: true,
72982
- reason: "intent_mismatch",
72983
- evidence: {
72984
- intent_tokens: intentTokens,
72985
- response_token_hits: hits,
72986
- response_token_hit_rate: hitRate,
72987
- html_bytes: html.length
72988
- }
72989
- };
73229
+ const isCollection = isListLikeIntent2(intent) && linksFormEntityCollection(Array.from(html.matchAll(/href\s*=\s*["']([^"']+)["']/gi), (m) => m[1]));
73230
+ if (!isCollection) {
73231
+ return {
73232
+ rejected: true,
73233
+ reason: "intent_mismatch",
73234
+ evidence: {
73235
+ intent_tokens: intentTokens,
73236
+ response_token_hits: hits,
73237
+ response_token_hit_rate: hitRate,
73238
+ html_bytes: html.length
73239
+ }
73240
+ };
73241
+ }
72990
73242
  }
72991
73243
  }
72992
73244
  }
72993
73245
  const { url_template, input_params, path_params, query } = extractHtmlHoles(url);
73246
+ const routing_candidates = buildSearchRouteCandidates(html, url, intent);
72994
73247
  return {
72995
73248
  rejected: false,
72996
73249
  title,
@@ -73004,12 +73257,40 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
73004
73257
  text_excerpt: bodyText.slice(0, MARKDOWN_BUDGET),
73005
73258
  markdown: htmlToMarkdownSafe(html, bodyText),
73006
73259
  tables: extractTables(html),
73260
+ ...routing_candidates.length > 0 ? { routing_candidates } : {},
73007
73261
  extraction: {
73008
73262
  source: "direct-document",
73009
73263
  rejected: false
73010
73264
  }
73011
73265
  };
73012
73266
  }
73267
+ function intentQueryTerm(intent, url) {
73268
+ let domTokens = new Set;
73269
+ try {
73270
+ domTokens = new Set(new URL(url).hostname.toLowerCase().split(/[.-]/));
73271
+ } catch {}
73272
+ const toks = (intent.toLowerCase().match(/[a-z][a-z0-9]{2,}/g) ?? []).filter((t) => !QUERY_STOPWORDS.has(t) && !domTokens.has(t));
73273
+ return [...new Set(toks)].join(" ").trim();
73274
+ }
73275
+ function buildSearchRouteCandidates(html, url, intent) {
73276
+ if (!intent || !isListLikeIntent2(intent))
73277
+ return [];
73278
+ const queryTerm = intentQueryTerm(intent, url);
73279
+ if (!queryTerm)
73280
+ return [];
73281
+ let origin = "";
73282
+ try {
73283
+ origin = new URL(url).origin;
73284
+ } catch {
73285
+ return [];
73286
+ }
73287
+ return deriveSearchRouteTemplates(html).slice(0, 3).map((t) => ({
73288
+ url: fillSearchRoute(origin, t.template, queryTerm),
73289
+ template: t.template,
73290
+ query: queryTerm,
73291
+ samples: t.samples
73292
+ }));
73293
+ }
73013
73294
  async function fetchDirectDocument(url) {
73014
73295
  if (!isDirectDocumentEligibleUrl(url))
73015
73296
  return null;
@@ -73197,10 +73478,12 @@ function cellText(html) {
73197
73478
  function decodeHtmlEntityText(input) {
73198
73479
  return input.replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, '"').replace(/&#39;/g, "'");
73199
73480
  }
73200
- var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, buildBloombergDirectDocumentResult;
73481
+ var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, QUERY_STOPWORDS, buildBloombergDirectDocumentResult;
73201
73482
  var init_direct_document = __esm(() => {
73202
73483
  init_curl_impersonate_fallback();
73203
73484
  init_proxy_fetch();
73485
+ init_cardinality2();
73486
+ init_search_forms();
73204
73487
  HTML_RE = /text\/html|application\/xhtml\+xml/i;
73205
73488
  CHALLENGE_RE = /\b(access denied|are you a robot|captcha|just a moment|pardon our interruption|robot check|unusual traffic|verify you are human)\b/i;
73206
73489
  INTERSTITIAL_RE = /\b(please wait for verification|just a moment|cf-mitigated|datadome|akamai bot|perimeterx|sign in to continue|log in to (?:continue|access)|javascript is not available)\b/i;
@@ -73283,6 +73566,7 @@ var init_direct_document = __esm(() => {
73283
73566
  "look"
73284
73567
  ]);
73285
73568
  MARKDOWN_BUDGET = Math.max(1000, Number(process.env.UNBROWSE_MARKDOWN_BUDGET ?? "12000") || 12000);
73569
+ QUERY_STOPWORDS = new Set(("resolve unbrowse execute run walk go fetch open view want need please " + "find search browse list lookup discover show get me a an the on of for in to " + "with and or all my your this that some good best top new latest cheap near").split(" "));
73286
73570
  buildBloombergDirectDocumentResult = buildDirectDocumentResult;
73287
73571
  });
73288
73572
 
@@ -121955,181 +122239,6 @@ function clampToFloor(score, demotion, floor) {
121955
122239
  }
121956
122240
  var HARD_NEGATIVE_FLOOR = -2000, WEAK_NEGATIVE_FLOOR = -400, PAGE_ARTIFACT_DEMOTION = 800, EMPTY_ENTITY_BAG_DEMOTION = 650, EMPTY_ENTITY_BAG_FLOOR = -700;
121957
122241
 
121958
- // .tmp-runtime-src/execution/search-forms.ts
121959
- var exports_search_forms = {};
121960
- __export(exports_search_forms, {
121961
- isStructuredSearchForm: () => isStructuredSearchForm,
121962
- detectSearchForms: () => detectSearchForms
121963
- });
121964
- function isStructuredSearchForm(spec) {
121965
- return spec.fields.length > 0 && !!spec.submit_selector;
121966
- }
121967
- function formSelectorFromElement(attribs, index2) {
121968
- const id = attribs.id;
121969
- if (id)
121970
- return `form#${id}`;
121971
- const name = attribs.name;
121972
- if (name)
121973
- return `form[name="${name}"]`;
121974
- const action2 = attribs.action;
121975
- if (action2)
121976
- return `form[action="${action2}"]`;
121977
- return `form:nth-of-type(${index2 + 1})`;
121978
- }
121979
- function inputSelectorFromElement(attribs, tagName) {
121980
- const id = attribs.id;
121981
- if (id)
121982
- return `#${id}`;
121983
- const name = attribs.name;
121984
- if (name)
121985
- return `${tagName}[name="${name}"]`;
121986
- return tagName;
121987
- }
121988
- function mapInputType(typeAttr, tagName) {
121989
- if (tagName === "select")
121990
- return "select";
121991
- if (tagName === "textarea")
121992
- return "text";
121993
- const t = (typeAttr ?? "text").toLowerCase();
121994
- if (t === "radio")
121995
- return "radio";
121996
- if (t === "checkbox")
121997
- return "checkbox";
121998
- if (t === "date")
121999
- return "date";
122000
- if (t === "hidden")
122001
- return "hidden";
122002
- if (t === "submit" || t === "button" || t === "image" || t === "reset")
122003
- return null;
122004
- if (t === "password" || t === "file")
122005
- return null;
122006
- if (SUPPORTED_INPUT_TYPES.has(t))
122007
- return "text";
122008
- return "text";
122009
- }
122010
- function parseAttrs(attrStr) {
122011
- const attrs = {};
122012
- const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
122013
- let m;
122014
- while ((m = attrRegex.exec(attrStr)) !== null) {
122015
- attrs[m[1]] = m[2] ?? m[3] ?? m[4] ?? "";
122016
- }
122017
- return attrs;
122018
- }
122019
- function detectSearchForms(html3) {
122020
- const results = [];
122021
- const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
122022
- let formMatch;
122023
- let formIndex = 0;
122024
- while ((formMatch = formRegex.exec(html3)) !== null) {
122025
- const formAttrs = formMatch[1];
122026
- const formBody = formMatch[2];
122027
- const formElAttrs = parseAttrs(formAttrs);
122028
- const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
122029
- let fieldMatch;
122030
- const fields = [];
122031
- const seenNames = new Set;
122032
- let hasLoginField = false;
122033
- let hasSearchLikeField = false;
122034
- while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
122035
- const tagName = fieldMatch[1].toLowerCase();
122036
- const fieldAttrs = parseAttrs(fieldMatch[2]);
122037
- const name = fieldAttrs.name ?? "";
122038
- const typeAttr = fieldAttrs.type;
122039
- if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || typeAttr === "password") {
122040
- hasLoginField = true;
122041
- }
122042
- if (SEARCH_FIELD_NAMES.has(name.toLowerCase())) {
122043
- hasSearchLikeField = true;
122044
- }
122045
- const mappedType = mapInputType(typeAttr, tagName);
122046
- if (!mappedType)
122047
- continue;
122048
- if (!name && mappedType !== "text")
122049
- continue;
122050
- if (seenNames.has(name) && mappedType !== "radio")
122051
- continue;
122052
- if (name)
122053
- seenNames.add(name);
122054
- let options;
122055
- if (tagName === "select") {
122056
- const optRegex = /<option[^>]*value="([^"]*)"[^>]*>/gi;
122057
- let optMatch;
122058
- options = [];
122059
- while ((optMatch = optRegex.exec(formBody)) !== null) {
122060
- options.push(optMatch[1]);
122061
- }
122062
- if (options.length === 0)
122063
- options = undefined;
122064
- }
122065
- fields.push({
122066
- name: name || `unnamed_${fields.length}`,
122067
- type: mappedType,
122068
- selector: inputSelectorFromElement(fieldAttrs, tagName),
122069
- ...options ? { options } : {},
122070
- required: fieldAttrs.required !== undefined
122071
- });
122072
- }
122073
- let submitSelector = "";
122074
- if (/<button[^>]*type\s*=\s*"submit"/i.test(formBody)) {
122075
- submitSelector = "button[type=submit]";
122076
- } else if (/<input[^>]*type\s*=\s*"submit"/i.test(formBody)) {
122077
- submitSelector = 'input[type="submit"]';
122078
- } else if (/<button/i.test(formBody)) {
122079
- submitSelector = "button";
122080
- }
122081
- const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
122082
- if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
122083
- const formSelector = formSelectorFromElement(formElAttrs, formIndex);
122084
- results.push({
122085
- form_selector: formSelector,
122086
- submit_selector: submitSelector,
122087
- fields
122088
- });
122089
- }
122090
- formIndex++;
122091
- }
122092
- return results;
122093
- }
122094
- var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
122095
- var init_search_forms = __esm(() => {
122096
- SEARCH_FIELD_NAMES = new Set([
122097
- "q",
122098
- "query",
122099
- "search",
122100
- "keyword",
122101
- "keywords",
122102
- "term",
122103
- "terms",
122104
- "find",
122105
- "lookup",
122106
- "filter",
122107
- "s",
122108
- "text",
122109
- "input"
122110
- ]);
122111
- LOGIN_FIELD_NAMES = new Set([
122112
- "password",
122113
- "passwd",
122114
- "pass",
122115
- "pwd",
122116
- "confirm_password",
122117
- "username",
122118
- "email",
122119
- "login",
122120
- "user"
122121
- ]);
122122
- SUPPORTED_INPUT_TYPES = new Set([
122123
- "text",
122124
- "search",
122125
- "hidden",
122126
- "date",
122127
- "number",
122128
- "tel",
122129
- "email"
122130
- ]);
122131
- });
122132
-
122133
122242
  // .tmp-runtime-src/state/stateless.ts
122134
122243
  function isStateless() {
122135
122244
  const v = process.env.UNBROWSE_STATELESS;
@@ -123558,7 +123667,7 @@ function isProtobufContentType(contentType) {
123558
123667
  function isProtobufLikeEndpoint(url, contentType) {
123559
123668
  if (isProtobufContentType(contentType))
123560
123669
  return true;
123561
- return /\/(field-data-proto|proto|protobuf)(\/|$|-)/i.test(url);
123670
+ return /[-/](proto|protobuf)(\/|$|-)/i.test(url);
123562
123671
  }
123563
123672
  function decodeProtobufBytes(bytes) {
123564
123673
  return decodeBytes(bytes, "bytes");
package/runtime/mcp.js CHANGED
@@ -36310,7 +36310,7 @@ var init_cached_resolution = __esm(() => {
36310
36310
  });
36311
36311
 
36312
36312
  // .tmp-runtime-src/build-info.generated.ts
36313
- var BUILD_RELEASE_VERSION = "9.6.2", BUILD_GIT_SHA = "d2d14a6629a0", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS42LjIiLCJnaXRfc2hhIjoiZDJkMTRhNjYyOWEwIiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUBkMmQxNGE2NjI5YTAiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA0OjE3OjAzLjg1M1oifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "OefvyW2iLVVPQ-0HMg9Mz-bciCmP8LE5u7fVEss_98E", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
36313
+ var BUILD_RELEASE_VERSION = "9.8.0", BUILD_GIT_SHA = "255142bb4c25", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS44LjAiLCJnaXRfc2hhIjoiMjU1MTQyYmI0YzI1IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUAyNTUxNDJiYjRjMjUiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA2OjA4OjA2LjY3MVoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "Tw3ScHlFYGaEtPwKLhcPI_lgQUgjAZmhWKSi4fDFMw4", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
36314
36314
 
36315
36315
  // .tmp-runtime-src/version.ts
36316
36316
  import { createHash as createHash4 } from "crypto";
@@ -43252,6 +43252,42 @@ function urlPathLooksListLike(contextUrl) {
43252
43252
  return false;
43253
43253
  }
43254
43254
  }
43255
+ function entityPointerTemplate(href) {
43256
+ let path5 = href;
43257
+ try {
43258
+ path5 = new URL(href, "https://_").pathname;
43259
+ } catch {
43260
+ path5 = href.split("?")[0];
43261
+ }
43262
+ const segs = path5.split("/").filter(Boolean);
43263
+ if (segs.length === 0)
43264
+ return null;
43265
+ const shape = [];
43266
+ let hasId = false;
43267
+ for (const s of segs) {
43268
+ const low = s.toLowerCase();
43269
+ if (/\d{3,}/.test(low) || low.length > 30 || /^[0-9a-f-]{8,}$/.test(low) || /-\d{2,}$/.test(low)) {
43270
+ shape.push("{id}");
43271
+ hasId = true;
43272
+ } else {
43273
+ shape.push(low);
43274
+ }
43275
+ }
43276
+ return hasId ? shape.slice(0, 3).join("/") : null;
43277
+ }
43278
+ function linksFormEntityCollection(hrefs, min = 4) {
43279
+ const groups = new Map;
43280
+ for (const href of hrefs) {
43281
+ const t = entityPointerTemplate(href);
43282
+ if (!t)
43283
+ continue;
43284
+ const n = (groups.get(t) ?? 0) + 1;
43285
+ if (n >= min)
43286
+ return true;
43287
+ groups.set(t, n);
43288
+ }
43289
+ return false;
43290
+ }
43255
43291
  function cardinalityMatches(intent, subject, opts) {
43256
43292
  const wantsMany = isListLikeIntent(intent) || urlPathLooksListLike(opts?.contextUrl);
43257
43293
  if (!wantsMany)
@@ -43827,6 +43863,7 @@ __export(exports_capture, {
43827
43863
  tagRequestProvenance: () => tagRequestProvenance,
43828
43864
  shutdownAllBrowsers: () => shutdownAllBrowsers,
43829
43865
  shouldStopHydrationWait: () => shouldStopHydrationWait,
43866
+ shouldScrollStimulate: () => shouldScrollStimulate,
43830
43867
  selectPerformanceReplayCandidates: () => selectPerformanceReplayCandidates,
43831
43868
  registerDocumentStartScript: () => registerDocumentStartScript,
43832
43869
  navigatePageForCapture: () => navigatePageForCapture,
@@ -44089,6 +44126,9 @@ function extractRouteHint(url) {
44089
44126
  } catch {}
44090
44127
  return null;
44091
44128
  }
44129
+ function shouldScrollStimulate(captureUrl, intent) {
44130
+ return isListLikeIntent(intent) || urlPathLooksListLike(captureUrl);
44131
+ }
44092
44132
  function deriveIntentHints(captureUrl, intent) {
44093
44133
  const derivedHints = new Set;
44094
44134
  if (captureUrl) {
@@ -44716,8 +44756,7 @@ async function waitForContentReady(tabId, captureUrl, intent, responseBodies) {
44716
44756
  log("capture", `intent-aware wait: already captured API matching one of [${[...derivedHints].join(", ")}], skipping`);
44717
44757
  }
44718
44758
  }
44719
- const lowerIntent = intent?.toLowerCase() ?? "";
44720
- if (captureUrl && responseBodies && (/search|explore|trending|tabs|discover/i.test(captureUrl) || /\b(person|people|profile|profiles|user|users|member|members|company|companies|organization|organisations|business|post|posts|tweet|tweets|status|statuses)\b/.test(lowerIntent))) {
44759
+ if (captureUrl && responseBodies && shouldScrollStimulate(captureUrl, intent)) {
44721
44760
  try {
44722
44761
  const before = responseBodies.size;
44723
44762
  await evaluate(tabId, "window.scrollTo(0, Math.max(window.innerHeight, Math.min(document.body.scrollHeight, window.innerHeight * 2)))");
@@ -45791,6 +45830,7 @@ var init_capture = __esm(async () => {
45791
45830
  init_domain();
45792
45831
  init_logger();
45793
45832
  init_header_classify();
45833
+ init_cardinality();
45794
45834
  init_browser_access();
45795
45835
  await init_vault();
45796
45836
  waitQueue = [];
@@ -54383,6 +54423,215 @@ var init_curl_impersonate_fallback = __esm(() => {
54383
54423
  };
54384
54424
  });
54385
54425
 
54426
+ // .tmp-runtime-src/execution/search-forms.ts
54427
+ var exports_search_forms = {};
54428
+ __export(exports_search_forms, {
54429
+ isStructuredSearchForm: () => isStructuredSearchForm,
54430
+ fillSearchRoute: () => fillSearchRoute,
54431
+ detectSearchForms: () => detectSearchForms,
54432
+ deriveSearchRouteTemplates: () => deriveSearchRouteTemplates
54433
+ });
54434
+ function deriveSearchRouteTemplates(html, minDistinct = 4) {
54435
+ const hrefs = new Set;
54436
+ for (const m of html.matchAll(/href\s*=\s*["'](\/[^"'?#\s]+)["']/gi))
54437
+ hrefs.add(m[1]);
54438
+ const groups = new Map;
54439
+ for (const h of hrefs) {
54440
+ const segs = h.split("/").filter(Boolean);
54441
+ if (segs.length < 1 || segs.length > 4)
54442
+ continue;
54443
+ for (let i = 0;i < segs.length; i++) {
54444
+ const val = segs[i];
54445
+ if (!/^[a-z][a-z0-9-]{1,40}$/i.test(val) || /\d{3,}/.test(val) || /\.[a-z0-9]{1,5}$/i.test(val))
54446
+ continue;
54447
+ const shape = segs.map((s, j) => j === i ? "{query}" : s).join("/");
54448
+ const trailing = h.endsWith("/") ? "/" : "";
54449
+ const key = `/${shape}${trailing}`;
54450
+ if (!groups.has(key))
54451
+ groups.set(key, new Set);
54452
+ groups.get(key).add(val.toLowerCase());
54453
+ }
54454
+ }
54455
+ const out = [];
54456
+ for (const [template, vals] of groups) {
54457
+ if (vals.size >= minDistinct)
54458
+ out.push({ template, samples: [...vals].slice(0, 5), count: vals.size });
54459
+ }
54460
+ return out.sort((a, b) => b.count - a.count);
54461
+ }
54462
+ function fillSearchRoute(origin, template, query) {
54463
+ const slug = encodeURIComponent(query.trim().toLowerCase());
54464
+ return origin.replace(/\/+$/, "") + template.replace("{query}", slug);
54465
+ }
54466
+ function isStructuredSearchForm(spec) {
54467
+ return spec.fields.length > 0 && !!spec.submit_selector;
54468
+ }
54469
+ function formSelectorFromElement(attribs, index) {
54470
+ const id = attribs.id;
54471
+ if (id)
54472
+ return `form#${id}`;
54473
+ const name = attribs.name;
54474
+ if (name)
54475
+ return `form[name="${name}"]`;
54476
+ const action2 = attribs.action;
54477
+ if (action2)
54478
+ return `form[action="${action2}"]`;
54479
+ return `form:nth-of-type(${index + 1})`;
54480
+ }
54481
+ function inputSelectorFromElement(attribs, tagName) {
54482
+ const id = attribs.id;
54483
+ if (id)
54484
+ return `#${id}`;
54485
+ const name = attribs.name;
54486
+ if (name)
54487
+ return `${tagName}[name="${name}"]`;
54488
+ return tagName;
54489
+ }
54490
+ function mapInputType(typeAttr, tagName) {
54491
+ if (tagName === "select")
54492
+ return "select";
54493
+ if (tagName === "textarea")
54494
+ return "text";
54495
+ const t = (typeAttr ?? "text").toLowerCase();
54496
+ if (t === "radio")
54497
+ return "radio";
54498
+ if (t === "checkbox")
54499
+ return "checkbox";
54500
+ if (t === "date")
54501
+ return "date";
54502
+ if (t === "hidden")
54503
+ return "hidden";
54504
+ if (t === "submit" || t === "button" || t === "image" || t === "reset")
54505
+ return null;
54506
+ if (t === "password" || t === "file")
54507
+ return null;
54508
+ if (SUPPORTED_INPUT_TYPES.has(t))
54509
+ return "text";
54510
+ return "text";
54511
+ }
54512
+ function parseAttrs(attrStr) {
54513
+ const attrs = {};
54514
+ const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
54515
+ let m;
54516
+ while ((m = attrRegex.exec(attrStr)) !== null) {
54517
+ attrs[m[1]] = m[2] ?? m[3] ?? m[4] ?? "";
54518
+ }
54519
+ return attrs;
54520
+ }
54521
+ function detectSearchForms(html) {
54522
+ const results = [];
54523
+ const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
54524
+ let formMatch;
54525
+ let formIndex = 0;
54526
+ while ((formMatch = formRegex.exec(html)) !== null) {
54527
+ const formAttrs = formMatch[1];
54528
+ const formBody = formMatch[2];
54529
+ const formElAttrs = parseAttrs(formAttrs);
54530
+ const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
54531
+ let fieldMatch;
54532
+ const fields = [];
54533
+ const seenNames = new Set;
54534
+ let hasLoginField = false;
54535
+ let hasSearchLikeField = false;
54536
+ while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
54537
+ const tagName = fieldMatch[1].toLowerCase();
54538
+ const fieldAttrs = parseAttrs(fieldMatch[2]);
54539
+ const name = fieldAttrs.name ?? "";
54540
+ const typeAttr = fieldAttrs.type;
54541
+ if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || typeAttr === "password") {
54542
+ hasLoginField = true;
54543
+ }
54544
+ if (SEARCH_FIELD_NAMES.has(name.toLowerCase())) {
54545
+ hasSearchLikeField = true;
54546
+ }
54547
+ const mappedType = mapInputType(typeAttr, tagName);
54548
+ if (!mappedType)
54549
+ continue;
54550
+ if (!name && mappedType !== "text")
54551
+ continue;
54552
+ if (seenNames.has(name) && mappedType !== "radio")
54553
+ continue;
54554
+ if (name)
54555
+ seenNames.add(name);
54556
+ let options;
54557
+ if (tagName === "select") {
54558
+ const optRegex = /<option[^>]*value="([^"]*)"[^>]*>/gi;
54559
+ let optMatch;
54560
+ options = [];
54561
+ while ((optMatch = optRegex.exec(formBody)) !== null) {
54562
+ options.push(optMatch[1]);
54563
+ }
54564
+ if (options.length === 0)
54565
+ options = undefined;
54566
+ }
54567
+ fields.push({
54568
+ name: name || `unnamed_${fields.length}`,
54569
+ type: mappedType,
54570
+ selector: inputSelectorFromElement(fieldAttrs, tagName),
54571
+ ...options ? { options } : {},
54572
+ required: fieldAttrs.required !== undefined
54573
+ });
54574
+ }
54575
+ let submitSelector = "";
54576
+ if (/<button[^>]*type\s*=\s*"submit"/i.test(formBody)) {
54577
+ submitSelector = "button[type=submit]";
54578
+ } else if (/<input[^>]*type\s*=\s*"submit"/i.test(formBody)) {
54579
+ submitSelector = 'input[type="submit"]';
54580
+ } else if (/<button/i.test(formBody)) {
54581
+ submitSelector = "button";
54582
+ }
54583
+ const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
54584
+ if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
54585
+ const formSelector = formSelectorFromElement(formElAttrs, formIndex);
54586
+ results.push({
54587
+ form_selector: formSelector,
54588
+ submit_selector: submitSelector,
54589
+ fields
54590
+ });
54591
+ }
54592
+ formIndex++;
54593
+ }
54594
+ return results;
54595
+ }
54596
+ var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
54597
+ var init_search_forms = __esm(() => {
54598
+ SEARCH_FIELD_NAMES = new Set([
54599
+ "q",
54600
+ "query",
54601
+ "search",
54602
+ "keyword",
54603
+ "keywords",
54604
+ "term",
54605
+ "terms",
54606
+ "find",
54607
+ "lookup",
54608
+ "filter",
54609
+ "s",
54610
+ "text",
54611
+ "input"
54612
+ ]);
54613
+ LOGIN_FIELD_NAMES = new Set([
54614
+ "password",
54615
+ "passwd",
54616
+ "pass",
54617
+ "pwd",
54618
+ "confirm_password",
54619
+ "username",
54620
+ "email",
54621
+ "login",
54622
+ "user"
54623
+ ]);
54624
+ SUPPORTED_INPUT_TYPES = new Set([
54625
+ "text",
54626
+ "search",
54627
+ "hidden",
54628
+ "date",
54629
+ "number",
54630
+ "tel",
54631
+ "email"
54632
+ ]);
54633
+ });
54634
+
54386
54635
  // node_modules/.bun/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Event.js
54387
54636
  var require_Event = __commonJS((exports, module) => {
54388
54637
  module.exports = Event2;
@@ -71034,20 +71283,24 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
71034
71283
  const hits = intentTokens.filter((tok) => haystack.includes(tok));
71035
71284
  const hitRate = hits.length / intentTokens.length;
71036
71285
  if (hitRate < 0.34) {
71037
- return {
71038
- rejected: true,
71039
- reason: "intent_mismatch",
71040
- evidence: {
71041
- intent_tokens: intentTokens,
71042
- response_token_hits: hits,
71043
- response_token_hit_rate: hitRate,
71044
- html_bytes: html.length
71045
- }
71046
- };
71286
+ const isCollection = isListLikeIntent(intent) && linksFormEntityCollection(Array.from(html.matchAll(/href\s*=\s*["']([^"']+)["']/gi), (m) => m[1]));
71287
+ if (!isCollection) {
71288
+ return {
71289
+ rejected: true,
71290
+ reason: "intent_mismatch",
71291
+ evidence: {
71292
+ intent_tokens: intentTokens,
71293
+ response_token_hits: hits,
71294
+ response_token_hit_rate: hitRate,
71295
+ html_bytes: html.length
71296
+ }
71297
+ };
71298
+ }
71047
71299
  }
71048
71300
  }
71049
71301
  }
71050
71302
  const { url_template, input_params, path_params, query } = extractHtmlHoles(url);
71303
+ const routing_candidates = buildSearchRouteCandidates(html, url, intent);
71051
71304
  return {
71052
71305
  rejected: false,
71053
71306
  title,
@@ -71061,12 +71314,40 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
71061
71314
  text_excerpt: bodyText.slice(0, MARKDOWN_BUDGET),
71062
71315
  markdown: htmlToMarkdownSafe(html, bodyText),
71063
71316
  tables: extractTables(html),
71317
+ ...routing_candidates.length > 0 ? { routing_candidates } : {},
71064
71318
  extraction: {
71065
71319
  source: "direct-document",
71066
71320
  rejected: false
71067
71321
  }
71068
71322
  };
71069
71323
  }
71324
+ function intentQueryTerm(intent, url) {
71325
+ let domTokens = new Set;
71326
+ try {
71327
+ domTokens = new Set(new URL(url).hostname.toLowerCase().split(/[.-]/));
71328
+ } catch {}
71329
+ const toks = (intent.toLowerCase().match(/[a-z][a-z0-9]{2,}/g) ?? []).filter((t) => !QUERY_STOPWORDS.has(t) && !domTokens.has(t));
71330
+ return [...new Set(toks)].join(" ").trim();
71331
+ }
71332
+ function buildSearchRouteCandidates(html, url, intent) {
71333
+ if (!intent || !isListLikeIntent(intent))
71334
+ return [];
71335
+ const queryTerm = intentQueryTerm(intent, url);
71336
+ if (!queryTerm)
71337
+ return [];
71338
+ let origin = "";
71339
+ try {
71340
+ origin = new URL(url).origin;
71341
+ } catch {
71342
+ return [];
71343
+ }
71344
+ return deriveSearchRouteTemplates(html).slice(0, 3).map((t) => ({
71345
+ url: fillSearchRoute(origin, t.template, queryTerm),
71346
+ template: t.template,
71347
+ query: queryTerm,
71348
+ samples: t.samples
71349
+ }));
71350
+ }
71070
71351
  async function fetchDirectDocument(url) {
71071
71352
  if (!isDirectDocumentEligibleUrl(url))
71072
71353
  return null;
@@ -71254,10 +71535,12 @@ function cellText(html) {
71254
71535
  function decodeHtmlEntityText(input) {
71255
71536
  return input.replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, '"').replace(/&#39;/g, "'");
71256
71537
  }
71257
- var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, buildBloombergDirectDocumentResult;
71538
+ var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, QUERY_STOPWORDS, buildBloombergDirectDocumentResult;
71258
71539
  var init_direct_document = __esm(() => {
71259
71540
  init_curl_impersonate_fallback();
71260
71541
  init_proxy_fetch();
71542
+ init_cardinality();
71543
+ init_search_forms();
71261
71544
  HTML_RE = /text\/html|application\/xhtml\+xml/i;
71262
71545
  CHALLENGE_RE = /\b(access denied|are you a robot|captcha|just a moment|pardon our interruption|robot check|unusual traffic|verify you are human)\b/i;
71263
71546
  INTERSTITIAL_RE = /\b(please wait for verification|just a moment|cf-mitigated|datadome|akamai bot|perimeterx|sign in to continue|log in to (?:continue|access)|javascript is not available)\b/i;
@@ -71340,6 +71623,7 @@ var init_direct_document = __esm(() => {
71340
71623
  "look"
71341
71624
  ]);
71342
71625
  MARKDOWN_BUDGET = Math.max(1000, Number(process.env.UNBROWSE_MARKDOWN_BUDGET ?? "12000") || 12000);
71626
+ QUERY_STOPWORDS = new Set(("resolve unbrowse execute run walk go fetch open view want need please " + "find search browse list lookup discover show get me a an the on of for in to " + "with and or all my your this that some good best top new latest cheap near").split(" "));
71343
71627
  buildBloombergDirectDocumentResult = buildDirectDocumentResult;
71344
71628
  });
71345
71629
 
@@ -120052,181 +120336,6 @@ function clampToFloor(score, demotion, floor) {
120052
120336
  }
120053
120337
  var HARD_NEGATIVE_FLOOR = -2000, WEAK_NEGATIVE_FLOOR = -400, PAGE_ARTIFACT_DEMOTION = 800, EMPTY_ENTITY_BAG_DEMOTION = 650, EMPTY_ENTITY_BAG_FLOOR = -700;
120054
120338
 
120055
- // .tmp-runtime-src/execution/search-forms.ts
120056
- var exports_search_forms = {};
120057
- __export(exports_search_forms, {
120058
- isStructuredSearchForm: () => isStructuredSearchForm,
120059
- detectSearchForms: () => detectSearchForms
120060
- });
120061
- function isStructuredSearchForm(spec) {
120062
- return spec.fields.length > 0 && !!spec.submit_selector;
120063
- }
120064
- function formSelectorFromElement(attribs, index2) {
120065
- const id = attribs.id;
120066
- if (id)
120067
- return `form#${id}`;
120068
- const name = attribs.name;
120069
- if (name)
120070
- return `form[name="${name}"]`;
120071
- const action2 = attribs.action;
120072
- if (action2)
120073
- return `form[action="${action2}"]`;
120074
- return `form:nth-of-type(${index2 + 1})`;
120075
- }
120076
- function inputSelectorFromElement(attribs, tagName) {
120077
- const id = attribs.id;
120078
- if (id)
120079
- return `#${id}`;
120080
- const name = attribs.name;
120081
- if (name)
120082
- return `${tagName}[name="${name}"]`;
120083
- return tagName;
120084
- }
120085
- function mapInputType(typeAttr, tagName) {
120086
- if (tagName === "select")
120087
- return "select";
120088
- if (tagName === "textarea")
120089
- return "text";
120090
- const t = (typeAttr ?? "text").toLowerCase();
120091
- if (t === "radio")
120092
- return "radio";
120093
- if (t === "checkbox")
120094
- return "checkbox";
120095
- if (t === "date")
120096
- return "date";
120097
- if (t === "hidden")
120098
- return "hidden";
120099
- if (t === "submit" || t === "button" || t === "image" || t === "reset")
120100
- return null;
120101
- if (t === "password" || t === "file")
120102
- return null;
120103
- if (SUPPORTED_INPUT_TYPES.has(t))
120104
- return "text";
120105
- return "text";
120106
- }
120107
- function parseAttrs(attrStr) {
120108
- const attrs = {};
120109
- const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
120110
- let m;
120111
- while ((m = attrRegex.exec(attrStr)) !== null) {
120112
- attrs[m[1]] = m[2] ?? m[3] ?? m[4] ?? "";
120113
- }
120114
- return attrs;
120115
- }
120116
- function detectSearchForms(html3) {
120117
- const results = [];
120118
- const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
120119
- let formMatch;
120120
- let formIndex = 0;
120121
- while ((formMatch = formRegex.exec(html3)) !== null) {
120122
- const formAttrs = formMatch[1];
120123
- const formBody = formMatch[2];
120124
- const formElAttrs = parseAttrs(formAttrs);
120125
- const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
120126
- let fieldMatch;
120127
- const fields = [];
120128
- const seenNames = new Set;
120129
- let hasLoginField = false;
120130
- let hasSearchLikeField = false;
120131
- while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
120132
- const tagName = fieldMatch[1].toLowerCase();
120133
- const fieldAttrs = parseAttrs(fieldMatch[2]);
120134
- const name = fieldAttrs.name ?? "";
120135
- const typeAttr = fieldAttrs.type;
120136
- if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || typeAttr === "password") {
120137
- hasLoginField = true;
120138
- }
120139
- if (SEARCH_FIELD_NAMES.has(name.toLowerCase())) {
120140
- hasSearchLikeField = true;
120141
- }
120142
- const mappedType = mapInputType(typeAttr, tagName);
120143
- if (!mappedType)
120144
- continue;
120145
- if (!name && mappedType !== "text")
120146
- continue;
120147
- if (seenNames.has(name) && mappedType !== "radio")
120148
- continue;
120149
- if (name)
120150
- seenNames.add(name);
120151
- let options;
120152
- if (tagName === "select") {
120153
- const optRegex = /<option[^>]*value="([^"]*)"[^>]*>/gi;
120154
- let optMatch;
120155
- options = [];
120156
- while ((optMatch = optRegex.exec(formBody)) !== null) {
120157
- options.push(optMatch[1]);
120158
- }
120159
- if (options.length === 0)
120160
- options = undefined;
120161
- }
120162
- fields.push({
120163
- name: name || `unnamed_${fields.length}`,
120164
- type: mappedType,
120165
- selector: inputSelectorFromElement(fieldAttrs, tagName),
120166
- ...options ? { options } : {},
120167
- required: fieldAttrs.required !== undefined
120168
- });
120169
- }
120170
- let submitSelector = "";
120171
- if (/<button[^>]*type\s*=\s*"submit"/i.test(formBody)) {
120172
- submitSelector = "button[type=submit]";
120173
- } else if (/<input[^>]*type\s*=\s*"submit"/i.test(formBody)) {
120174
- submitSelector = 'input[type="submit"]';
120175
- } else if (/<button/i.test(formBody)) {
120176
- submitSelector = "button";
120177
- }
120178
- const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
120179
- if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
120180
- const formSelector = formSelectorFromElement(formElAttrs, formIndex);
120181
- results.push({
120182
- form_selector: formSelector,
120183
- submit_selector: submitSelector,
120184
- fields
120185
- });
120186
- }
120187
- formIndex++;
120188
- }
120189
- return results;
120190
- }
120191
- var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
120192
- var init_search_forms = __esm(() => {
120193
- SEARCH_FIELD_NAMES = new Set([
120194
- "q",
120195
- "query",
120196
- "search",
120197
- "keyword",
120198
- "keywords",
120199
- "term",
120200
- "terms",
120201
- "find",
120202
- "lookup",
120203
- "filter",
120204
- "s",
120205
- "text",
120206
- "input"
120207
- ]);
120208
- LOGIN_FIELD_NAMES = new Set([
120209
- "password",
120210
- "passwd",
120211
- "pass",
120212
- "pwd",
120213
- "confirm_password",
120214
- "username",
120215
- "email",
120216
- "login",
120217
- "user"
120218
- ]);
120219
- SUPPORTED_INPUT_TYPES = new Set([
120220
- "text",
120221
- "search",
120222
- "hidden",
120223
- "date",
120224
- "number",
120225
- "tel",
120226
- "email"
120227
- ]);
120228
- });
120229
-
120230
120339
  // .tmp-runtime-src/state/stateless.ts
120231
120340
  function isStateless() {
120232
120341
  const v = process.env.UNBROWSE_STATELESS;
@@ -121655,7 +121764,7 @@ function isProtobufContentType(contentType) {
121655
121764
  function isProtobufLikeEndpoint(url, contentType) {
121656
121765
  if (isProtobufContentType(contentType))
121657
121766
  return true;
121658
- return /\/(field-data-proto|proto|protobuf)(\/|$|-)/i.test(url);
121767
+ return /[-/](proto|protobuf)(\/|$|-)/i.test(url);
121659
121768
  }
121660
121769
  function decodeProtobufBytes(bytes) {
121661
121770
  return decodeBytes(bytes, "bytes");
Binary file
@@ -2,7 +2,7 @@
2
2
  "repo_url": "https://github.com/justrach/kuri.git",
3
3
  "branch": "adding-extensions",
4
4
  "source_sha": "149881254046a20778f642b69f20f0c6468f6fb4",
5
- "built_at": "2026-06-18T03:58:10.362Z",
5
+ "built_at": "2026-06-18T05:53:02.585Z",
6
6
  "binaries": {
7
7
  "darwin-arm64": {
8
8
  "zig_target": "aarch64-macos",
@@ -21,11 +21,11 @@
21
21
  },
22
22
  "linux-x64": {
23
23
  "zig_target": "x86_64-linux",
24
- "sha256": "e73aecfbf07001ba0be5032118790eb253ad5d8d12caca6a1dd5ad3ccab44b9e"
24
+ "sha256": "f39955d73d86150fba2a4bec6393e7745feb42f5152870b6c27fd68a5cff3a6e"
25
25
  },
26
26
  "win-x64": {
27
27
  "zig_target": "x86_64-windows-gnu",
28
- "sha256": "9ecbc82be646e755e4664051cf345d54dde3c6610e457d763deff67895047963",
28
+ "sha256": "376a34f508ea6a4e140150f9f6ddc00519f8bb0894ee2dd7a60bc0e7613d89b0",
29
29
  "source": "pre-staged"
30
30
  }
31
31
  },
@@ -33,22 +33,22 @@
33
33
  "darwin-arm64": {
34
34
  "zig_target": "aarch64-macos",
35
35
  "lib": "libkuri_ffi.dylib",
36
- "sha256": "6c72cf383df4fa3f870b745da43d64eae8f67e58c6f971214ac29602fb649939"
36
+ "sha256": "2ca1be4d477f28c4a4ab1dd993cfe6766b9f6858c42befda3f503a3e2940bf6f"
37
37
  },
38
38
  "darwin-x64": {
39
39
  "zig_target": "x86_64-macos",
40
40
  "lib": "libkuri_ffi.dylib",
41
- "sha256": "82480772ddc8e44c8e34e70b80d7dc0969004942f77276587af450b62d3d2750"
41
+ "sha256": "80e27865e521b4bc6a79dfcb4fd481535d70e03226125ce10d9625ba02320f47"
42
42
  },
43
43
  "linux-arm64": {
44
44
  "zig_target": "aarch64-linux",
45
45
  "lib": "libkuri_ffi.so",
46
- "sha256": "ef8dfa2b634f04294f93a94472d9856ba777681afaab2d4213f0e29821882e07"
46
+ "sha256": "6fa04fc6b505212e5ae9cfdc0d21eb2f9b2eb97dc41850e02ba3f4112e252e9d"
47
47
  },
48
48
  "linux-x64": {
49
49
  "zig_target": "x86_64-linux",
50
50
  "lib": "libkuri_ffi.so",
51
- "sha256": "fb29ad2b71186d176306321d17e88074a67fea139991faef9aa4862333942c9e"
51
+ "sha256": "fa520df6cca6eab9260bd43ed7a60d665c746645f12f6fb4e3ecddbb026bdb05"
52
52
  }
53
53
  }
54
54
  }
Binary file