unbrowse 9.7.0 → 9.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "9.7.0",
3
+ "version": "9.8.0",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "git+https://github.com/unbrowse-ai/unbrowse.git"
package/runtime/cli.js CHANGED
@@ -2350,7 +2350,7 @@ var init_telemetry = __esm(() => {
2350
2350
  });
2351
2351
 
2352
2352
  // .tmp-runtime-src/build-info.generated.ts
2353
- var BUILD_RELEASE_VERSION = "9.7.0", BUILD_GIT_SHA = "98fa4d4472e2", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS43LjAiLCJnaXRfc2hhIjoiOThmYTRkNDQ3MmUyIiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUA5OGZhNGQ0NDcyZTIiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA0OjU4OjQwLjQ3NFoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "-b2L4xhYhJ-mLJeo39RL19HSAKgjhZ2_D3ezoXjvQLQ", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
2353
+ var BUILD_RELEASE_VERSION = "9.8.0", BUILD_GIT_SHA = "255142bb4c25", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS44LjAiLCJnaXRfc2hhIjoiMjU1MTQyYmI0YzI1IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUAyNTUxNDJiYjRjMjUiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA2OjA4OjA2LjY3MVoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "Tw3ScHlFYGaEtPwKLhcPI_lgQUgjAZmhWKSi4fDFMw4", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
2354
2354
 
2355
2355
  // .tmp-runtime-src/version.ts
2356
2356
  import { createHash as createHash7 } from "crypto";
@@ -46122,6 +46122,42 @@ function urlPathLooksListLike2(contextUrl) {
46122
46122
  return false;
46123
46123
  }
46124
46124
  }
46125
/**
 * Collapse a link target into a coarse "entity pointer" template.
 *
 * Each path segment that looks like an identifier (3+ consecutive digits,
 * longer than 30 characters, 8+ hex/dash characters, or ending in "-NN") is
 * replaced by "{id}"; every other segment is lower-cased and kept verbatim.
 * Only the first three segments are retained in the result, but an id-like
 * segment anywhere in the path is enough to produce a template.
 *
 * @param {string} href - Raw href value (absolute, root-relative or relative).
 * @returns {string|null} A template such as "products/{id}", or null when the
 *   href has no path segments, has no id-like segment, or uses a scheme other
 *   than http/https.
 */
function entityPointerTemplate(href) {
  // tel:, mailto:, javascript:, data: ... are not page links. Without this
  // guard their payload (phone numbers, addresses) is parsed as a "path" and
  // its digits are mistaken for entity ids.
  const scheme = /^\s*([a-z][a-z0-9+.-]*):/i.exec(href);
  if (scheme && !/^https?$/i.test(scheme[1])) {
    return null;
  }
  let path = href;
  try {
    // The placeholder base only exists so relative hrefs can be parsed.
    path = new URL(href, "https://_").pathname;
  } catch {
    // Unparseable href: use the raw text without query string or fragment.
    path = href.split(/[?#]/)[0];
  }
  const segments = path.split("/").filter(Boolean);
  if (segments.length === 0) {
    return null;
  }
  let hasId = false;
  const shape = segments.map((segment) => {
    const low = segment.toLowerCase();
    const looksLikeId =
      /\d{3,}/.test(low) ||
      low.length > 30 ||
      /^[0-9a-f-]{8,}$/.test(low) ||
      /-\d{2,}$/.test(low);
    if (looksLikeId) {
      hasId = true;
      return "{id}";
    }
    return low;
  });
  return hasId ? shape.slice(0, 3).join("/") : null;
}
46148
/**
 * Decide whether a set of links points at a collection of sibling entities,
 * i.e. whether at least `min` distinct hrefs share one entity-pointer template.
 *
 * Hrefs are de-duplicated per template: a single entity that is linked several
 * times (thumbnail, title, "read more") must not satisfy the threshold alone.
 *
 * @param {Iterable<string>} hrefs - Link targets found on the page.
 * @param {number} [min=4] - Distinct links required under one template.
 * @returns {boolean} True as soon as any template reaches `min` distinct hrefs.
 */
function linksFormEntityCollection(hrefs, min = 4) {
  const groups = new Map();
  for (const href of hrefs) {
    const template = entityPointerTemplate(href);
    if (!template) {
      continue;
    }
    let members = groups.get(template);
    if (!members) {
      members = new Set();
      groups.set(template, members);
    }
    members.add(href);
    if (members.size >= min) {
      return true;
    }
  }
  return false;
}
46125
46161
  function cardinalityMatches2(intent, subject, opts) {
46126
46162
  const wantsMany = isListLikeIntent2(intent) || urlPathLooksListLike2(opts?.contextUrl);
46127
46163
  if (!wantsMany)
@@ -56330,6 +56366,215 @@ var init_curl_impersonate_fallback = __esm(() => {
56330
56366
  };
56331
56367
  });
56332
56368
 
56369
+ // .tmp-runtime-src/execution/search-forms.ts
56370
+ var exports_search_forms = {};
56371
+ __export(exports_search_forms, {
56372
+ isStructuredSearchForm: () => isStructuredSearchForm,
56373
+ fillSearchRoute: () => fillSearchRoute,
56374
+ detectSearchForms: () => detectSearchForms,
56375
+ deriveSearchRouteTemplates: () => deriveSearchRouteTemplates
56376
+ });
56377
/**
 * Infer slug-style route templates (e.g. "/topics/{query}") from the
 * root-relative links of a page.
 *
 * For every link with 1-4 path segments, each slug-like segment is replaced in
 * turn by "{query}". A template is reported once at least `minDistinct`
 * different slug values have been seen in that position.
 *
 * @param {string} html - Page markup to scan for href attributes.
 * @param {number} [minDistinct=4] - Distinct slug values a template needs.
 * @returns {{template: string, samples: string[], count: number}[]} Templates
 *   ordered by descending number of distinct values; `samples` holds up to
 *   five lower-cased example values.
 */
function deriveSearchRouteTemplates(html, minDistinct = 4) {
  const hrefs = new Set();
  for (const match of html.matchAll(/href\s*=\s*["'](\/[^"'?#\s]+)["']/gi)) {
    // "//host/path" is protocol-relative and targets another origin; treating
    // it as a local path would yield templates like "/cdn.example.com/{query}".
    if (match[1].startsWith("//")) {
      continue;
    }
    hrefs.add(match[1]);
  }
  const groups = new Map();
  for (const href of hrefs) {
    const segs = href.split("/").filter(Boolean);
    if (segs.length < 1 || segs.length > 4) {
      continue;
    }
    const trailing = href.endsWith("/") ? "/" : "";
    for (let i = 0; i < segs.length; i++) {
      const val = segs[i];
      // A slug starts with a letter, contains only letters/digits/dashes and
      // has no 3+ digit run. This pattern already rejects dots, so file
      // extensions need no separate check.
      if (!/^[a-z][a-z0-9-]{1,40}$/i.test(val) || /\d{3,}/.test(val)) {
        continue;
      }
      const shape = segs.map((s, j) => (j === i ? "{query}" : s)).join("/");
      const key = `/${shape}${trailing}`;
      let values = groups.get(key);
      if (!values) {
        values = new Set();
        groups.set(key, values);
      }
      values.add(val.toLowerCase());
    }
  }
  const out = [];
  for (const [template, vals] of groups) {
    if (vals.size >= minDistinct) {
      out.push({ template, samples: [...vals].slice(0, 5), count: vals.size });
    }
  }
  return out.sort((a, b) => b.count - a.count);
}
56405
/**
 * Build a concrete URL from an origin, a route template and a query.
 *
 * @param {string} origin - Site origin; trailing slashes are removed.
 * @param {string} template - Route template containing "{query}".
 * @param {string} query - Search text; trimmed, lower-cased and URI-encoded.
 * @returns {string} The origin followed by the template with its first
 *   "{query}" placeholder substituted.
 */
function fillSearchRoute(origin, template, query) {
  const base = origin.replace(/\/+$/, "");
  const encoded = encodeURIComponent(query.trim().toLowerCase());
  const route = template.replace("{query}", encoded);
  return base + route;
}
56409
/**
 * Tell whether a detected form spec is usable as a structured search form.
 *
 * @param {{fields: Array, submit_selector?: string}} spec - Form description.
 * @returns {boolean} True when the spec has at least one field and a
 *   non-empty submit selector.
 */
function isStructuredSearchForm(spec) {
  if (!(spec.fields.length > 0)) {
    return false;
  }
  return Boolean(spec.submit_selector);
}
56412
/**
 * Build a CSS selector for a form from its parsed attributes.
 *
 * Preference order: id, name, action, then positional `:nth-of-type`.
 * Attribute values come from untrusted markup, so they are only emitted raw
 * when they are plain CSS identifiers; otherwise a quoted attribute selector
 * with escaped quotes/backslashes is produced so the selector stays valid.
 *
 * @param {Record<string, string>} attribs - Parsed attributes of the form tag.
 * @param {number} index - Zero-based position of the form among matched forms.
 * @returns {string} CSS selector for the form.
 */
function formSelectorFromElement(attribs, index) {
  // Quote a value for use inside [attr="..."].
  const quote = (value) => `"${String(value).replace(/["\\]/g, "\\$&")}"`;
  const { id, name, action } = attribs;
  if (id) {
    // "#id" is only valid for identifier-like ids (no spaces, no leading digit).
    return /^-?[A-Za-z_][\w-]*$/.test(id) ? `form#${id}` : `form[id=${quote(id)}]`;
  }
  if (name) {
    return `form[name=${quote(name)}]`;
  }
  if (action) {
    return `form[action=${quote(action)}]`;
  }
  return `form:nth-of-type(${index + 1})`;
}
56424
/**
 * Build a CSS selector for a form control from its parsed attributes.
 *
 * Preference order: id, name, then the bare tag name. Values that are not
 * plain CSS identifiers (or that contain quotes/backslashes) are emitted as
 * quoted, escaped attribute selectors so the result is always a valid selector.
 *
 * @param {Record<string, string>} attribs - Parsed attributes of the control.
 * @param {string} tagName - Lower-cased tag name (input, select, textarea).
 * @returns {string} CSS selector for the control.
 */
function inputSelectorFromElement(attribs, tagName) {
  // Quote a value for use inside [attr="..."].
  const quote = (value) => `"${String(value).replace(/["\\]/g, "\\$&")}"`;
  const { id, name } = attribs;
  if (id) {
    // "#id" breaks for ids such as "search.box" or "2nd"; fall back to [id="..."].
    return /^-?[A-Za-z_][\w-]*$/.test(id) ? `#${id}` : `[id=${quote(id)}]`;
  }
  if (name) {
    return `${tagName}[name=${quote(name)}]`;
  }
  return tagName;
}
56433
/**
 * Map an HTML control to the field type used in search-form specs.
 *
 * @param {string|undefined} typeAttr - Value of the control's `type` attribute.
 * @param {string} tagName - Lower-cased tag name of the control.
 * @returns {"select"|"text"|"radio"|"checkbox"|"date"|"hidden"|null} The
 *   mapped type, or null for controls that are never fillable search fields
 *   (submit/button/image/reset/password/file).
 */
function mapInputType(typeAttr, tagName) {
  if (tagName === "select") {
    return "select";
  }
  if (tagName === "textarea") {
    return "text";
  }
  const type = (typeAttr ?? "text").toLowerCase();
  switch (type) {
    case "radio":
    case "checkbox":
    case "date":
    case "hidden":
      return type;
    case "submit":
    case "button":
    case "image":
    case "reset":
    case "password":
    case "file":
      return null;
    default:
      // Every remaining type (text, search, number, tel, email, unknown ...)
      // is treated as free text. The former SUPPORTED_INPUT_TYPES lookup
      // returned "text" on both branches, so it was dropped.
      return "text";
  }
}
56455
/**
 * Parse the attribute portion of an HTML start tag into a plain object.
 *
 * Supports double-quoted, single-quoted, unquoted and value-less attributes.
 * Names are lower-cased because HTML attribute names are case-insensitive
 * (`NAME="q"` must be readable as `attrs.name`), and the first occurrence of a
 * duplicated attribute wins, as in HTML parsing.
 *
 * @param {string} attrStr - Text between the tag name and the closing ">".
 * @returns {Record<string, string>} Attribute map; value-less attributes map
 *   to the empty string.
 */
function parseAttrs(attrStr) {
  const attrs = {};
  const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
  for (const match of attrStr.matchAll(attrRegex)) {
    const key = match[1].toLowerCase();
    if (Object.prototype.hasOwnProperty.call(attrs, key)) {
      continue;
    }
    attrs[key] = match[2] ?? match[3] ?? match[4] ?? "";
  }
  return attrs;
}
56464
/**
 * Scan markup for forms that can be filled and submitted as a search.
 *
 * A form is reported when it has a submit control, at least one non-hidden
 * field, and no login-style field (a name in LOGIN_FIELD_NAMES or a password
 * input). The original also tracked SEARCH_FIELD_NAMES, but that flag could
 * never change the outcome (it was OR-ed with a condition that was already
 * required), so it is no longer computed.
 *
 * @param {string} html - Page markup.
 * @returns {{form_selector: string, submit_selector: string, fields: object[]}[]}
 *   One spec per accepted form. Each field has `name`, `type`, `selector`,
 *   `required`, and for selects with valued options, `options`.
 */
function detectSearchForms(html) {
  const results = [];
  const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
  // value="x", value='x' or value=x on an <option> start tag.
  const optionValueRegex = /<option\b[^>]*?\svalue\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/gi;
  // type=submit in any quoting style.
  const submitButtonRegex = /<button\b[^>]*\stype\s*=\s*["']?submit(?=["'\s\/>])/i;
  const submitInputRegex = /<input\b[^>]*\stype\s*=\s*["']?submit(?=["'\s\/>])/i;
  let formMatch;
  let formIndex = 0;
  while ((formMatch = formRegex.exec(html)) !== null) {
    const formElAttrs = parseAttrs(formMatch[1]);
    const formBody = formMatch[2];
    const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
    const fields = [];
    const seenNames = new Set();
    let hasLoginField = false;
    let fieldMatch;
    while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
      const tagName = fieldMatch[1].toLowerCase();
      const fieldAttrs = parseAttrs(fieldMatch[2]);
      const name = fieldAttrs.name ?? "";
      const typeAttr = fieldAttrs.type;
      // Attribute values are case-insensitive here: type="Password" is a login field too.
      if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || (typeAttr ?? "").toLowerCase() === "password") {
        hasLoginField = true;
      }
      const mappedType = mapInputType(typeAttr, tagName);
      if (!mappedType) {
        continue;
      }
      // Unnamed controls are only kept when they are free-text inputs.
      if (!name && mappedType !== "text") {
        continue;
      }
      // Radio groups legitimately repeat a name; anything else is a duplicate.
      if (seenNames.has(name) && mappedType !== "radio") {
        continue;
      }
      if (name) {
        seenNames.add(name);
      }
      let options;
      if (tagName === "select") {
        // Only read options that belong to THIS select: from the end of its
        // start tag up to its closing tag (or the next select if unclosed).
        const bodyStart = fieldRegex.lastIndex;
        const boundary = /<\/select\b|<select\b/gi;
        boundary.lastIndex = bodyStart;
        const end = boundary.exec(formBody);
        const selectBody = formBody.slice(bodyStart, end ? end.index : formBody.length);
        const values = Array.from(selectBody.matchAll(optionValueRegex), (m) => m[1] ?? m[2] ?? m[3]);
        options = values.length > 0 ? values : undefined;
      }
      fields.push({
        name: name || `unnamed_${fields.length}`,
        type: mappedType,
        selector: inputSelectorFromElement(fieldAttrs, tagName),
        ...(options ? { options } : {}),
        required: fieldAttrs.required !== undefined
      });
    }
    let submitSelector = "";
    if (submitButtonRegex.test(formBody)) {
      submitSelector = "button[type=submit]";
    } else if (submitInputRegex.test(formBody)) {
      submitSelector = 'input[type="submit"]';
    } else if (/<button/i.test(formBody)) {
      // A button without an explicit type still submits its form by default.
      submitSelector = "button";
    }
    const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
    if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector) {
      results.push({
        form_selector: formSelectorFromElement(formElAttrs, formIndex),
        submit_selector: submitSelector,
        fields
      });
    }
    formIndex++;
  }
  return results;
}
56539
var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
// Initializer for the search-forms module: fills the three lookup sets above.
// NOTE(review): __esm is defined elsewhere in the bundle; presumably it makes
// this callback run once on first use — confirm against the bundler runtime.
var init_search_forms = __esm(() => {
  // Field names that suggest a free-text search box.
  const searchNames = [
    "q", "query", "search", "keyword", "keywords", "term", "terms",
    "find", "lookup", "filter", "s", "text", "input"
  ];
  // Field names that mark a form as a login/registration form.
  const loginNames = [
    "password", "passwd", "pass", "pwd", "confirm_password",
    "username", "email", "login", "user"
  ];
  // Input `type` values listed as supported.
  const inputTypes = ["text", "search", "hidden", "date", "number", "tel", "email"];
  SEARCH_FIELD_NAMES = new Set(searchNames);
  LOGIN_FIELD_NAMES = new Set(loginNames);
  SUPPORTED_INPUT_TYPES = new Set(inputTypes);
});
56577
+
56333
56578
  // node_modules/.bun/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Event.js
56334
56579
  var require_Event = __commonJS((exports, module) => {
56335
56580
  module.exports = Event2;
@@ -72981,20 +73226,24 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
72981
73226
  const hits = intentTokens.filter((tok) => haystack.includes(tok));
72982
73227
  const hitRate = hits.length / intentTokens.length;
72983
73228
  if (hitRate < 0.34) {
72984
- return {
72985
- rejected: true,
72986
- reason: "intent_mismatch",
72987
- evidence: {
72988
- intent_tokens: intentTokens,
72989
- response_token_hits: hits,
72990
- response_token_hit_rate: hitRate,
72991
- html_bytes: html.length
72992
- }
72993
- };
73229
+ const isCollection = isListLikeIntent2(intent) && linksFormEntityCollection(Array.from(html.matchAll(/href\s*=\s*["']([^"']+)["']/gi), (m) => m[1]));
73230
+ if (!isCollection) {
73231
+ return {
73232
+ rejected: true,
73233
+ reason: "intent_mismatch",
73234
+ evidence: {
73235
+ intent_tokens: intentTokens,
73236
+ response_token_hits: hits,
73237
+ response_token_hit_rate: hitRate,
73238
+ html_bytes: html.length
73239
+ }
73240
+ };
73241
+ }
72994
73242
  }
72995
73243
  }
72996
73244
  }
72997
73245
  const { url_template, input_params, path_params, query } = extractHtmlHoles(url);
73246
+ const routing_candidates = buildSearchRouteCandidates(html, url, intent);
72998
73247
  return {
72999
73248
  rejected: false,
73000
73249
  title,
@@ -73008,12 +73257,40 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
73008
73257
  text_excerpt: bodyText.slice(0, MARKDOWN_BUDGET),
73009
73258
  markdown: htmlToMarkdownSafe(html, bodyText),
73010
73259
  tables: extractTables(html),
73260
+ ...routing_candidates.length > 0 ? { routing_candidates } : {},
73011
73261
  extraction: {
73012
73262
  source: "direct-document",
73013
73263
  rejected: false
73014
73264
  }
73015
73265
  };
73016
73266
  }
73267
/**
 * Reduce a natural-language intent to the words worth using as a search term.
 *
 * Words are lower-cased alphanumeric tokens of 3+ characters starting with a
 * letter. Tokens found in QUERY_STOPWORDS or among the dot/dash-separated
 * parts of the URL's hostname are dropped, and duplicates are removed while
 * keeping first-seen order.
 *
 * @param {string} intent - User intent text.
 * @param {string} url - Page URL; only its hostname is used.
 * @returns {string} Remaining tokens joined by single spaces (may be empty).
 */
function intentQueryTerm(intent, url) {
  const hostTokens = new Set();
  try {
    for (const part of new URL(url).hostname.toLowerCase().split(/[.-]/)) {
      hostTokens.add(part);
    }
  } catch {
    // Best effort: an unparseable URL simply contributes no host tokens.
  }
  const kept = new Set();
  const words = intent.toLowerCase().match(/[a-z][a-z0-9]{2,}/g) ?? [];
  for (const word of words) {
    if (QUERY_STOPWORDS.has(word) || hostTokens.has(word)) {
      continue;
    }
    kept.add(word);
  }
  return Array.from(kept).join(" ").trim();
}
73275
/**
 * Propose up to three concrete search URLs for a list-like intent, based on
 * the route templates derived from the page's own links.
 *
 * @param {string} html - Page markup to derive route templates from.
 * @param {string} url - Page URL; supplies the origin for the candidates.
 * @param {string} [intent] - User intent text.
 * @returns {{url: string, template: string, query: string, samples: string[]}[]}
 *   Candidate routes, or an empty array when the intent is missing or not
 *   list-like, no query term remains, or the URL cannot be parsed.
 */
function buildSearchRouteCandidates(html, url, intent) {
  const wantsList = Boolean(intent) && isListLikeIntent2(intent);
  if (!wantsList) {
    return [];
  }
  const queryTerm = intentQueryTerm(intent, url);
  if (!queryTerm) {
    return [];
  }
  let origin;
  try {
    origin = new URL(url).origin;
  } catch {
    // Without a parseable URL there is no origin to build candidates on.
    return [];
  }
  const candidates = [];
  for (const { template, samples } of deriveSearchRouteTemplates(html).slice(0, 3)) {
    candidates.push({
      url: fillSearchRoute(origin, template, queryTerm),
      template,
      query: queryTerm,
      samples
    });
  }
  return candidates;
}
73017
73294
  async function fetchDirectDocument(url) {
73018
73295
  if (!isDirectDocumentEligibleUrl(url))
73019
73296
  return null;
@@ -73201,10 +73478,12 @@ function cellText(html) {
73201
73478
  function decodeHtmlEntityText(input) {
73202
73479
  return input.replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, '"').replace(/&#39;/g, "'");
73203
73480
  }
73204
- var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, buildBloombergDirectDocumentResult;
73481
+ var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, QUERY_STOPWORDS, buildBloombergDirectDocumentResult;
73205
73482
  var init_direct_document = __esm(() => {
73206
73483
  init_curl_impersonate_fallback();
73207
73484
  init_proxy_fetch();
73485
+ init_cardinality2();
73486
+ init_search_forms();
73208
73487
  HTML_RE = /text\/html|application\/xhtml\+xml/i;
73209
73488
  CHALLENGE_RE = /\b(access denied|are you a robot|captcha|just a moment|pardon our interruption|robot check|unusual traffic|verify you are human)\b/i;
73210
73489
  INTERSTITIAL_RE = /\b(please wait for verification|just a moment|cf-mitigated|datadome|akamai bot|perimeterx|sign in to continue|log in to (?:continue|access)|javascript is not available)\b/i;
@@ -73287,6 +73566,7 @@ var init_direct_document = __esm(() => {
73287
73566
  "look"
73288
73567
  ]);
73289
73568
  MARKDOWN_BUDGET = Math.max(1000, Number(process.env.UNBROWSE_MARKDOWN_BUDGET ?? "12000") || 12000);
73569
+ QUERY_STOPWORDS = new Set(("resolve unbrowse execute run walk go fetch open view want need please " + "find search browse list lookup discover show get me a an the on of for in to " + "with and or all my your this that some good best top new latest cheap near").split(" "));
73290
73570
  buildBloombergDirectDocumentResult = buildDirectDocumentResult;
73291
73571
  });
73292
73572
 
@@ -121959,181 +122239,6 @@ function clampToFloor(score, demotion, floor) {
121959
122239
  }
121960
122240
  var HARD_NEGATIVE_FLOOR = -2000, WEAK_NEGATIVE_FLOOR = -400, PAGE_ARTIFACT_DEMOTION = 800, EMPTY_ENTITY_BAG_DEMOTION = 650, EMPTY_ENTITY_BAG_FLOOR = -700;
121961
122241
 
121962
- // .tmp-runtime-src/execution/search-forms.ts
121963
- var exports_search_forms = {};
121964
- __export(exports_search_forms, {
121965
- isStructuredSearchForm: () => isStructuredSearchForm,
121966
- detectSearchForms: () => detectSearchForms
121967
- });
121968
- function isStructuredSearchForm(spec) {
121969
- return spec.fields.length > 0 && !!spec.submit_selector;
121970
- }
121971
- function formSelectorFromElement(attribs, index2) {
121972
- const id = attribs.id;
121973
- if (id)
121974
- return `form#${id}`;
121975
- const name = attribs.name;
121976
- if (name)
121977
- return `form[name="${name}"]`;
121978
- const action2 = attribs.action;
121979
- if (action2)
121980
- return `form[action="${action2}"]`;
121981
- return `form:nth-of-type(${index2 + 1})`;
121982
- }
121983
- function inputSelectorFromElement(attribs, tagName) {
121984
- const id = attribs.id;
121985
- if (id)
121986
- return `#${id}`;
121987
- const name = attribs.name;
121988
- if (name)
121989
- return `${tagName}[name="${name}"]`;
121990
- return tagName;
121991
- }
121992
- function mapInputType(typeAttr, tagName) {
121993
- if (tagName === "select")
121994
- return "select";
121995
- if (tagName === "textarea")
121996
- return "text";
121997
- const t = (typeAttr ?? "text").toLowerCase();
121998
- if (t === "radio")
121999
- return "radio";
122000
- if (t === "checkbox")
122001
- return "checkbox";
122002
- if (t === "date")
122003
- return "date";
122004
- if (t === "hidden")
122005
- return "hidden";
122006
- if (t === "submit" || t === "button" || t === "image" || t === "reset")
122007
- return null;
122008
- if (t === "password" || t === "file")
122009
- return null;
122010
- if (SUPPORTED_INPUT_TYPES.has(t))
122011
- return "text";
122012
- return "text";
122013
- }
122014
- function parseAttrs(attrStr) {
122015
- const attrs = {};
122016
- const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
122017
- let m;
122018
- while ((m = attrRegex.exec(attrStr)) !== null) {
122019
- attrs[m[1]] = m[2] ?? m[3] ?? m[4] ?? "";
122020
- }
122021
- return attrs;
122022
- }
122023
- function detectSearchForms(html3) {
122024
- const results = [];
122025
- const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
122026
- let formMatch;
122027
- let formIndex = 0;
122028
- while ((formMatch = formRegex.exec(html3)) !== null) {
122029
- const formAttrs = formMatch[1];
122030
- const formBody = formMatch[2];
122031
- const formElAttrs = parseAttrs(formAttrs);
122032
- const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
122033
- let fieldMatch;
122034
- const fields = [];
122035
- const seenNames = new Set;
122036
- let hasLoginField = false;
122037
- let hasSearchLikeField = false;
122038
- while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
122039
- const tagName = fieldMatch[1].toLowerCase();
122040
- const fieldAttrs = parseAttrs(fieldMatch[2]);
122041
- const name = fieldAttrs.name ?? "";
122042
- const typeAttr = fieldAttrs.type;
122043
- if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || typeAttr === "password") {
122044
- hasLoginField = true;
122045
- }
122046
- if (SEARCH_FIELD_NAMES.has(name.toLowerCase())) {
122047
- hasSearchLikeField = true;
122048
- }
122049
- const mappedType = mapInputType(typeAttr, tagName);
122050
- if (!mappedType)
122051
- continue;
122052
- if (!name && mappedType !== "text")
122053
- continue;
122054
- if (seenNames.has(name) && mappedType !== "radio")
122055
- continue;
122056
- if (name)
122057
- seenNames.add(name);
122058
- let options;
122059
- if (tagName === "select") {
122060
- const optRegex = /<option[^>]*value="([^"]*)"[^>]*>/gi;
122061
- let optMatch;
122062
- options = [];
122063
- while ((optMatch = optRegex.exec(formBody)) !== null) {
122064
- options.push(optMatch[1]);
122065
- }
122066
- if (options.length === 0)
122067
- options = undefined;
122068
- }
122069
- fields.push({
122070
- name: name || `unnamed_${fields.length}`,
122071
- type: mappedType,
122072
- selector: inputSelectorFromElement(fieldAttrs, tagName),
122073
- ...options ? { options } : {},
122074
- required: fieldAttrs.required !== undefined
122075
- });
122076
- }
122077
- let submitSelector = "";
122078
- if (/<button[^>]*type\s*=\s*"submit"/i.test(formBody)) {
122079
- submitSelector = "button[type=submit]";
122080
- } else if (/<input[^>]*type\s*=\s*"submit"/i.test(formBody)) {
122081
- submitSelector = 'input[type="submit"]';
122082
- } else if (/<button/i.test(formBody)) {
122083
- submitSelector = "button";
122084
- }
122085
- const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
122086
- if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
122087
- const formSelector = formSelectorFromElement(formElAttrs, formIndex);
122088
- results.push({
122089
- form_selector: formSelector,
122090
- submit_selector: submitSelector,
122091
- fields
122092
- });
122093
- }
122094
- formIndex++;
122095
- }
122096
- return results;
122097
- }
122098
- var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
122099
- var init_search_forms = __esm(() => {
122100
- SEARCH_FIELD_NAMES = new Set([
122101
- "q",
122102
- "query",
122103
- "search",
122104
- "keyword",
122105
- "keywords",
122106
- "term",
122107
- "terms",
122108
- "find",
122109
- "lookup",
122110
- "filter",
122111
- "s",
122112
- "text",
122113
- "input"
122114
- ]);
122115
- LOGIN_FIELD_NAMES = new Set([
122116
- "password",
122117
- "passwd",
122118
- "pass",
122119
- "pwd",
122120
- "confirm_password",
122121
- "username",
122122
- "email",
122123
- "login",
122124
- "user"
122125
- ]);
122126
- SUPPORTED_INPUT_TYPES = new Set([
122127
- "text",
122128
- "search",
122129
- "hidden",
122130
- "date",
122131
- "number",
122132
- "tel",
122133
- "email"
122134
- ]);
122135
- });
122136
-
122137
122242
  // .tmp-runtime-src/state/stateless.ts
122138
122243
  function isStateless() {
122139
122244
  const v = process.env.UNBROWSE_STATELESS;
package/runtime/mcp.js CHANGED
@@ -36310,7 +36310,7 @@ var init_cached_resolution = __esm(() => {
36310
36310
  });
36311
36311
 
36312
36312
  // .tmp-runtime-src/build-info.generated.ts
36313
- var BUILD_RELEASE_VERSION = "9.7.0", BUILD_GIT_SHA = "98fa4d4472e2", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS43LjAiLCJnaXRfc2hhIjoiOThmYTRkNDQ3MmUyIiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUA5OGZhNGQ0NDcyZTIiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA0OjU4OjQwLjQ3NFoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "-b2L4xhYhJ-mLJeo39RL19HSAKgjhZ2_D3ezoXjvQLQ", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
36313
+ var BUILD_RELEASE_VERSION = "9.8.0", BUILD_GIT_SHA = "255142bb4c25", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiOS44LjAiLCJnaXRfc2hhIjoiMjU1MTQyYmI0YzI1IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUAyNTUxNDJiYjRjMjUiLCJpc3N1ZWRfYXQiOiIyMDI2LTA2LTE4VDA2OjA4OjA2LjY3MVoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "Tw3ScHlFYGaEtPwKLhcPI_lgQUgjAZmhWKSi4fDFMw4", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai", BUILD_DEFAULT_PROFILE = "";
36314
36314
 
36315
36315
  // .tmp-runtime-src/version.ts
36316
36316
  import { createHash as createHash4 } from "crypto";
@@ -43252,6 +43252,42 @@ function urlPathLooksListLike(contextUrl) {
43252
43252
  return false;
43253
43253
  }
43254
43254
  }
43255
/**
 * Collapse a link target into a coarse "entity pointer" template.
 *
 * Each path segment that looks like an identifier (3+ consecutive digits,
 * longer than 30 characters, 8+ hex/dash characters, or ending in "-NN") is
 * replaced by "{id}"; every other segment is lower-cased and kept verbatim.
 * Only the first three segments are retained in the result, but an id-like
 * segment anywhere in the path is enough to produce a template.
 *
 * @param {string} href - Raw href value (absolute, root-relative or relative).
 * @returns {string|null} A template such as "products/{id}", or null when the
 *   href has no path segments, has no id-like segment, or uses a scheme other
 *   than http/https.
 */
function entityPointerTemplate(href) {
  // tel:, mailto:, javascript:, data: ... are not page links. Without this
  // guard their payload (phone numbers, addresses) is parsed as a "path" and
  // its digits are mistaken for entity ids.
  const scheme = /^\s*([a-z][a-z0-9+.-]*):/i.exec(href);
  if (scheme && !/^https?$/i.test(scheme[1])) {
    return null;
  }
  let path = href;
  try {
    // The placeholder base only exists so relative hrefs can be parsed.
    path = new URL(href, "https://_").pathname;
  } catch {
    // Unparseable href: use the raw text without query string or fragment.
    path = href.split(/[?#]/)[0];
  }
  const segments = path.split("/").filter(Boolean);
  if (segments.length === 0) {
    return null;
  }
  let hasId = false;
  const shape = segments.map((segment) => {
    const low = segment.toLowerCase();
    const looksLikeId =
      /\d{3,}/.test(low) ||
      low.length > 30 ||
      /^[0-9a-f-]{8,}$/.test(low) ||
      /-\d{2,}$/.test(low);
    if (looksLikeId) {
      hasId = true;
      return "{id}";
    }
    return low;
  });
  return hasId ? shape.slice(0, 3).join("/") : null;
}
43278
/**
 * Decide whether a set of links points at a collection of sibling entities,
 * i.e. whether at least `min` distinct hrefs share one entity-pointer template.
 *
 * Hrefs are de-duplicated per template: a single entity that is linked several
 * times (thumbnail, title, "read more") must not satisfy the threshold alone.
 *
 * @param {Iterable<string>} hrefs - Link targets found on the page.
 * @param {number} [min=4] - Distinct links required under one template.
 * @returns {boolean} True as soon as any template reaches `min` distinct hrefs.
 */
function linksFormEntityCollection(hrefs, min = 4) {
  const groups = new Map();
  for (const href of hrefs) {
    const template = entityPointerTemplate(href);
    if (!template) {
      continue;
    }
    let members = groups.get(template);
    if (!members) {
      members = new Set();
      groups.set(template, members);
    }
    members.add(href);
    if (members.size >= min) {
      return true;
    }
  }
  return false;
}
43255
43291
  function cardinalityMatches(intent, subject, opts) {
43256
43292
  const wantsMany = isListLikeIntent(intent) || urlPathLooksListLike(opts?.contextUrl);
43257
43293
  if (!wantsMany)
@@ -54387,6 +54423,215 @@ var init_curl_impersonate_fallback = __esm(() => {
54387
54423
  };
54388
54424
  });
54389
54425
 
54426
+ // .tmp-runtime-src/execution/search-forms.ts
54427
+ var exports_search_forms = {};
54428
+ __export(exports_search_forms, {
54429
+ isStructuredSearchForm: () => isStructuredSearchForm,
54430
+ fillSearchRoute: () => fillSearchRoute,
54431
+ detectSearchForms: () => detectSearchForms,
54432
+ deriveSearchRouteTemplates: () => deriveSearchRouteTemplates
54433
+ });
54434
/**
 * Infer slug-style route templates (e.g. "/topics/{query}") from the
 * root-relative links of a page.
 *
 * For every link with 1-4 path segments, each slug-like segment is replaced in
 * turn by "{query}". A template is reported once at least `minDistinct`
 * different slug values have been seen in that position.
 *
 * @param {string} html - Page markup to scan for href attributes.
 * @param {number} [minDistinct=4] - Distinct slug values a template needs.
 * @returns {{template: string, samples: string[], count: number}[]} Templates
 *   ordered by descending number of distinct values; `samples` holds up to
 *   five lower-cased example values.
 */
function deriveSearchRouteTemplates(html, minDistinct = 4) {
  const hrefs = new Set();
  for (const match of html.matchAll(/href\s*=\s*["'](\/[^"'?#\s]+)["']/gi)) {
    // "//host/path" is protocol-relative and targets another origin; treating
    // it as a local path would yield templates like "/cdn.example.com/{query}".
    if (match[1].startsWith("//")) {
      continue;
    }
    hrefs.add(match[1]);
  }
  const groups = new Map();
  for (const href of hrefs) {
    const segs = href.split("/").filter(Boolean);
    if (segs.length < 1 || segs.length > 4) {
      continue;
    }
    const trailing = href.endsWith("/") ? "/" : "";
    for (let i = 0; i < segs.length; i++) {
      const val = segs[i];
      // A slug starts with a letter, contains only letters/digits/dashes and
      // has no 3+ digit run. This pattern already rejects dots, so file
      // extensions need no separate check.
      if (!/^[a-z][a-z0-9-]{1,40}$/i.test(val) || /\d{3,}/.test(val)) {
        continue;
      }
      const shape = segs.map((s, j) => (j === i ? "{query}" : s)).join("/");
      const key = `/${shape}${trailing}`;
      let values = groups.get(key);
      if (!values) {
        values = new Set();
        groups.set(key, values);
      }
      values.add(val.toLowerCase());
    }
  }
  const out = [];
  for (const [template, vals] of groups) {
    if (vals.size >= minDistinct) {
      out.push({ template, samples: [...vals].slice(0, 5), count: vals.size });
    }
  }
  return out.sort((a, b) => b.count - a.count);
}
54462
/**
 * Build a concrete URL from an origin, a route template and a query.
 *
 * @param {string} origin - Site origin; trailing slashes are removed.
 * @param {string} template - Route template containing "{query}".
 * @param {string} query - Search text; trimmed, lower-cased and URI-encoded.
 * @returns {string} The origin followed by the template with its first
 *   "{query}" placeholder substituted.
 */
function fillSearchRoute(origin, template, query) {
  const base = origin.replace(/\/+$/, "");
  const encoded = encodeURIComponent(query.trim().toLowerCase());
  const route = template.replace("{query}", encoded);
  return base + route;
}
54466
/**
 * Tell whether a detected form spec is usable as a structured search form.
 *
 * @param {{fields: Array, submit_selector?: string}} spec - Form description.
 * @returns {boolean} True when the spec has at least one field and a
 *   non-empty submit selector.
 */
function isStructuredSearchForm(spec) {
  if (!(spec.fields.length > 0)) {
    return false;
  }
  return Boolean(spec.submit_selector);
}
54469
/**
 * Build a CSS selector for a form from its parsed attributes.
 *
 * Preference order: id, name, action, then positional `:nth-of-type`.
 * Attribute values come from untrusted markup, so they are only emitted raw
 * when they are plain CSS identifiers; otherwise a quoted attribute selector
 * with escaped quotes/backslashes is produced so the selector stays valid.
 *
 * @param {Record<string, string>} attribs - Parsed attributes of the form tag.
 * @param {number} index - Zero-based position of the form among matched forms.
 * @returns {string} CSS selector for the form.
 */
function formSelectorFromElement(attribs, index) {
  // Quote a value for use inside [attr="..."].
  const quote = (value) => `"${String(value).replace(/["\\]/g, "\\$&")}"`;
  const { id, name, action } = attribs;
  if (id) {
    // "#id" is only valid for identifier-like ids (no spaces, no leading digit).
    return /^-?[A-Za-z_][\w-]*$/.test(id) ? `form#${id}` : `form[id=${quote(id)}]`;
  }
  if (name) {
    return `form[name=${quote(name)}]`;
  }
  if (action) {
    return `form[action=${quote(action)}]`;
  }
  return `form:nth-of-type(${index + 1})`;
}
54481
/**
 * Build a CSS selector for a form control from its parsed attributes.
 *
 * Preference order: id, name, then the bare tag name. Values that are not
 * plain CSS identifiers (or that contain quotes/backslashes) are emitted as
 * quoted, escaped attribute selectors so the result is always a valid selector.
 *
 * @param {Record<string, string>} attribs - Parsed attributes of the control.
 * @param {string} tagName - Lower-cased tag name (input, select, textarea).
 * @returns {string} CSS selector for the control.
 */
function inputSelectorFromElement(attribs, tagName) {
  // Quote a value for use inside [attr="..."].
  const quote = (value) => `"${String(value).replace(/["\\]/g, "\\$&")}"`;
  const { id, name } = attribs;
  if (id) {
    // "#id" breaks for ids such as "search.box" or "2nd"; fall back to [id="..."].
    return /^-?[A-Za-z_][\w-]*$/.test(id) ? `#${id}` : `[id=${quote(id)}]`;
  }
  if (name) {
    return `${tagName}[name=${quote(name)}]`;
  }
  return tagName;
}
54490
/**
 * Map an HTML control to the field type used in search-form specs.
 *
 * @param {string|undefined} typeAttr - Value of the control's `type` attribute.
 * @param {string} tagName - Lower-cased tag name of the control.
 * @returns {"select"|"text"|"radio"|"checkbox"|"date"|"hidden"|null} The
 *   mapped type, or null for controls that are never fillable search fields
 *   (submit/button/image/reset/password/file).
 */
function mapInputType(typeAttr, tagName) {
  if (tagName === "select") {
    return "select";
  }
  if (tagName === "textarea") {
    return "text";
  }
  const type = (typeAttr ?? "text").toLowerCase();
  switch (type) {
    case "radio":
    case "checkbox":
    case "date":
    case "hidden":
      return type;
    case "submit":
    case "button":
    case "image":
    case "reset":
    case "password":
    case "file":
      return null;
    default:
      // Every remaining type (text, search, number, tel, email, unknown ...)
      // is treated as free text. The former SUPPORTED_INPUT_TYPES lookup
      // returned "text" on both branches, so it was dropped.
      return "text";
  }
}
54512
/**
 * Parse the attribute portion of an HTML start tag into a plain object.
 *
 * Supports double-quoted, single-quoted, unquoted and value-less attributes.
 * Names are lower-cased because HTML attribute names are case-insensitive
 * (`NAME="q"` must be readable as `attrs.name`), and the first occurrence of a
 * duplicated attribute wins, as in HTML parsing.
 *
 * @param {string} attrStr - Text between the tag name and the closing ">".
 * @returns {Record<string, string>} Attribute map; value-less attributes map
 *   to the empty string.
 */
function parseAttrs(attrStr) {
  const attrs = {};
  const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
  for (const match of attrStr.matchAll(attrRegex)) {
    const key = match[1].toLowerCase();
    if (Object.prototype.hasOwnProperty.call(attrs, key)) {
      continue;
    }
    attrs[key] = match[2] ?? match[3] ?? match[4] ?? "";
  }
  return attrs;
}
54521
/**
 * Scan markup for forms that can be filled and submitted as a search.
 *
 * A form is reported when it has a submit control, at least one non-hidden
 * field, and no login-style field (a name in LOGIN_FIELD_NAMES or a password
 * input). The original also tracked SEARCH_FIELD_NAMES, but that flag could
 * never change the outcome (it was OR-ed with a condition that was already
 * required), so it is no longer computed.
 *
 * @param {string} html - Page markup.
 * @returns {{form_selector: string, submit_selector: string, fields: object[]}[]}
 *   One spec per accepted form. Each field has `name`, `type`, `selector`,
 *   `required`, and for selects with valued options, `options`.
 */
function detectSearchForms(html) {
  const results = [];
  const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
  // value="x", value='x' or value=x on an <option> start tag.
  const optionValueRegex = /<option\b[^>]*?\svalue\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/gi;
  // type=submit in any quoting style.
  const submitButtonRegex = /<button\b[^>]*\stype\s*=\s*["']?submit(?=["'\s\/>])/i;
  const submitInputRegex = /<input\b[^>]*\stype\s*=\s*["']?submit(?=["'\s\/>])/i;
  let formMatch;
  let formIndex = 0;
  while ((formMatch = formRegex.exec(html)) !== null) {
    const formElAttrs = parseAttrs(formMatch[1]);
    const formBody = formMatch[2];
    const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
    const fields = [];
    const seenNames = new Set();
    let hasLoginField = false;
    let fieldMatch;
    while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
      const tagName = fieldMatch[1].toLowerCase();
      const fieldAttrs = parseAttrs(fieldMatch[2]);
      const name = fieldAttrs.name ?? "";
      const typeAttr = fieldAttrs.type;
      // Attribute values are case-insensitive here: type="Password" is a login field too.
      if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || (typeAttr ?? "").toLowerCase() === "password") {
        hasLoginField = true;
      }
      const mappedType = mapInputType(typeAttr, tagName);
      if (!mappedType) {
        continue;
      }
      // Unnamed controls are only kept when they are free-text inputs.
      if (!name && mappedType !== "text") {
        continue;
      }
      // Radio groups legitimately repeat a name; anything else is a duplicate.
      if (seenNames.has(name) && mappedType !== "radio") {
        continue;
      }
      if (name) {
        seenNames.add(name);
      }
      let options;
      if (tagName === "select") {
        // Only read options that belong to THIS select: from the end of its
        // start tag up to its closing tag (or the next select if unclosed).
        const bodyStart = fieldRegex.lastIndex;
        const boundary = /<\/select\b|<select\b/gi;
        boundary.lastIndex = bodyStart;
        const end = boundary.exec(formBody);
        const selectBody = formBody.slice(bodyStart, end ? end.index : formBody.length);
        const values = Array.from(selectBody.matchAll(optionValueRegex), (m) => m[1] ?? m[2] ?? m[3]);
        options = values.length > 0 ? values : undefined;
      }
      fields.push({
        name: name || `unnamed_${fields.length}`,
        type: mappedType,
        selector: inputSelectorFromElement(fieldAttrs, tagName),
        ...(options ? { options } : {}),
        required: fieldAttrs.required !== undefined
      });
    }
    let submitSelector = "";
    if (submitButtonRegex.test(formBody)) {
      submitSelector = "button[type=submit]";
    } else if (submitInputRegex.test(formBody)) {
      submitSelector = 'input[type="submit"]';
    } else if (/<button/i.test(formBody)) {
      // A button without an explicit type still submits its form by default.
      submitSelector = "button";
    }
    const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
    if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector) {
      results.push({
        form_selector: formSelectorFromElement(formElAttrs, formIndex),
        submit_selector: submitSelector,
        fields
      });
    }
    formIndex++;
  }
  return results;
}
54596
var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
// Initializer for the search-forms module: fills the three lookup sets above.
// NOTE(review): __esm is defined elsewhere in the bundle; presumably it makes
// this callback run once on first use — confirm against the bundler runtime.
var init_search_forms = __esm(() => {
  // Field names that suggest a free-text search box.
  const searchNames = [
    "q", "query", "search", "keyword", "keywords", "term", "terms",
    "find", "lookup", "filter", "s", "text", "input"
  ];
  // Field names that mark a form as a login/registration form.
  const loginNames = [
    "password", "passwd", "pass", "pwd", "confirm_password",
    "username", "email", "login", "user"
  ];
  // Input `type` values listed as supported.
  const inputTypes = ["text", "search", "hidden", "date", "number", "tel", "email"];
  SEARCH_FIELD_NAMES = new Set(searchNames);
  LOGIN_FIELD_NAMES = new Set(loginNames);
  SUPPORTED_INPUT_TYPES = new Set(inputTypes);
});
54634
+
54390
54635
  // node_modules/.bun/@mixmark-io+domino@2.2.0/node_modules/@mixmark-io/domino/lib/Event.js
54391
54636
  var require_Event = __commonJS((exports, module) => {
54392
54637
  module.exports = Event2;
@@ -71038,20 +71283,24 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
71038
71283
  const hits = intentTokens.filter((tok) => haystack.includes(tok));
71039
71284
  const hitRate = hits.length / intentTokens.length;
71040
71285
  if (hitRate < 0.34) {
71041
- return {
71042
- rejected: true,
71043
- reason: "intent_mismatch",
71044
- evidence: {
71045
- intent_tokens: intentTokens,
71046
- response_token_hits: hits,
71047
- response_token_hit_rate: hitRate,
71048
- html_bytes: html.length
71049
- }
71050
- };
71286
+ const isCollection = isListLikeIntent(intent) && linksFormEntityCollection(Array.from(html.matchAll(/href\s*=\s*["']([^"']+)["']/gi), (m) => m[1]));
71287
+ if (!isCollection) {
71288
+ return {
71289
+ rejected: true,
71290
+ reason: "intent_mismatch",
71291
+ evidence: {
71292
+ intent_tokens: intentTokens,
71293
+ response_token_hits: hits,
71294
+ response_token_hit_rate: hitRate,
71295
+ html_bytes: html.length
71296
+ }
71297
+ };
71298
+ }
71051
71299
  }
71052
71300
  }
71053
71301
  }
71054
71302
  const { url_template, input_params, path_params, query } = extractHtmlHoles(url);
71303
+ const routing_candidates = buildSearchRouteCandidates(html, url, intent);
71055
71304
  return {
71056
71305
  rejected: false,
71057
71306
  title,
@@ -71065,12 +71314,40 @@ function buildDirectDocumentResult(url, html, contentType, intent) {
71065
71314
  text_excerpt: bodyText.slice(0, MARKDOWN_BUDGET),
71066
71315
  markdown: htmlToMarkdownSafe(html, bodyText),
71067
71316
  tables: extractTables(html),
71317
+ ...routing_candidates.length > 0 ? { routing_candidates } : {},
71068
71318
  extraction: {
71069
71319
  source: "direct-document",
71070
71320
  rejected: false
71071
71321
  }
71072
71322
  };
71073
71323
  }
71324
+ function intentQueryTerm(intent, url) {
71325
+ let domTokens = new Set;
71326
+ try {
71327
+ domTokens = new Set(new URL(url).hostname.toLowerCase().split(/[.-]/));
71328
+ } catch {}
71329
+ const toks = (intent.toLowerCase().match(/[a-z][a-z0-9]{2,}/g) ?? []).filter((t) => !QUERY_STOPWORDS.has(t) && !domTokens.has(t));
71330
+ return [...new Set(toks)].join(" ").trim();
71331
+ }
71332
+ function buildSearchRouteCandidates(html, url, intent) {
71333
+ if (!intent || !isListLikeIntent(intent))
71334
+ return [];
71335
+ const queryTerm = intentQueryTerm(intent, url);
71336
+ if (!queryTerm)
71337
+ return [];
71338
+ let origin = "";
71339
+ try {
71340
+ origin = new URL(url).origin;
71341
+ } catch {
71342
+ return [];
71343
+ }
71344
+ return deriveSearchRouteTemplates(html).slice(0, 3).map((t) => ({
71345
+ url: fillSearchRoute(origin, t.template, queryTerm),
71346
+ template: t.template,
71347
+ query: queryTerm,
71348
+ samples: t.samples
71349
+ }));
71350
+ }
71074
71351
  async function fetchDirectDocument(url) {
71075
71352
  if (!isDirectDocumentEligibleUrl(url))
71076
71353
  return null;
@@ -71258,10 +71535,12 @@ function cellText(html) {
71258
71535
  function decodeHtmlEntityText(input) {
71259
71536
  return input.replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&quot;/g, '"').replace(/&#39;/g, "'");
71260
71537
  }
71261
- var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, buildBloombergDirectDocumentResult;
71538
+ var HTML_RE, MIN_DIRECT_DOCUMENT_HTML_BYTES = 5000, CHALLENGE_RE, INTERSTITIAL_RE, MIN_DIRECT_DOCUMENT_BODY_TEXT = 500, SPA_HYDRATION_RE, SPA_HYDRATION_BODY_TEXT_FLOOR = 2000, SPA_ROOT_CONTAINER_RE, PARKED_RE, INTENT_STOPWORDS, MARKDOWN_BUDGET, MAX_TABLES = 10, MAX_TABLE_ROWS = 50, QUERY_STOPWORDS, buildBloombergDirectDocumentResult;
71262
71539
  var init_direct_document = __esm(() => {
71263
71540
  init_curl_impersonate_fallback();
71264
71541
  init_proxy_fetch();
71542
+ init_cardinality();
71543
+ init_search_forms();
71265
71544
  HTML_RE = /text\/html|application\/xhtml\+xml/i;
71266
71545
  CHALLENGE_RE = /\b(access denied|are you a robot|captcha|just a moment|pardon our interruption|robot check|unusual traffic|verify you are human)\b/i;
71267
71546
  INTERSTITIAL_RE = /\b(please wait for verification|just a moment|cf-mitigated|datadome|akamai bot|perimeterx|sign in to continue|log in to (?:continue|access)|javascript is not available)\b/i;
@@ -71344,6 +71623,7 @@ var init_direct_document = __esm(() => {
71344
71623
  "look"
71345
71624
  ]);
71346
71625
  MARKDOWN_BUDGET = Math.max(1000, Number(process.env.UNBROWSE_MARKDOWN_BUDGET ?? "12000") || 12000);
71626
+ QUERY_STOPWORDS = new Set(("resolve unbrowse execute run walk go fetch open view want need please " + "find search browse list lookup discover show get me a an the on of for in to " + "with and or all my your this that some good best top new latest cheap near").split(" "));
71347
71627
  buildBloombergDirectDocumentResult = buildDirectDocumentResult;
71348
71628
  });
71349
71629
 
@@ -120056,181 +120336,6 @@ function clampToFloor(score, demotion, floor) {
120056
120336
  }
120057
120337
  var HARD_NEGATIVE_FLOOR = -2000, WEAK_NEGATIVE_FLOOR = -400, PAGE_ARTIFACT_DEMOTION = 800, EMPTY_ENTITY_BAG_DEMOTION = 650, EMPTY_ENTITY_BAG_FLOOR = -700;
120058
120338
 
120059
- // .tmp-runtime-src/execution/search-forms.ts
120060
- var exports_search_forms = {};
120061
- __export(exports_search_forms, {
120062
- isStructuredSearchForm: () => isStructuredSearchForm,
120063
- detectSearchForms: () => detectSearchForms
120064
- });
120065
- function isStructuredSearchForm(spec) {
120066
- return spec.fields.length > 0 && !!spec.submit_selector;
120067
- }
120068
- function formSelectorFromElement(attribs, index2) {
120069
- const id = attribs.id;
120070
- if (id)
120071
- return `form#${id}`;
120072
- const name = attribs.name;
120073
- if (name)
120074
- return `form[name="${name}"]`;
120075
- const action2 = attribs.action;
120076
- if (action2)
120077
- return `form[action="${action2}"]`;
120078
- return `form:nth-of-type(${index2 + 1})`;
120079
- }
120080
- function inputSelectorFromElement(attribs, tagName) {
120081
- const id = attribs.id;
120082
- if (id)
120083
- return `#${id}`;
120084
- const name = attribs.name;
120085
- if (name)
120086
- return `${tagName}[name="${name}"]`;
120087
- return tagName;
120088
- }
120089
- function mapInputType(typeAttr, tagName) {
120090
- if (tagName === "select")
120091
- return "select";
120092
- if (tagName === "textarea")
120093
- return "text";
120094
- const t = (typeAttr ?? "text").toLowerCase();
120095
- if (t === "radio")
120096
- return "radio";
120097
- if (t === "checkbox")
120098
- return "checkbox";
120099
- if (t === "date")
120100
- return "date";
120101
- if (t === "hidden")
120102
- return "hidden";
120103
- if (t === "submit" || t === "button" || t === "image" || t === "reset")
120104
- return null;
120105
- if (t === "password" || t === "file")
120106
- return null;
120107
- if (SUPPORTED_INPUT_TYPES.has(t))
120108
- return "text";
120109
- return "text";
120110
- }
120111
- function parseAttrs(attrStr) {
120112
- const attrs = {};
120113
- const attrRegex = /(\w[\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
120114
- let m;
120115
- while ((m = attrRegex.exec(attrStr)) !== null) {
120116
- attrs[m[1]] = m[2] ?? m[3] ?? m[4] ?? "";
120117
- }
120118
- return attrs;
120119
- }
120120
- function detectSearchForms(html3) {
120121
- const results = [];
120122
- const formRegex = /<form([^>]*)>([\s\S]*?)<\/form>/gi;
120123
- let formMatch;
120124
- let formIndex = 0;
120125
- while ((formMatch = formRegex.exec(html3)) !== null) {
120126
- const formAttrs = formMatch[1];
120127
- const formBody = formMatch[2];
120128
- const formElAttrs = parseAttrs(formAttrs);
120129
- const fieldRegex = /<(input|select|textarea)([^>]*)\/?>/gi;
120130
- let fieldMatch;
120131
- const fields = [];
120132
- const seenNames = new Set;
120133
- let hasLoginField = false;
120134
- let hasSearchLikeField = false;
120135
- while ((fieldMatch = fieldRegex.exec(formBody)) !== null) {
120136
- const tagName = fieldMatch[1].toLowerCase();
120137
- const fieldAttrs = parseAttrs(fieldMatch[2]);
120138
- const name = fieldAttrs.name ?? "";
120139
- const typeAttr = fieldAttrs.type;
120140
- if (LOGIN_FIELD_NAMES.has(name.toLowerCase()) || typeAttr === "password") {
120141
- hasLoginField = true;
120142
- }
120143
- if (SEARCH_FIELD_NAMES.has(name.toLowerCase())) {
120144
- hasSearchLikeField = true;
120145
- }
120146
- const mappedType = mapInputType(typeAttr, tagName);
120147
- if (!mappedType)
120148
- continue;
120149
- if (!name && mappedType !== "text")
120150
- continue;
120151
- if (seenNames.has(name) && mappedType !== "radio")
120152
- continue;
120153
- if (name)
120154
- seenNames.add(name);
120155
- let options;
120156
- if (tagName === "select") {
120157
- const optRegex = /<option[^>]*value="([^"]*)"[^>]*>/gi;
120158
- let optMatch;
120159
- options = [];
120160
- while ((optMatch = optRegex.exec(formBody)) !== null) {
120161
- options.push(optMatch[1]);
120162
- }
120163
- if (options.length === 0)
120164
- options = undefined;
120165
- }
120166
- fields.push({
120167
- name: name || `unnamed_${fields.length}`,
120168
- type: mappedType,
120169
- selector: inputSelectorFromElement(fieldAttrs, tagName),
120170
- ...options ? { options } : {},
120171
- required: fieldAttrs.required !== undefined
120172
- });
120173
- }
120174
- let submitSelector = "";
120175
- if (/<button[^>]*type\s*=\s*"submit"/i.test(formBody)) {
120176
- submitSelector = "button[type=submit]";
120177
- } else if (/<input[^>]*type\s*=\s*"submit"/i.test(formBody)) {
120178
- submitSelector = 'input[type="submit"]';
120179
- } else if (/<button/i.test(formBody)) {
120180
- submitSelector = "button";
120181
- }
120182
- const nonHiddenFields = fields.filter((f) => f.type !== "hidden");
120183
- if (!hasLoginField && nonHiddenFields.length > 0 && submitSelector && (hasSearchLikeField || nonHiddenFields.length >= 1)) {
120184
- const formSelector = formSelectorFromElement(formElAttrs, formIndex);
120185
- results.push({
120186
- form_selector: formSelector,
120187
- submit_selector: submitSelector,
120188
- fields
120189
- });
120190
- }
120191
- formIndex++;
120192
- }
120193
- return results;
120194
- }
120195
- var SEARCH_FIELD_NAMES, LOGIN_FIELD_NAMES, SUPPORTED_INPUT_TYPES;
120196
- var init_search_forms = __esm(() => {
120197
- SEARCH_FIELD_NAMES = new Set([
120198
- "q",
120199
- "query",
120200
- "search",
120201
- "keyword",
120202
- "keywords",
120203
- "term",
120204
- "terms",
120205
- "find",
120206
- "lookup",
120207
- "filter",
120208
- "s",
120209
- "text",
120210
- "input"
120211
- ]);
120212
- LOGIN_FIELD_NAMES = new Set([
120213
- "password",
120214
- "passwd",
120215
- "pass",
120216
- "pwd",
120217
- "confirm_password",
120218
- "username",
120219
- "email",
120220
- "login",
120221
- "user"
120222
- ]);
120223
- SUPPORTED_INPUT_TYPES = new Set([
120224
- "text",
120225
- "search",
120226
- "hidden",
120227
- "date",
120228
- "number",
120229
- "tel",
120230
- "email"
120231
- ]);
120232
- });
120233
-
120234
120339
  // .tmp-runtime-src/state/stateless.ts
120235
120340
  function isStateless() {
120236
120341
  const v = process.env.UNBROWSE_STATELESS;
Binary file
@@ -2,7 +2,7 @@
2
2
  "repo_url": "https://github.com/justrach/kuri.git",
3
3
  "branch": "adding-extensions",
4
4
  "source_sha": "149881254046a20778f642b69f20f0c6468f6fb4",
5
- "built_at": "2026-06-18T04:40:25.593Z",
5
+ "built_at": "2026-06-18T05:53:02.585Z",
6
6
  "binaries": {
7
7
  "darwin-arm64": {
8
8
  "zig_target": "aarch64-macos",
@@ -21,11 +21,11 @@
21
21
  },
22
22
  "linux-x64": {
23
23
  "zig_target": "x86_64-linux",
24
- "sha256": "b505ed7fd67c24c58d666b3d868bd5d0eb6c44033f0d6c52a5ad3f4abfcbedf7"
24
+ "sha256": "f39955d73d86150fba2a4bec6393e7745feb42f5152870b6c27fd68a5cff3a6e"
25
25
  },
26
26
  "win-x64": {
27
27
  "zig_target": "x86_64-windows-gnu",
28
- "sha256": "5cb9e912772b7a80126ef358721b05a99d28259ffc03e75df2715569ed799b80",
28
+ "sha256": "376a34f508ea6a4e140150f9f6ddc00519f8bb0894ee2dd7a60bc0e7613d89b0",
29
29
  "source": "pre-staged"
30
30
  }
31
31
  },
@@ -33,22 +33,22 @@
33
33
  "darwin-arm64": {
34
34
  "zig_target": "aarch64-macos",
35
35
  "lib": "libkuri_ffi.dylib",
36
- "sha256": "01b68bd41b030c8d70ba6c8e4858ad1b2f578511709bb44affb60266766e089a"
36
+ "sha256": "2ca1be4d477f28c4a4ab1dd993cfe6766b9f6858c42befda3f503a3e2940bf6f"
37
37
  },
38
38
  "darwin-x64": {
39
39
  "zig_target": "x86_64-macos",
40
40
  "lib": "libkuri_ffi.dylib",
41
- "sha256": "bbf6543f8dc9490a1f0e84b877c03499b1836b0462b119b678483b890eadadfa"
41
+ "sha256": "80e27865e521b4bc6a79dfcb4fd481535d70e03226125ce10d9625ba02320f47"
42
42
  },
43
43
  "linux-arm64": {
44
44
  "zig_target": "aarch64-linux",
45
45
  "lib": "libkuri_ffi.so",
46
- "sha256": "3ff8184062706577cbdba34700c853bbabb7de40953fd13d94ab12d4fd470424"
46
+ "sha256": "6fa04fc6b505212e5ae9cfdc0d21eb2f9b2eb97dc41850e02ba3f4112e252e9d"
47
47
  },
48
48
  "linux-x64": {
49
49
  "zig_target": "x86_64-linux",
50
50
  "lib": "libkuri_ffi.so",
51
- "sha256": "b44805644d94e1cd2d8613d5c0a0474e72fb9550e2067b750c967845c483e3e5"
51
+ "sha256": "fa520df6cca6eab9260bd43ed7a60d665c746645f12f6fb4e3ecddbb026bdb05"
52
52
  }
53
53
  }
54
54
  }
Binary file