gologin-web-access 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. package/CHANGELOG.md +19 -0
  2. package/LICENSE +21 -0
  3. package/README.md +344 -0
  4. package/dist/cli.js +173 -0
  5. package/dist/commands/back.js +13 -0
  6. package/dist/commands/batch.js +81 -0
  7. package/dist/commands/batchChangeTrack.js +99 -0
  8. package/dist/commands/batchExtract.js +97 -0
  9. package/dist/commands/batchScrape.js +140 -0
  10. package/dist/commands/changeTrack.js +65 -0
  11. package/dist/commands/check.js +14 -0
  12. package/dist/commands/click.js +14 -0
  13. package/dist/commands/close.js +19 -0
  14. package/dist/commands/configInit.js +77 -0
  15. package/dist/commands/configShow.js +23 -0
  16. package/dist/commands/cookies.js +22 -0
  17. package/dist/commands/cookiesClear.js +13 -0
  18. package/dist/commands/cookiesImport.js +14 -0
  19. package/dist/commands/crawl.js +71 -0
  20. package/dist/commands/crawlErrors.js +20 -0
  21. package/dist/commands/crawlResult.js +27 -0
  22. package/dist/commands/crawlStart.js +56 -0
  23. package/dist/commands/crawlStatus.js +25 -0
  24. package/dist/commands/current.js +14 -0
  25. package/dist/commands/dblclick.js +14 -0
  26. package/dist/commands/eval.js +20 -0
  27. package/dist/commands/extract.js +44 -0
  28. package/dist/commands/fill.js +15 -0
  29. package/dist/commands/find.js +16 -0
  30. package/dist/commands/focus.js +14 -0
  31. package/dist/commands/forward.js +13 -0
  32. package/dist/commands/get.js +15 -0
  33. package/dist/commands/hover.js +14 -0
  34. package/dist/commands/jobs.js +47 -0
  35. package/dist/commands/map.js +61 -0
  36. package/dist/commands/open.js +22 -0
  37. package/dist/commands/parseDocument.js +34 -0
  38. package/dist/commands/pdf.js +14 -0
  39. package/dist/commands/press.js +15 -0
  40. package/dist/commands/read.js +51 -0
  41. package/dist/commands/reload.js +13 -0
  42. package/dist/commands/run.js +76 -0
  43. package/dist/commands/scrape.js +19 -0
  44. package/dist/commands/scrapeJson.js +24 -0
  45. package/dist/commands/scrapeMarkdown.js +37 -0
  46. package/dist/commands/scrapeScreenshot.js +65 -0
  47. package/dist/commands/scrapeText.js +37 -0
  48. package/dist/commands/screenshot.js +23 -0
  49. package/dist/commands/scroll.js +23 -0
  50. package/dist/commands/scrollIntoView.js +14 -0
  51. package/dist/commands/search.js +39 -0
  52. package/dist/commands/searchBrowser.js +28 -0
  53. package/dist/commands/select.js +15 -0
  54. package/dist/commands/sessions.js +14 -0
  55. package/dist/commands/shared.js +102 -0
  56. package/dist/commands/snapshot.js +18 -0
  57. package/dist/commands/storageClear.js +18 -0
  58. package/dist/commands/storageExport.js +26 -0
  59. package/dist/commands/storageImport.js +23 -0
  60. package/dist/commands/tabClose.js +18 -0
  61. package/dist/commands/tabFocus.js +15 -0
  62. package/dist/commands/tabOpen.js +19 -0
  63. package/dist/commands/tabs.js +13 -0
  64. package/dist/commands/type.js +15 -0
  65. package/dist/commands/uncheck.js +14 -0
  66. package/dist/commands/upload.js +15 -0
  67. package/dist/commands/wait.js +27 -0
  68. package/dist/config.js +260 -0
  69. package/dist/doctor.js +86 -0
  70. package/dist/internal-agent/cli.js +336 -0
  71. package/dist/internal-agent/commands/back.js +12 -0
  72. package/dist/internal-agent/commands/check.js +17 -0
  73. package/dist/internal-agent/commands/click.js +17 -0
  74. package/dist/internal-agent/commands/close.js +12 -0
  75. package/dist/internal-agent/commands/cookies.js +23 -0
  76. package/dist/internal-agent/commands/cookiesClear.js +12 -0
  77. package/dist/internal-agent/commands/cookiesImport.js +18 -0
  78. package/dist/internal-agent/commands/current.js +9 -0
  79. package/dist/internal-agent/commands/dblclick.js +17 -0
  80. package/dist/internal-agent/commands/doctor.js +53 -0
  81. package/dist/internal-agent/commands/eval.js +30 -0
  82. package/dist/internal-agent/commands/fill.js +18 -0
  83. package/dist/internal-agent/commands/find.js +86 -0
  84. package/dist/internal-agent/commands/focus.js +17 -0
  85. package/dist/internal-agent/commands/forward.js +12 -0
  86. package/dist/internal-agent/commands/get.js +19 -0
  87. package/dist/internal-agent/commands/hover.js +17 -0
  88. package/dist/internal-agent/commands/open.js +67 -0
  89. package/dist/internal-agent/commands/pdf.js +18 -0
  90. package/dist/internal-agent/commands/press.js +19 -0
  91. package/dist/internal-agent/commands/reload.js +12 -0
  92. package/dist/internal-agent/commands/screenshot.js +22 -0
  93. package/dist/internal-agent/commands/scroll.js +25 -0
  94. package/dist/internal-agent/commands/scrollIntoView.js +17 -0
  95. package/dist/internal-agent/commands/select.js +18 -0
  96. package/dist/internal-agent/commands/sessions.js +15 -0
  97. package/dist/internal-agent/commands/shared.js +51 -0
  98. package/dist/internal-agent/commands/snapshot.js +16 -0
  99. package/dist/internal-agent/commands/storageClear.js +13 -0
  100. package/dist/internal-agent/commands/storageExport.js +24 -0
  101. package/dist/internal-agent/commands/storageImport.js +20 -0
  102. package/dist/internal-agent/commands/tabClose.js +21 -0
  103. package/dist/internal-agent/commands/tabFocus.js +21 -0
  104. package/dist/internal-agent/commands/tabOpen.js +13 -0
  105. package/dist/internal-agent/commands/tabs.js +17 -0
  106. package/dist/internal-agent/commands/type.js +18 -0
  107. package/dist/internal-agent/commands/uncheck.js +17 -0
  108. package/dist/internal-agent/commands/upload.js +18 -0
  109. package/dist/internal-agent/commands/wait.js +41 -0
  110. package/dist/internal-agent/daemon/browser.js +818 -0
  111. package/dist/internal-agent/daemon/refStore.js +26 -0
  112. package/dist/internal-agent/daemon/server.js +330 -0
  113. package/dist/internal-agent/daemon/sessionManager.js +684 -0
  114. package/dist/internal-agent/daemon/snapshot.js +285 -0
  115. package/dist/internal-agent/lib/config.js +59 -0
  116. package/dist/internal-agent/lib/daemon.js +300 -0
  117. package/dist/internal-agent/lib/errors.js +63 -0
  118. package/dist/internal-agent/lib/types.js +2 -0
  119. package/dist/internal-agent/lib/utils.js +165 -0
  120. package/dist/jobRunner.js +56 -0
  121. package/dist/lib/agentCli.js +158 -0
  122. package/dist/lib/browserRead.js +125 -0
  123. package/dist/lib/browserStructured.js +77 -0
  124. package/dist/lib/changeTracking.js +117 -0
  125. package/dist/lib/cloudApi.js +41 -0
  126. package/dist/lib/concurrency.js +15 -0
  127. package/dist/lib/crawl.js +313 -0
  128. package/dist/lib/document.js +170 -0
  129. package/dist/lib/errors.js +55 -0
  130. package/dist/lib/extract.js +65 -0
  131. package/dist/lib/extractRunner.js +22 -0
  132. package/dist/lib/jobRegistry.js +164 -0
  133. package/dist/lib/output.js +122 -0
  134. package/dist/lib/readSource.js +297 -0
  135. package/dist/lib/runbooks.js +193 -0
  136. package/dist/lib/search.js +727 -0
  137. package/dist/lib/selfCli.js +136 -0
  138. package/dist/lib/structuredScrape.js +83 -0
  139. package/dist/lib/types.js +2 -0
  140. package/dist/lib/unlocker.js +383 -0
  141. package/package.json +67 -0
@@ -0,0 +1,727 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __importDefault = (this && this.__importDefault) || function (mod) {
36
+ return (mod && mod.__esModule) ? mod : { "default": mod };
37
+ };
38
+ Object.defineProperty(exports, "__esModule", { value: true });
39
+ exports.searchWeb = searchWeb;
40
+ exports.buildSearchAttemptPlan = buildSearchAttemptPlan;
41
+ exports.buildSearchUrl = buildSearchUrl;
42
+ exports.parseGoogleSearchResults = parseGoogleSearchResults;
43
+ exports.parseBingSearchResults = parseBingSearchResults;
44
+ exports.parseDuckDuckGoSearchResults = parseDuckDuckGoSearchResults;
45
+ exports.classifySearchPage = classifySearchPage;
46
+ const crypto_1 = require("crypto");
47
+ const fs_1 = require("fs");
48
+ const path_1 = __importDefault(require("path"));
49
+ const cheerio = __importStar(require("cheerio"));
50
+ const config_1 = require("../config");
51
+ const errors_1 = require("./errors");
52
+ const agentCli_1 = require("./agentCli");
53
+ const unlocker_1 = require("./unlocker");
54
+ const SEARCH_CACHE_VERSION = 1;
55
+ const SEARCH_CACHE_TTL_MS = 10 * 60 * 1000;
56
/**
 * Searches the web for `query`, returning a cached envelope when a fresh one
 * exists and otherwise walking the attempt plan in order.
 *
 * The first attempt that parses at least one result wins and is cached. If
 * attempts complete but none parses a result, the first such empty attempt is
 * returned (and cached) together with warnings. If no attempt completes, a
 * CliError is thrown whose detail summarizes every attempt.
 *
 * @param {string} query - Search terms.
 * @param {object} config - `cloudToken` is read here; the object is also
 *   forwarded to the cache helpers and the executors.
 * @param {object} options - `source` and `limit` are read here; the object is
 *   also forwarded to URL building, the cache helpers and the executors.
 * @returns {Promise<object>} Result envelope with engine, source, results,
 *   attempts, warnings and cache metadata.
 * @throws {CliError} When no search path produced a usable response.
 */
async function searchWeb(query, config, options) {
    const cached = await readSearchCache(config, query, options);
    if (cached) {
        return cached;
    }
    const attempts = [];
    let lastError;
    let emptyCandidate;
    for (const planItem of buildSearchAttemptPlan(options.source, Boolean(config.cloudToken))) {
        // Computed up front so a failed attempt can still report the URL it targeted.
        const searchUrl = buildSearchUrl(planItem.engine, query, options);
        try {
            const executor = planItem.source === "unlocker" ? searchViaUnlocker : searchViaBrowser;
            const result = await executor(query, config, options, planItem.engine);
            const attempt = {
                engine: planItem.engine,
                source: planItem.source,
                url: result.url,
                ok: result.results.length > 0,
                resultCount: result.results.length,
            };
            if (result.results.length === 0) {
                attempt.warning = `No results parsed from ${planItem.engine} ${planItem.source} response`;
            }
            attempts.push(attempt);
            if (result.results.length > 0) {
                const warnings = buildSearchWarnings(options.limit, result.results.length);
                const envelope = {
                    engine: planItem.engine,
                    source: planItem.source,
                    query,
                    url: result.url,
                    requestedLimit: options.limit,
                    resultCount: result.results.length,
                    returnedCount: result.results.length,
                    results: result.results,
                    attempts,
                    warnings,
                    cacheHit: false,
                    cacheTtlMs: SEARCH_CACHE_TTL_MS,
                };
                await writeSearchCache(config, query, options, envelope);
                return envelope;
            }
            // Remember only the first empty-but-valid response as the fallback answer.
            emptyCandidate ??= {
                engine: planItem.engine,
                source: planItem.source,
                url: result.url,
                results: result.results,
                warning: attempt.warning,
            };
        }
        catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            attempts.push({
                engine: planItem.engine,
                source: planItem.source,
                url: searchUrl,
                ok: false,
                resultCount: 0,
                error: message,
            });
            lastError = error instanceof Error ? error : new Error(message);
        }
    }
    if (emptyCandidate) {
        const warnings = [
            ...(emptyCandidate.warning ? [emptyCandidate.warning] : []),
            ...buildSearchWarnings(options.limit, 0),
        ];
        const envelope = {
            engine: emptyCandidate.engine,
            source: emptyCandidate.source,
            query,
            url: emptyCandidate.url,
            requestedLimit: options.limit,
            resultCount: 0,
            returnedCount: 0,
            results: [],
            attempts,
            warnings,
            cacheHit: false,
            cacheTtlMs: SEARCH_CACHE_TTL_MS,
        };
        await writeSearchCache(config, query, options, envelope);
        return envelope;
    }
    if (attempts.length === 0) {
        // The plan was empty (browser-only search without a cloud token), so
        // there is no per-attempt detail; say why nothing was tried.
        throw new errors_1.CliError("Search failed across all available search paths.", 1, `No search attempts could be planned for source "${options.source}" (browser search requires GOLOGIN_CLOUD_TOKEN).`);
    }
    const detail = attempts
        .map((attempt) => `${attempt.source}:${attempt.engine}=${attempt.ok ? `ok(${attempt.resultCount})` : `error(${attempt.error})`}`)
        .join(", ");
    throw new errors_1.CliError("Search failed across all available search paths.", 1, detail || lastError?.message);
}
147
/**
 * Lists the engine/source pairs to try, in order, for the requested source.
 *
 * - "browser": a single Bing browser attempt, or nothing without a cloud token.
 * - "unlocker": Google, DuckDuckGo, then Bing through the unlocker.
 * - anything else: the unlocker trio, plus a Bing browser attempt when a
 *   cloud token is available.
 *
 * @param {string} source - Requested search source.
 * @param {boolean} hasCloudToken - Whether browser search is possible.
 * @returns {Array<{engine: string, source: string}>} Ordered attempt plan.
 */
function buildSearchAttemptPlan(source, hasCloudToken) {
    const browserAttempt = { engine: "bing", source: "browser" };
    if (source === "browser") {
        return hasCloudToken ? [browserAttempt] : [];
    }
    const plan = ["google", "duckduckgo", "bing"].map((engine) => ({ engine, source: "unlocker" }));
    if (source !== "unlocker" && hasCloudToken) {
        plan.push(browserAttempt);
    }
    return plan;
}
168
/**
 * Builds the results-page URL for the given engine.
 *
 * "bing" and "duckduckgo" get their own URL shapes; any other engine value
 * produces a Google URL. The limit is clamped to at least 1 and at most 50
 * (Bing) or 100 (Google). Country defaults to "us" and language to "en".
 *
 * @param {string} engine - "bing", "duckduckgo", or anything else for Google.
 * @param {string} query - Search terms.
 * @param {object} options - Reads `limit`, `country` and `language`.
 * @returns {string} Absolute search URL.
 */
function buildSearchUrl(engine, query, options) {
    const clampedLimit = (ceiling) => String(Math.min(Math.max(options.limit, 1), ceiling));
    const lowerCountry = () => (options.country || "us").toLowerCase();
    switch (engine) {
        case "bing": {
            const target = new URL("https://www.bing.com/search");
            const params = target.searchParams;
            params.set("q", query);
            params.set("count", clampedLimit(50));
            params.set("cc", lowerCountry());
            const locale = normalizeBingLocale(options.country, options.language);
            params.set("setlang", locale);
            params.set("mkt", locale);
            return target.toString();
        }
        case "duckduckgo": {
            const target = new URL("https://html.duckduckgo.com/html/");
            target.searchParams.set("q", query);
            target.searchParams.set("kl", normalizeDuckDuckGoLocale(options.country, options.language));
            return target.toString();
        }
        default: {
            const target = new URL("https://www.google.com/search");
            const params = target.searchParams;
            params.set("q", query);
            params.set("num", clampedLimit(100));
            params.set("hl", options.language || "en");
            params.set("gl", lowerCountry());
            return target.toString();
        }
    }
}
192
/**
 * Extracts organic results from raw Google HTML by scanning for
 * `/url?q=` redirect anchors.
 *
 * Anchors are skipped when the target cannot be decoded, was already seen,
 * is rejected by `isUsefulSearchResult`, or has a cleaned title shorter than
 * three characters. Scanning stops once `limit` results are collected.
 *
 * @param {string} html - Raw results-page HTML.
 * @param {number} limit - Maximum number of results to collect.
 * @returns {Array<object>} Results with position, title, url, snippet and host.
 */
function parseGoogleSearchResults(html, limit) {
    const redirectAnchor = /<a\b[^>]*href="\/url\?q=([^"&]+)[^"]*"[^>]*>([\s\S]*?)<\/a>/gi;
    const collected = [];
    const visited = new Set();
    for (const hit of html.matchAll(redirectAnchor)) {
        const [, encodedTarget, anchorHtml] = hit;
        const target = decodeGoogleTarget(encodedTarget);
        const isCandidate = Boolean(target) && !visited.has(target) && isUsefulSearchResult(target);
        if (!isCandidate) {
            continue;
        }
        const title = cleanInlineHtml(anchorHtml);
        if (!title || title.length < 3) {
            continue;
        }
        // The snippet is searched for in the HTML that follows the anchor.
        const snippet = extractGoogleSnippet(html, hit.index ?? 0) || undefined;
        collected.push({
            position: collected.length + 1,
            title,
            url: target,
            snippet,
            host: getHost(target),
        });
        visited.add(target);
        if (collected.length >= limit) {
            break;
        }
    }
    return collected;
}
219
/**
 * Extracts organic results from Bing HTML (`li.b_algo` entries).
 *
 * Entries without a usable link or title, and duplicate URLs, are skipped.
 * Iteration stops as soon as `limit` results have been collected.
 *
 * @param {string} html - Raw results-page HTML.
 * @param {number} limit - Maximum number of results to collect.
 * @returns {Array<object>} Results with position, title, url, snippet and host.
 */
function parseBingSearchResults(html, limit) {
    const $ = cheerio.load(html);
    const squash = (text) => text.replace(/\s+/g, " ").trim();
    const collected = [];
    const visited = new Set();
    $("li.b_algo").each((_index, node) => {
        if (collected.length >= limit) {
            // Returning false stops the cheerio iteration.
            return false;
        }
        const entry = $(node);
        const link = entry.find("h2 a").first();
        const target = normalizeAbsoluteUrl(link.attr("href"));
        const title = squash(link.text());
        if (target && title && !visited.has(target)) {
            const snippet = squash(entry.find(".b_caption p, p").first().text());
            collected.push({
                position: collected.length + 1,
                title,
                url: target,
                snippet: snippet || undefined,
                host: getHost(target),
            });
            visited.add(target);
        }
        return undefined;
    });
    return collected;
}
250
/**
 * Extracts results from DuckDuckGo's HTML endpoint (`a.result__a` anchors).
 *
 * Anchors without a usable link or title, and duplicate URLs, are skipped.
 * The snippet comes from `.result__snippet` inside the anchor's closest
 * `.result` container. Iteration stops once `limit` results are collected.
 *
 * @param {string} html - Raw results-page HTML.
 * @param {number} limit - Maximum number of results to collect.
 * @returns {Array<object>} Results with position, title, url, snippet and host.
 */
function parseDuckDuckGoSearchResults(html, limit) {
    const $ = cheerio.load(html);
    const squash = (text) => text.replace(/\s+/g, " ").trim();
    const collected = [];
    const visited = new Set();
    $("a.result__a").each((_index, node) => {
        if (collected.length >= limit) {
            // Returning false stops the cheerio iteration.
            return false;
        }
        const link = $(node);
        const target = normalizeAbsoluteUrl(link.attr("href"));
        const title = squash(link.text());
        if (target && title && !visited.has(target)) {
            const snippet = squash(link.closest(".result").find(".result__snippet").first().text());
            collected.push({
                position: collected.length + 1,
                title,
                url: target,
                snippet: snippet || undefined,
                host: getHost(target),
            });
            visited.add(target);
        }
        return undefined;
    });
    return collected;
}
282
/**
 * Classifies a fetched search page as "valid", "blocked", "empty" or "invalid".
 *
 * Parsed results take precedence over block markers: the markers are plain
 * substrings ("captcha", "anomaly", "about this page", ...) that also occur
 * on legitimate result pages, for instance when the query itself contains
 * one of them, so a page that yielded results must not be reported as blocked.
 *
 * @param {string} engine - Engine whose markers should be applied.
 * @param {string} html - Raw page HTML.
 * @param {Array<object>} results - Results already parsed from `html`.
 * @returns {"valid"|"blocked"|"empty"|"invalid"} Page classification.
 */
function classifySearchPage(engine, html, results) {
    if (results.length > 0) {
        return "valid";
    }
    const lower = html.toLowerCase();
    if (matchesBlockedSearchPage(engine, lower)) {
        return "blocked";
    }
    if (matchesEmptySearchPage(engine, lower) || matchesValidSearchShell(engine, lower)) {
        return "empty";
    }
    return "invalid";
}
295
/**
 * Runs one search through the web unlocker: fetches the rendered results
 * page, parses it with the engine-specific parser, and rejects pages that
 * classify as blocked or invalid.
 *
 * @param {string} query - Search terms.
 * @param {object} config - Reads `webUnlockerApiKey`.
 * @param {object} options - Reads `limit`; also forwarded to `buildSearchUrl`.
 * @param {string} engine - "google", "bing", or anything else for DuckDuckGo parsing.
 * @returns {Promise<{url: string, results: Array<object>}>} Search URL and parsed results.
 * @throws {CliError} When the API key is missing or the page is blocked or invalid.
 */
async function searchViaUnlocker(query, config, options, engine) {
    const apiKey = config.webUnlockerApiKey;
    if (!apiKey) {
        throw new errors_1.CliError("Missing GOLOGIN_WEB_UNLOCKER_API_KEY for unlocker search.");
    }
    const searchUrl = buildSearchUrl(engine, query, options);
    const { content } = await (0, unlocker_1.scrapeRenderedHtml)(searchUrl, apiKey);
    let results;
    switch (engine) {
        case "google":
            results = parseGoogleSearchResults(content, options.limit);
            break;
        case "bing":
            results = parseBingSearchResults(content, options.limit);
            break;
        default:
            results = parseDuckDuckGoSearchResults(content, options.limit);
            break;
    }
    switch (classifySearchPage(engine, content, results)) {
        case "blocked":
            throw new errors_1.CliError(`Unlocker search was blocked on ${engine}.`, 1);
        case "invalid":
            throw new errors_1.CliError(`Unlocker search did not return a valid ${engine} search results page.`, 1);
        default:
            return { url: searchUrl, results };
    }
}
318
/**
 * Runs one search through a browser session driven by the agent CLI: opens
 * the results page in a throwaway session, evaluates an in-page extraction
 * expression, and always closes the session afterwards.
 *
 * Returned results carry a 1-based `position`, matching the shape produced
 * by the HTML parsers in this file.
 *
 * @param {string} query - Search terms.
 * @param {object} config - Reads `cloudToken`; also passed to the agent CLI
 *   and to `resolveProfileId`.
 * @param {object} options - Reads `limit`; also forwarded to `buildSearchUrl`.
 * @param {string} engine - "bing" uses the Bing extractor; anything else the Google one.
 * @returns {Promise<{url: string, results: Array<object>}>} Search URL and extracted results.
 * @throws {CliError} When the token is missing, an agent command fails, the
 *   eval output is not a JSON object, or the page reports itself as blocked.
 */
async function searchViaBrowser(query, config, options, engine) {
    if (!config.cloudToken) {
        throw new errors_1.CliError("Missing GOLOGIN_CLOUD_TOKEN for browser search fallback.");
    }
    const sessionId = `search-${(0, crypto_1.randomUUID)()}`;
    const searchUrl = buildSearchUrl(engine, query, options);
    const openArgs = ["open", searchUrl, "--session", sessionId];
    const profileId = (0, config_1.resolveProfileId)(config);
    if (profileId) {
        openArgs.push("--profile", profileId);
    }
    const open = await (0, agentCli_1.runAgentCommandCapture)(openArgs, config);
    ensureAgentCommandOk("open", open, searchUrl);
    try {
        const evalExpression = engine === "bing"
            ? buildBingBrowserExtractionExpression(options.limit)
            : buildGoogleBrowserExtractionExpression(options.limit);
        const evaluated = await (0, agentCli_1.runAgentCommandCapture)(["eval", evalExpression, "--json", "--session", sessionId], config);
        ensureAgentCommandOk("eval", evaluated, searchUrl);
        let payload;
        try {
            payload = JSON.parse(evaluated.stdout.trim());
        }
        catch (error) {
            // Report unparseable output as a search failure instead of a raw SyntaxError.
            throw new errors_1.CliError("Browser search eval returned malformed JSON.", 1, error instanceof Error ? error.message : String(error));
        }
        if (payload === null || typeof payload !== "object") {
            throw new errors_1.CliError("Browser search eval returned an unexpected payload.", 1);
        }
        if (payload.blocked) {
            throw new errors_1.CliError(`Browser search was blocked on ${engine}.`, 1, payload.title ? `Blocked page title: ${payload.title}` : undefined);
        }
        const results = [];
        for (const item of Array.isArray(payload.results) ? payload.results : []) {
            if (!item) {
                continue;
            }
            // Normalize once; keep the raw URL when it cannot be normalized.
            const normalizedUrl = normalizeAbsoluteUrl(item.url);
            const url = normalizedUrl ?? item.url;
            if (!url) {
                continue;
            }
            results.push({
                ...item,
                position: results.length + 1,
                url,
                host: normalizedUrl ? getHost(normalizedUrl) : item.host,
            });
        }
        return {
            url: searchUrl,
            results,
        };
    }
    finally {
        // Best-effort cleanup: a failed close must not mask the search outcome.
        await (0, agentCli_1.runAgentCommandCapture)(["close", "--session", sessionId], config).catch(() => undefined);
    }
}
361
/**
 * Throws when an agent CLI command finished with a non-zero exit code.
 *
 * The error detail is the trimmed stderr, else the trimmed stdout, else a
 * generic message naming the URL being handled.
 *
 * @param {string} step - Command name used in the error message.
 * @param {{exitCode: number, stderr: string, stdout: string}} response - Captured command result.
 * @param {string} url - URL being processed, used for the fallback detail.
 * @throws {CliError} When `response.exitCode` is not 0.
 */
function ensureAgentCommandOk(step, response, url) {
    if (response.exitCode !== 0) {
        const detail = response.stderr.trim() || response.stdout.trim() || `Command failed while handling ${url}`;
        throw new errors_1.CliError(`Browser search ${step} failed.`, 1, detail);
    }
}
368
/**
 * Builds the JavaScript source that is evaluated inside a Bing results page.
 *
 * The generated expression collects title, url, snippet and host from each
 * `li.b_algo` entry, stopping after `Math.max(limit, 1)` items, and flags the
 * page as blocked when its visible text contains a known challenge phrase.
 * It evaluates to `{ blocked, title, results }`.
 *
 * @param {number} limit - Maximum number of items the expression collects (minimum 1).
 * @returns {string} Self-invoking arrow-function source.
 */
function buildBingBrowserExtractionExpression(limit) {
    // The template text is runtime content; only the item cap is interpolated.
    const itemCap = Math.max(limit, 1);
    return `(() => {
const items = [];
const seen = new Set();
const nodes = Array.from(document.querySelectorAll("li.b_algo"));
for (const node of nodes) {
const link = node.querySelector("h2 a");
const href = link?.href;
const title = link?.textContent?.replace(/\\s+/g, " ").trim();
if (!href || !title || seen.has(href)) continue;
let host;
try { host = new URL(href).hostname; } catch {}
const snippet = node.querySelector(".b_caption p, p")?.textContent?.replace(/\\s+/g, " ").trim() || undefined;
items.push({ title, url: href, snippet, host });
seen.add(href);
if (items.length >= ${itemCap}) break;
}
const body = document.body.innerText || "";
const blocked =
body.includes("One last step") ||
body.includes("Please solve the challenge below to continue") ||
body.includes("Enter the characters you see below");
return {
blocked,
title: document.title,
results: items
};
})()`;
}
397
/**
 * Builds the JavaScript source that is evaluated inside a Google results page.
 *
 * The generated expression flags the page as blocked when the path starts
 * with "/sorry/" or the visible text contains "About this page", then
 * collects title, url and host from `/url?q=` anchors, stopping after
 * `Math.max(limit, 1)` items. It evaluates to `{ blocked, title, results }`.
 *
 * @param {number} limit - Maximum number of items the expression collects (minimum 1).
 * @returns {string} Self-invoking arrow-function source.
 */
function buildGoogleBrowserExtractionExpression(limit) {
    // The template text is runtime content; only the item cap is interpolated.
    const itemCap = Math.max(limit, 1);
    return `(() => {
const blocked = location.pathname.startsWith("/sorry/") || document.body.innerText.includes("About this page");
const items = [];
const seen = new Set();
const nodes = Array.from(document.querySelectorAll('a[href^="/url?q="]'));
for (const node of nodes) {
const raw = node.getAttribute("href") || "";
const match = raw.match(/\\/url\\?q=([^&]+)/);
if (!match) continue;
let href;
try { href = decodeURIComponent(match[1]); } catch { continue; }
const title = node.textContent?.replace(/\\s+/g, " ").trim();
if (!href || !title || seen.has(href)) continue;
let host;
try { host = new URL(href).hostname; } catch {}
items.push({ title, url: href, host });
seen.add(href);
if (items.length >= ${itemCap}) break;
}
return {
blocked,
title: document.title,
results: items
};
})()`;
}
424
/**
 * Decodes the percent-encoded target captured from a Google `/url?q=` link.
 *
 * @param {string} value - Percent-encoded target URL.
 * @returns {string|null} The http(s) URL without its fragment, or null when
 *   the value cannot be decoded, is not a valid URL, or uses another scheme.
 */
function decodeGoogleTarget(value) {
    let target;
    try {
        target = new URL(decodeURIComponent(value));
    }
    catch {
        return null;
    }
    if (!["http:", "https:"].includes(target.protocol)) {
        return null;
    }
    target.hash = "";
    return target.toString();
}
438
/**
 * Decides whether a decoded result URL should be kept.
 *
 * @param {string} url - Candidate result URL.
 * @returns {boolean} False for unparseable URLs and for google.com or any of
 *   its subdomains; true otherwise.
 */
function isUsefulSearchResult(url) {
    let hostname;
    try {
        hostname = new URL(url).hostname.toLowerCase();
    }
    catch {
        return false;
    }
    const isGoogleOwned = hostname === "google.com" || hostname.endsWith(".google.com");
    return !isGoogleOwned;
}
451
/**
 * Reports whether lower-cased page HTML contains any block or challenge
 * marker for the engine. Engines other than "google" and "bing" use the
 * DuckDuckGo marker list.
 *
 * @param {string} engine - Search engine name.
 * @param {string} lowerHtml - Page HTML, already lower-cased by the caller.
 * @returns {boolean} True when at least one marker substring is present.
 */
function matchesBlockedSearchPage(engine, lowerHtml) {
    let markers;
    switch (engine) {
        case "google":
            markers = [
                "our systems have detected unusual traffic",
                "about this page",
                "/sorry/",
                "to continue, please type the characters below",
                "captcha",
            ];
            break;
        case "bing":
            markers = [
                "one last step",
                "please solve the challenge below to continue",
                "enter the characters you see below",
                "verify you are human",
                "captcha",
            ];
            break;
        default:
            markers = ["anomaly", "captcha", "automated requests", "unusual traffic"];
            break;
    }
    for (const marker of markers) {
        if (lowerHtml.includes(marker)) {
            return true;
        }
    }
    return false;
}
471
/**
 * Reports whether lower-cased page HTML contains one of the engine's
 * "no results" phrases. Engines other than "google" and "bing" use the
 * DuckDuckGo phrases.
 *
 * @param {string} engine - Search engine name.
 * @param {string} lowerHtml - Page HTML, already lower-cased by the caller.
 * @returns {boolean} True when at least one phrase is present.
 */
function matchesEmptySearchPage(engine, lowerHtml) {
    let phrases;
    switch (engine) {
        case "google":
            phrases = ["did not match any documents", "no results found for"];
            break;
        case "bing":
            phrases = ["there are no results for", "no results for"];
            break;
        default:
            phrases = ["no results.", "no more results."];
            break;
    }
    return phrases.findIndex((phrase) => lowerHtml.includes(phrase)) !== -1;
}
479
/**
 * Reports whether lower-cased page HTML contains a marker of the engine's
 * results-page shell, such as the search box. Engines other than "google"
 * and "bing" use the DuckDuckGo markers.
 *
 * @param {string} engine - Search engine name.
 * @param {string} lowerHtml - Page HTML, already lower-cased by the caller.
 * @returns {boolean} True when at least one marker is present.
 */
function matchesValidSearchShell(engine, lowerHtml) {
    let shellMarkers;
    switch (engine) {
        case "google":
            shellMarkers = ['name="q"', 'href="/search?', 'google search'];
            break;
        case "bing":
            shellMarkers = ['id="b_results"', 'name="q"', 'class="b_algo"'];
            break;
        default:
            shellMarkers = ['class="result__a"', 'name="q"', 'duckduckgo'];
            break;
    }
    return shellMarkers.findIndex((marker) => lowerHtml.includes(marker)) !== -1;
}
487
/**
 * Looks for a result snippet in the 4000 characters of HTML starting at
 * `startIndex`, trying a `div` whose class contains "VwiC3b" first and then
 * a `span` whose class contains "aCOpRe".
 *
 * @param {string} html - Full results-page HTML.
 * @param {number} startIndex - Offset at which to start looking.
 * @returns {string} Cleaned snippet text, or "" when neither pattern matches.
 */
function extractGoogleSnippet(html, startIndex) {
    const vicinity = html.slice(startIndex, startIndex + 4000);
    const snippetPatterns = [
        /<div\b[^>]*class="[^"]*VwiC3b[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
        /<span\b[^>]*class="[^"]*aCOpRe[^"]*"[^>]*>([\s\S]*?)<\/span>/i,
    ];
    for (const pattern of snippetPatterns) {
        const found = vicinity.match(pattern);
        if (found) {
            return cleanInlineHtml(found[1]);
        }
    }
    return "";
}
496
/**
 * Turns an HTML fragment into plain text: tags become spaces, whitespace
 * runs collapse to one space, entities are decoded, and the result is trimmed.
 *
 * @param {string} value - HTML fragment.
 * @returns {string} Plain-text content.
 */
function cleanInlineHtml(value) {
    const withoutTags = value.replace(/<[^>]+>/g, " ");
    const collapsed = withoutTags.replace(/\s+/g, " ");
    return decodeHtmlEntities(collapsed).trim();
}
499
/**
 * Decodes the HTML entities this module cares about: `&amp;`, `&lt;`,
 * `&gt;`, `&quot;`, `&nbsp;` (to a plain space), and decimal or hexadecimal
 * numeric references.
 *
 * Decoding happens in a single pass so already-decoded text is never decoded
 * again (`&amp;lt;` yields the literal text `&lt;`, not `<`). Numeric
 * references are resolved with `String.fromCodePoint`, so characters outside
 * the Basic Multilingual Plane survive. References that do not name a valid
 * code point are left untouched.
 *
 * @param {string} value - Text that may contain HTML entities.
 * @returns {string} Text with the supported entities decoded.
 */
function decodeHtmlEntities(value) {
    const namedEntities = {
        amp: "&",
        lt: "<",
        gt: ">",
        quot: "\"",
        nbsp: " ",
    };
    return value.replace(/&(?:(amp|lt|gt|quot|nbsp)|#(\d+)|#[xX]([0-9a-fA-F]+));/g, (entity, name, decimal, hex) => {
        if (name !== undefined) {
            return namedEntities[name];
        }
        const codePoint = decimal !== undefined ? Number(decimal) : Number.parseInt(hex, 16);
        // String.fromCodePoint throws a RangeError above U+10FFFF; keep such input verbatim.
        if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) {
            return entity;
        }
        return String.fromCodePoint(codePoint);
    });
}
516
/**
 * Returns the hostname of a URL string.
 *
 * @param {string} url - Absolute URL.
 * @returns {string|undefined} Hostname, or undefined when `url` cannot be parsed.
 */
function getHost(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return undefined;
    }
    return parsed.hostname;
}
524
/**
 * Derives the cache key for a search: the SHA-1 hex digest of the query plus
 * the options that influence the result (limit, country, language, source).
 *
 * Country and language fall back to "us" and "en", the same defaults
 * `buildSearchUrl` applies, so a search without them is cached instead of
 * throwing a TypeError on `undefined.toLowerCase()`. Keys for calls that
 * supply both values are unchanged.
 *
 * @param {string} query - Search terms.
 * @param {object} options - Reads `limit`, `country`, `language` and `source`.
 * @returns {string} 40-character hexadecimal digest.
 */
function buildSearchCacheKey(query, options) {
    return (0, crypto_1.createHash)("sha1")
        .update(JSON.stringify({
        query,
        limit: options.limit,
        country: (options.country || "us").toLowerCase(),
        language: (options.language || "en").toLowerCase(),
        source: options.source,
    }))
        .digest("hex");
}
535
/**
 * Resolves the cache file for a search:
 * `<config.stateDir>/search-cache/<cache key>.json`.
 *
 * @param {object} config - Reads `stateDir`.
 * @param {string} query - Search terms.
 * @param {object} options - Search options that feed the cache key.
 * @returns {string} Path of the cache file.
 */
function getSearchCachePath(config, query, options) {
    const fileName = `${buildSearchCacheKey(query, options)}.json`;
    return path_1.default.join(config.stateDir, "search-cache", fileName);
}
538
/**
 * Loads a cached search envelope if a usable one exists.
 *
 * A record is usable when its `version` equals SEARCH_CACHE_VERSION and its
 * `createdAt` timestamp parses and is no older than SEARCH_CACHE_TTL_MS.
 * Any failure to read or parse the record (missing file, invalid JSON,
 * unexpected shape) is treated as a cache miss; the cache is best-effort
 * and must never fail a search.
 *
 * @param {object} config - Used to locate the cache directory.
 * @param {string} query - Search terms.
 * @param {object} options - Search options; `limit` is also passed to the
 *   envelope normalizer.
 * @returns {Promise<object|null>} Normalized envelope flagged with
 *   `cacheHit: true` and `cachedAt`, or null on a miss.
 */
async function readSearchCache(config, query, options) {
    const cachePath = getSearchCachePath(config, query, options);
    try {
        const raw = await fs_1.promises.readFile(cachePath, "utf8");
        const parsed = JSON.parse(raw);
        if (parsed.version !== SEARCH_CACHE_VERSION) {
            return null;
        }
        const createdAtMs = Date.parse(parsed.createdAt);
        if (!Number.isFinite(createdAtMs) || Date.now() - createdAtMs > SEARCH_CACHE_TTL_MS) {
            return null;
        }
        const normalized = normalizeCachedEnvelope(parsed.envelope, options.limit);
        return {
            ...normalized,
            cacheHit: true,
            cachedAt: parsed.createdAt,
            cacheTtlMs: SEARCH_CACHE_TTL_MS,
        };
    }
    catch {
        // Missing file, unreadable file, or malformed record: all are cache misses.
        return null;
    }
}
566
/**
 * Rebuilds a search envelope from its cached form, filling in fields that
 * may be missing.
 *
 * Results without a numeric `position` get a 1-based one from their index.
 * Counts and the requested limit fall back to derived values, attempts are
 * passed through `normalizeCachedAttempt`, and warnings are regenerated when
 * the cached value is not an array.
 *
 * @param {object} envelope - Envelope object read from the cache file.
 * @param {number} requestedLimit - Limit of the current request, used when
 *   the cached envelope has no numeric `requestedLimit`.
 * @returns {object} Normalized envelope (without cache-hit flags).
 */
function normalizeCachedEnvelope(envelope, requestedLimit) {
    const isNumber = (candidate) => typeof candidate === "number";
    const results = !Array.isArray(envelope.results)
        ? []
        : envelope.results.map((entry, idx) => ({
            ...entry,
            position: isNumber(entry.position) ? entry.position : idx + 1,
        }));
    const returnedCount = isNumber(envelope.returnedCount) ? envelope.returnedCount : results.length;
    const effectiveLimit = isNumber(envelope.requestedLimit) ? envelope.requestedLimit : requestedLimit;
    const attempts = !Array.isArray(envelope.attempts)
        ? []
        : envelope.attempts.map((entry) => normalizeCachedAttempt(entry));
    let warnings;
    if (Array.isArray(envelope.warnings)) {
        warnings = envelope.warnings;
    }
    else {
        warnings = buildSearchWarnings(effectiveLimit, returnedCount);
    }
    const { engine, source, query, url } = envelope;
    return {
        engine,
        source,
        query,
        url,
        requestedLimit: effectiveLimit,
        resultCount: isNumber(envelope.resultCount) ? envelope.resultCount : returnedCount,
        returnedCount,
        results,
        attempts,
        warnings,
        cacheTtlMs: isNumber(envelope.cacheTtlMs) ? envelope.cacheTtlMs : SEARCH_CACHE_TTL_MS,
    };
}
599
/**
 * Corrects a cached attempt that is marked `ok` although it produced no
 * results and recorded no error: it is rewritten as `ok: false` with a
 * warning (the existing warning is kept when present). Every other attempt
 * is returned as the same object.
 *
 * @param {object} attempt - Cached attempt record.
 * @returns {object} The original attempt or a corrected copy.
 */
function normalizeCachedAttempt(attempt) {
    const isEmptySuccess = attempt.ok && !(attempt.resultCount > 0) && !attempt.error;
    if (isEmptySuccess) {
        const warning = attempt.warning ?? `No results parsed from ${attempt.engine} ${attempt.source} response`;
        return { ...attempt, ok: false, warning };
    }
    return attempt;
}
609
/**
 * Persists a search envelope to the cache file for this query and options.
 *
 * The record stores the cache version, a creation timestamp, the query, the
 * options, and a fixed subset of envelope fields. Failures to create the
 * directory or write the file are ignored on purpose.
 *
 * @param {object} config - Used to locate the cache directory.
 * @param {string} query - Search terms.
 * @param {object} options - Search options; stored in the record and used for the key.
 * @param {object} envelope - Result envelope to persist.
 * @returns {Promise<void>} Resolves once the write attempt has finished.
 */
async function writeSearchCache(config, query, options, envelope) {
    const cachePath = getSearchCachePath(config, query, options);
    const { engine, source, url, cacheTtlMs, resultCount, requestedLimit, returnedCount, results, attempts, warnings, } = envelope;
    const record = {
        version: SEARCH_CACHE_VERSION,
        createdAt: new Date().toISOString(),
        query,
        options,
        envelope: {
            engine,
            source,
            query: envelope.query,
            url,
            cacheTtlMs,
            resultCount,
            requestedLimit,
            returnedCount,
            results,
            attempts,
            warnings,
        },
    };
    const serialized = `${JSON.stringify(record, null, 2)}\n`;
    try {
        await fs_1.promises.mkdir(path_1.default.dirname(cachePath), { recursive: true });
        await fs_1.promises.writeFile(cachePath, serialized, "utf8");
    }
    catch {
        // Cache write failures should never block search results.
    }
}
638
/**
 * Produces the shortfall warning for a search.
 *
 * @param {number} requestedLimit - Number of results asked for.
 * @param {number} returnedCount - Number of results obtained.
 * @returns {string[]} Empty when `returnedCount >= requestedLimit`, otherwise
 *   a single message stating both numbers.
 */
function buildSearchWarnings(requestedLimit, returnedCount) {
    const limitSatisfied = returnedCount >= requestedLimit;
    return limitSatisfied
        ? []
        : [`Requested ${requestedLimit} results but received ${returnedCount}.`];
}
644
/**
 * Normalizes a result link into an absolute http(s) URL without a fragment.
 *
 * Protocol-relative links ("//host/path") are read as https. Bing `/ck/`
 * links on www.bing.com and DuckDuckGo `/l/` links are unwrapped to their
 * target when the target can be decoded. DuckDuckGo redirects are only
 * recognized on duckduckgo.com itself or a real subdomain, so a lookalike
 * host such as "notduckduckgo.com" is not unwrapped.
 *
 * @param {string|undefined|null} value - Raw href.
 * @returns {string|null} Normalized URL, or null for empty, unparseable, or
 *   non-http(s) input.
 */
function normalizeAbsoluteUrl(value) {
    if (!value) {
        return null;
    }
    try {
        const parsed = new URL(value.startsWith("//") ? `https:${value}` : value);
        const host = parsed.hostname.toLowerCase();
        if (host === "www.bing.com" && parsed.pathname.startsWith("/ck/")) {
            const decoded = decodeBingTrackingUrl(parsed);
            if (decoded) {
                return decoded;
            }
        }
        const isDuckDuckGoHost = host === "duckduckgo.com" || host.endsWith(".duckduckgo.com");
        if (isDuckDuckGoHost && parsed.pathname.startsWith("/l/")) {
            const decoded = decodeDuckDuckGoTrackingUrl(parsed);
            if (decoded) {
                return decoded;
            }
        }
        if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
            return null;
        }
        parsed.hash = "";
        return parsed.toString();
    }
    catch {
        return null;
    }
}
674
/**
 * Recovers the target of a Bing click-tracking link from its `u` parameter,
 * which holds the base64-encoded target, optionally prefixed with "a1".
 *
 * @param {URL} parsed - Parsed tracking URL.
 * @returns {string|null} Target http(s) URL without its fragment, or null
 *   when the parameter is missing, does not decode to a valid URL, or uses
 *   another scheme.
 */
function decodeBingTrackingUrl(parsed) {
    const token = parsed.searchParams.get("u");
    if (!token) {
        return null;
    }
    const base64Body = token.replace(/^a1/, "");
    let target;
    try {
        target = new URL(Buffer.from(base64Body, "base64").toString("utf8"));
    }
    catch {
        return null;
    }
    if (!["http:", "https:"].includes(target.protocol)) {
        return null;
    }
    target.hash = "";
    return target.toString();
}
693
/**
 * Builds the Bing locale tag ("ll-CC") from a country and a language.
 *
 * The language may already carry a region ("en-gb", "en_GB"); in that case
 * its own region wins. Missing or blank parts fall back safely: language to
 * "en", region to the country, and country to "US". A dangling separator
 * ("en-", "-gb") therefore never produces a tag with an empty half.
 *
 * @param {string|undefined} country - Country code; defaults to "us".
 * @param {string|undefined} language - Language, optionally with a region; defaults to "en".
 * @returns {string} Locale with lower-case language and upper-case region.
 */
function normalizeBingLocale(country, language) {
    const fallbackRegion = (country || "us").trim().toUpperCase() || "US";
    const normalizedLanguage = (language || "en").trim().toLowerCase();
    // Accept both "-" and "_" separators; only language and region are used.
    const [languagePart = "", regionPart = ""] = normalizedLanguage.split(/[-_]/, 2);
    const region = regionPart ? regionPart.toUpperCase() : fallbackRegion;
    return `${languagePart || "en"}-${region}`;
}
705
/**
 * Builds the DuckDuckGo `kl` region value as "<country>-<language>", both
 * lower-cased and trimmed, defaulting to "us" and "en" when a part is
 * missing or blank.
 *
 * @param {string|undefined} country - Country code.
 * @param {string|undefined} language - Language code.
 * @returns {string} Region value such as "us-en".
 */
function normalizeDuckDuckGoLocale(country, language) {
    const tidy = (raw, fallback) => (raw || fallback).trim().toLowerCase() || fallback;
    const languagePart = tidy(language, "en");
    const countryPart = tidy(country, "us");
    return `${countryPart}-${languagePart}`;
}
710
/**
 * Recovers the target of a DuckDuckGo redirect link from its `uddg` parameter.
 *
 * `URLSearchParams.get` already returns the percent-decoded value, so the
 * value is used as-is. Decoding it a second time would corrupt targets that
 * legitimately contain escapes (for example `%26` or `%25` in a query
 * string) and would reject targets containing a literal "%".
 *
 * @param {URL} parsed - Parsed redirect URL.
 * @returns {string|null} Target http(s) URL without its fragment, or null
 *   when the parameter is missing, is not a valid URL, or uses another scheme.
 */
function decodeDuckDuckGoTrackingUrl(parsed) {
    const target = parsed.searchParams.get("uddg");
    if (!target) {
        return null;
    }
    try {
        const url = new URL(target);
        if (url.protocol !== "http:" && url.protocol !== "https:") {
            return null;
        }
        url.hash = "";
        return url.toString();
    }
    catch {
        return null;
    }
}