recker 1.0.72 → 1.0.75-next.2e5a94f

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/README.md +5 -18
  2. package/dist/browser/core/client.d.ts +14 -8
  3. package/dist/browser/core/client.js +199 -17
  4. package/dist/browser/core/errors.d.ts +15 -1
  5. package/dist/browser/core/errors.js +140 -9
  6. package/dist/browser/core/request.d.ts +5 -0
  7. package/dist/browser/core/request.js +33 -2
  8. package/dist/browser/core-runtime/plugin-manifest.d.ts +24 -0
  9. package/dist/browser/core-runtime/plugin-manifest.js +159 -0
  10. package/dist/browser/core-runtime/request-context.d.ts +13 -0
  11. package/dist/browser/core-runtime/request-context.js +24 -0
  12. package/dist/browser/core-runtime/typed-events.d.ts +89 -0
  13. package/dist/browser/core-runtime/typed-events.js +34 -0
  14. package/dist/browser/index.iife.min.js +79 -79
  15. package/dist/browser/index.min.js +79 -79
  16. package/dist/browser/index.mini.iife.js +913 -97
  17. package/dist/browser/index.mini.iife.min.js +46 -46
  18. package/dist/browser/index.mini.min.js +46 -46
  19. package/dist/browser/index.mini.umd.js +913 -97
  20. package/dist/browser/index.mini.umd.min.js +46 -46
  21. package/dist/browser/index.umd.min.js +79 -79
  22. package/dist/browser/plugins/auth/aws-sigv4.d.ts +1 -0
  23. package/dist/browser/plugins/auth/aws-sigv4.js +19 -2
  24. package/dist/browser/plugins/retry.js +29 -1
  25. package/dist/browser/presets/aws.d.ts +1 -0
  26. package/dist/browser/presets/aws.js +62 -1
  27. package/dist/browser/runner/request-runner.d.ts +15 -5
  28. package/dist/browser/runner/request-runner.js +164 -30
  29. package/dist/browser/scrape/parser/nodes/html.d.ts +6 -0
  30. package/dist/browser/scrape/parser/nodes/html.js +70 -18
  31. package/dist/browser/scrape/parser/nodes/node.d.ts +1 -0
  32. package/dist/browser/scrape/parser/nodes/node.js +5 -0
  33. package/dist/browser/scrape/spider.d.ts +1 -0
  34. package/dist/browser/scrape/spider.js +39 -26
  35. package/dist/browser/seo/analyzer.d.ts +1 -1
  36. package/dist/browser/seo/analyzer.js +73 -42
  37. package/dist/browser/seo/index.d.ts +1 -1
  38. package/dist/browser/seo/rules/types.d.ts +2 -0
  39. package/dist/browser/seo/seo-spider.d.ts +2 -3
  40. package/dist/browser/seo/seo-spider.js +26 -202
  41. package/dist/browser/seo/types.d.ts +4 -0
  42. package/dist/browser/seo/validators/sitemap.js +9 -2
  43. package/dist/browser/transport/fetch.js +38 -5
  44. package/dist/browser/transport/undici.js +73 -11
  45. package/dist/browser/transport/worker.d.ts +0 -1
  46. package/dist/browser/transport/worker.js +1 -3
  47. package/dist/browser/types/index.d.ts +24 -0
  48. package/dist/cli/commands/mcp.js +5 -3
  49. package/dist/core/client.d.ts +14 -8
  50. package/dist/core/client.js +199 -17
  51. package/dist/core/errors.d.ts +15 -1
  52. package/dist/core/errors.js +140 -9
  53. package/dist/core/request.d.ts +5 -0
  54. package/dist/core/request.js +33 -2
  55. package/dist/core-runtime/plugin-manifest.d.ts +24 -0
  56. package/dist/core-runtime/plugin-manifest.js +159 -0
  57. package/dist/core-runtime/request-context.d.ts +13 -0
  58. package/dist/core-runtime/request-context.js +24 -0
  59. package/dist/core-runtime/typed-events.d.ts +89 -0
  60. package/dist/core-runtime/typed-events.js +34 -0
  61. package/dist/index.d.ts +2 -1
  62. package/dist/index.js +2 -1
  63. package/dist/mcp/cli.js +10 -8
  64. package/dist/mcp/profiles.d.ts +1 -1
  65. package/dist/mcp/profiles.js +31 -6
  66. package/dist/mcp/tools/categories.js +0 -1
  67. package/dist/mcp/tools/seo.js +320 -4
  68. package/dist/plugins/auth/aws-sigv4.d.ts +1 -0
  69. package/dist/plugins/auth/aws-sigv4.js +19 -2
  70. package/dist/plugins/retry.js +29 -1
  71. package/dist/presets/aws.d.ts +1 -0
  72. package/dist/presets/aws.js +62 -1
  73. package/dist/recker.d.ts +3 -0
  74. package/dist/recker.js +5 -0
  75. package/dist/runner/request-runner.d.ts +15 -5
  76. package/dist/runner/request-runner.js +164 -30
  77. package/dist/scrape/parser/nodes/html.d.ts +6 -0
  78. package/dist/scrape/parser/nodes/html.js +70 -18
  79. package/dist/scrape/parser/nodes/node.d.ts +1 -0
  80. package/dist/scrape/parser/nodes/node.js +5 -0
  81. package/dist/scrape/spider.d.ts +1 -0
  82. package/dist/scrape/spider.js +39 -26
  83. package/dist/search/google.d.ts +67 -0
  84. package/dist/search/google.js +480 -0
  85. package/dist/search/index.d.ts +3 -0
  86. package/dist/search/index.js +1 -0
  87. package/dist/seo/analyzer.d.ts +1 -1
  88. package/dist/seo/analyzer.js +73 -42
  89. package/dist/seo/index.d.ts +1 -1
  90. package/dist/seo/rules/types.d.ts +2 -0
  91. package/dist/seo/seo-spider.d.ts +2 -3
  92. package/dist/seo/seo-spider.js +26 -202
  93. package/dist/seo/types.d.ts +4 -0
  94. package/dist/seo/validators/sitemap.js +9 -2
  95. package/dist/transport/fetch.js +38 -5
  96. package/dist/transport/undici.js +73 -11
  97. package/dist/transport/worker.d.ts +0 -1
  98. package/dist/transport/worker.js +1 -3
  99. package/dist/types/index.d.ts +24 -0
  100. package/dist/version.js +1 -1
  101. package/package.json +9 -1
@@ -123,6 +123,7 @@ export declare class Spider {
123
123
  private robotsData;
124
124
  private sitemapValidation;
125
125
  private robotsValidation;
126
+ private toHeaderRecord;
126
127
  constructor(options?: SpiderOptions);
127
128
  crawl(startUrl: string): Promise<SpiderResult>;
128
129
  private fetchRobotsTxt;
@@ -76,9 +76,6 @@ function shouldCrawl(url, baseHost, options) {
76
76
  return false;
77
77
  }
78
78
  }
79
- function sleep(ms) {
80
- return new Promise(resolve => setTimeout(resolve, ms));
81
- }
82
79
  function parseExtractSelectors(selectors) {
83
80
  const schema = {};
84
81
  for (const sel of selectors) {
@@ -115,6 +112,13 @@ export class Spider {
115
112
  robotsData = null;
116
113
  sitemapValidation = null;
117
114
  robotsValidation = null;
115
+ toHeaderRecord(headers) {
116
+ const headerRecord = {};
117
+ headers.forEach((value, key) => {
118
+ headerRecord[key] = value;
119
+ });
120
+ return headerRecord;
121
+ }
118
122
  constructor(options = {}) {
119
123
  let extractSchema;
120
124
  if (options.extract) {
@@ -194,7 +198,7 @@ export class Spider {
194
198
  await this.fetchSitemaps(baseUrl);
195
199
  }
196
200
  const pending = new Map();
197
- const scheduleUrl = (item, fromSitemap = false) => {
201
+ const scheduleUrl = (item) => {
198
202
  const normalized = normalizeUrl(item.url);
199
203
  if (this.visited.has(normalized))
200
204
  return;
@@ -230,7 +234,7 @@ export class Spider {
230
234
  try {
231
235
  const urlHost = new URL(sitemapUrl.loc).hostname;
232
236
  if (urlHost === this.baseHost) {
233
- scheduleUrl({ url: sitemapUrl.loc, depth: 1 }, true);
237
+ scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
234
238
  }
235
239
  }
236
240
  catch {
@@ -303,7 +307,7 @@ export class Spider {
303
307
  return {
304
308
  status: response.status,
305
309
  text: await response.text(),
306
- headers: Object.fromEntries([...response.headers.entries()]),
310
+ headers: this.toHeaderRecord(response.headers),
307
311
  };
308
312
  };
309
313
  try {
@@ -351,40 +355,49 @@ export class Spider {
351
355
  }
352
356
  buildSitemapAnalysis() {
353
357
  const crawledUrls = new Set(this.results.map(r => normalizeUrl(r.url)));
354
- const crawledFromSitemap = this.sitemapUrls.filter(u => crawledUrls.has(normalizeUrl(u.loc))).length;
358
+ const sitemapUrlSet = this.sitemapUrlSet.size > 0
359
+ ? this.sitemapUrlSet
360
+ : new Set(this.sitemapUrls.map((u) => normalizeUrl(u.loc)));
361
+ const crawledFromSitemap = Array.from(sitemapUrlSet)
362
+ .filter(url => crawledUrls.has(url))
363
+ .length;
355
364
  const linkedUrls = new Set();
356
- for (const page of this.results) {
357
- for (const link of page.links) {
358
- if (link.href) {
359
- linkedUrls.add(normalizeUrl(link.href));
360
- }
361
- }
362
- }
363
- const orphanUrls = this.sitemapUrls
364
- .filter(u => {
365
- const normalized = normalizeUrl(u.loc);
366
- return !linkedUrls.has(normalized) && crawledUrls.has(normalized);
367
- })
368
- .map(u => u.loc);
369
- const missingFromSitemap = Array.from(crawledUrls)
370
- .filter(url => !this.sitemapUrlSet.has(url));
371
- const blockedBySitemapRobots = [];
365
+ const blockedBySitemapRobotsSet = new Set();
372
366
  if (this.robotsData) {
373
367
  for (const sitemapUrl of this.sitemapUrls) {
374
368
  try {
369
+ const normalized = normalizeUrl(sitemapUrl.loc);
375
370
  const urlPath = new URL(sitemapUrl.loc).pathname;
376
371
  if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
377
- blockedBySitemapRobots.push(sitemapUrl.loc);
372
+ blockedBySitemapRobotsSet.add(normalized);
378
373
  }
379
374
  }
380
375
  catch {
381
376
  }
382
377
  }
383
378
  }
379
+ for (const page of this.results) {
380
+ for (const link of page.links) {
381
+ if (link.href) {
382
+ linkedUrls.add(normalizeUrl(link.href));
383
+ }
384
+ }
385
+ }
386
+ const orphanUrlSet = new Set();
387
+ for (const u of this.sitemapUrls) {
388
+ const normalized = normalizeUrl(u.loc);
389
+ if (!linkedUrls.has(normalized) && !blockedBySitemapRobotsSet.has(normalized)) {
390
+ orphanUrlSet.add(normalized);
391
+ }
392
+ }
393
+ const orphanUrls = Array.from(orphanUrlSet);
394
+ const missingFromSitemap = Array.from(crawledUrls)
395
+ .filter(url => !sitemapUrlSet.has(url));
396
+ const blockedBySitemapRobots = Array.from(blockedBySitemapRobotsSet);
384
397
  return {
385
398
  found: this.sitemapUrls.length > 0,
386
- url: this.sitemapValidation?.parseResult ? undefined : undefined,
387
- totalUrls: this.sitemapUrls.length,
399
+ url: this.sitemapUrls[0]?.loc,
400
+ totalUrls: sitemapUrlSet.size,
388
401
  crawledFromSitemap,
389
402
  orphanUrls,
390
403
  missingFromSitemap,
@@ -0,0 +1,67 @@
1
+ import { type BlockDetectionResult } from '../utils/block-detector.js';
2
+ type SearchTransport = 'auto' | 'undici' | 'curl';
3
+ export type { SearchTransport };
4
+ export interface GoogleSearchAdvancedOptions {
5
+ asQ?: string;
6
+ as_q?: string;
7
+ asEpq?: string;
8
+ as_epq?: string;
9
+ asOq?: string;
10
+ as_oq?: string;
11
+ asEq?: string;
12
+ as_eq?: string;
13
+ asSitesearch?: string;
14
+ as_sitesearch?: string;
15
+ asFiletype?: string;
16
+ as_filetype?: string;
17
+ asRights?: string;
18
+ as_rights?: string;
19
+ asNlo?: number | string;
20
+ as_nlo?: number | string;
21
+ asNhi?: number | string;
22
+ as_nhi?: number | string;
23
+ safe?: string;
24
+ tbm?: string;
25
+ num?: number;
26
+ start?: number;
27
+ tbs?: string;
28
+ lr?: string;
29
+ cr?: string;
30
+ country?: string;
31
+ gl?: string;
32
+ hl?: string;
33
+ transport?: SearchTransport;
34
+ timeout?: number;
35
+ maxResults?: number;
36
+ extraParams?: Record<string, string | number | boolean>;
37
+ userAgent?: string;
38
+ headers?: HeadersInit;
39
+ includeRawHtml?: boolean;
40
+ }
41
+ export interface GoogleSearchResult {
42
+ rank: number;
43
+ title: string;
44
+ url: string;
45
+ snippet?: string;
46
+ displayedUrl?: string;
47
+ source?: string;
48
+ }
49
+ export interface SearchTransportDetails {
50
+ requested: SearchTransport;
51
+ used: SearchTransport;
52
+ fallbackUsed: boolean;
53
+ impersonateAvailable: boolean;
54
+ }
55
+ export interface GoogleSearchResponse {
56
+ query: string;
57
+ searchUrl: string;
58
+ results: GoogleSearchResult[];
59
+ nextPageUrl?: string;
60
+ nextPageStart?: number;
61
+ resultStats?: number;
62
+ block?: BlockDetectionResult;
63
+ transport: SearchTransportDetails;
64
+ status?: number;
65
+ rawHtml?: string;
66
+ }
67
+ export declare function searchGoogleAdvanced(query: string, options?: GoogleSearchAdvancedOptions): Promise<GoogleSearchResponse>;
@@ -0,0 +1,480 @@
1
+ import { createClient } from '../core/client.js';
2
+ import { ValidationError } from '../core/errors.js';
3
+ import { HttpRequest } from '../core/request.js';
4
+ import { ScrapeDocument } from '../scrape/document.js';
5
+ import { detectBlock } from '../utils/block-detector.js';
6
+ import { getRandomUserAgent } from '../utils/user-agent.js';
/** Endpoint every search request is sent to. */
const GOOGLE_SEARCH_BASE_URL = 'https://www.google.com/search';
/** Google origin: default referer and base for resolving relative hrefs. */
const GOOGLE_SEARCH_ORIGIN = 'https://www.google.com';
/** Candidate snippet selectors, tried in this order inside a result container. */
const GOOGLE_RESULT_SNIPPET_SELECTOR_ORDER = [
    '[data-sncf="1"]',
    'div[data-sncf="1"]',
    'span.aCOpRe',
    'div.aCOpRe',
    'div.VwiC3b',
    'span.VwiC3b',
    'div.BNeawe',
    'div.yXK7lf',
    'div[data-attrid="wa:/description"]',
];
/** Anchors whose href goes through Google's "/url" redirect. */
const GOOGLE_RESULT_LINK_SELECTORS = [
    'a[href^="/url?q="]',
    'a[href^="https://www.google.com/url?"]',
    'a[href^="http://www.google.com/url?"]',
];
/** Ancestor selector used to locate the container element of a result anchor. */
const GOOGLE_RESULT_CONTAINER_SELECTORS = '[data-hveid], [data-ved], div[class*="g"], div[class*="MjjY"], div[class*="tF2Cxc"], [class*="xpd"]';
/** Exactly two lowercase ASCII letters. */
const COUNTRY_CODE_PATTERN = /^[a-z]{2}$/;
/** Country names and aliases mapped to two-letter country codes. */
const COUNTRY_ALIASES = {
    us: 'us',
    usa: 'us',
    united_states: 'us',
    'united states': 'us',
    br: 'br',
    brasil: 'br',
    brazil: 'br',
    pt_br: 'br',
    pt: 'br',
    de: 'de',
    germany: 'de',
    deutschland: 'de',
    gb: 'gb',
    uk: 'gb',
    england: 'gb',
    britain: 'gb',
    'united kingdom': 'gb',
    fr: 'fr',
    france: 'fr',
    spain: 'es',
    españa: 'es',
    es: 'es',
    ca: 'ca',
    mexico: 'mx',
    mx: 'mx',
    it: 'it',
    italy: 'it',
    au: 'au',
    india: 'in',
    in: 'in',
    argentina: 'ar',
    ar: 'ar',
};
/**
 * Lookup table derived from COUNTRY_ALIASES.
 *
 * Keys are normalised exactly the way normalizeCountryCode derives its lookup
 * key: lowercase, whitespace runs become "_", and every remaining character
 * outside [a-z_] is dropped. Using the same rule on both sides keeps aliases
 * with non-ASCII letters (e.g. "españa" -> "espaa") reachable.
 *
 * The table has a null prototype so that user input such as "constructor"
 * cannot resolve to an inherited Object.prototype member.
 */
const COUNTRY_ALIASES_NORMALIZED = Object.entries(COUNTRY_ALIASES).reduce((acc, [key, value]) => {
    const lookupKey = key.toLowerCase().replace(/\s+/g, '_').replace(/[^a-z_]/g, '');
    acc[lookupKey] = value;
    return acc;
}, Object.create(null));
/**
 * Collapses every run of whitespace into a single space and strips
 * leading/trailing whitespace.
 *
 * @param value Text to tidy up.
 * @returns The single-spaced, trimmed text.
 */
function cleanText(value) {
    const singleSpaced = value.replace(/\s+/g, ' ');
    return singleSpaced.trim();
}
/**
 * Tells whether a value is something other than `undefined` or `null`.
 *
 * @param value Any value.
 * @returns `false` for `undefined` and `null`, `true` for everything else.
 */
function isDefined(value) {
    return !(value === undefined || value === null);
}
/**
 * Converts an option value into the string placed in the query string.
 *
 * @param value String, number, boolean or `undefined`.
 * @returns `undefined` for `undefined` and for values whose string form is
 *   blank; `'1'`/`'0'` for booleans; otherwise the trimmed string form.
 */
function toParamValue(value) {
    switch (typeof value) {
        case 'undefined':
            return undefined;
        case 'boolean':
            return value ? '1' : '0';
        default: {
            const text = String(value).trim();
            return text.length === 0 ? undefined : text;
        }
    }
}
/**
 * Returns the first argument that is not `undefined`.
 *
 * @param values Candidates, in priority order.
 * @returns The first candidate that is not `undefined` (`null` counts as a
 *   value), or `undefined` when there is none.
 */
function pick(...values) {
    return values.find((candidate) => candidate !== undefined);
}
/**
 * Resolves free-form country input to a two-letter lowercase country code.
 *
 * Resolution order:
 *  1. input that already is two ASCII letters is returned as-is;
 *  2. a known alias from COUNTRY_ALIASES_NORMALIZED;
 *  3. the last segment of a locale-like tag ("en_US", "pt-BR") when that
 *     segment is two letters.
 *
 * @param input Country code, country name or locale tag.
 * @returns The two-letter code, or `''` when nothing matches.
 */
function normalizeCountryCode(input) {
    const normalized = input.trim().toLowerCase().replace(/\s+/g, '_');
    if (COUNTRY_CODE_PATTERN.test(normalized)) {
        return normalized;
    }
    // Own-property check: a plain-object table would otherwise resolve input
    // such as "constructor" to an inherited Object.prototype member.
    const hasAlias = (key) => Object.prototype.hasOwnProperty.call(COUNTRY_ALIASES_NORMALIZED, key);
    // Try both key spellings: unsupported characters dropped ("u.s.a" -> "usa")
    // and unsupported characters replaced by "_" ("españa" -> "espa_a"), so an
    // alias is found regardless of which rule produced the table keys.
    const aliasKeys = [
        normalized.replace(/[^a-z_]/g, ''),
        normalized.replace(/[^a-z]/g, '_'),
    ];
    for (const key of aliasKeys) {
        if (hasAlias(key)) {
            return COUNTRY_ALIASES_NORMALIZED[key];
        }
    }
    // Locale-like tags: accept "_" and "-" separators and use the last segment.
    // Without a separator the tail equals `normalized`, which already failed
    // the two-letter test above.
    const tail = normalized.split(/[_-]/).pop() ?? '';
    if (COUNTRY_CODE_PATTERN.test(tail)) {
        return tail;
    }
    return '';
}
/**
 * Chooses the value for the `gl` parameter.
 *
 * @param country Preferred option; validated through normalizeCountryCode.
 * @param legacyGl Raw `gl` option, used only when `country` is `undefined`.
 * @returns The resolved code, or `''` when neither option yields one.
 * @throws {ValidationError} When `country` is given but cannot be resolved.
 */
function resolveCountryCode(country, legacyGl) {
    if (country === undefined) {
        // The legacy value is only trimmed and lower-cased, not validated.
        return legacyGl ? legacyGl.trim().toLowerCase() : '';
    }
    const code = normalizeCountryCode(country);
    if (code) {
        return code;
    }
    throw new ValidationError('Invalid country for Google search. Use ISO 3166-1 alpha-2 code or a known country name.', {
        field: 'country',
        value: country,
    });
}
/**
 * Validates the query and folds the snake_case / camelCase option spellings
 * into the single set of keys read by buildSearchUrl.
 *
 * For every aliased option the snake_case spelling wins when both are set.
 *
 * @param query Raw search query.
 * @param options Caller-supplied options.
 * @returns A copy of `options` extended with the cleaned `query`, the merged
 *   advanced-search fields (empty string when unset), the resolved `gl`,
 *   `transport` (default `'auto'`) and `includeRawHtml` (default `false`).
 * @throws {ValidationError} When the query is blank, or when `country`
 *   cannot be resolved.
 */
function normalizeOptions(query, options = {}) {
    const normalizedQuery = cleanText(query);
    if (normalizedQuery === '') {
        throw new ValidationError('Google query is required', {
            field: 'query',
            value: query,
        });
    }
    // Snake_case first, camelCase second, empty string when neither is set.
    const eitherSpelling = (snake, camel) => pick(snake, camel) ?? '';
    return {
        ...options,
        as_q: eitherSpelling(options.as_q, options.asQ),
        query: normalizedQuery,
        asEpq: eitherSpelling(options.as_epq, options.asEpq),
        asOq: eitherSpelling(options.as_oq, options.asOq),
        asEq: eitherSpelling(options.as_eq, options.asEq),
        as_sitesearch: eitherSpelling(options.as_sitesearch, options.asSitesearch),
        as_filetype: eitherSpelling(options.as_filetype, options.asFiletype),
        as_rights: eitherSpelling(options.as_rights, options.asRights),
        as_nlo: toParamValue(pick(options.as_nlo, options.asNlo)) ?? '',
        as_nhi: toParamValue(pick(options.as_nhi, options.asNhi)) ?? '',
        gl: resolveCountryCode(options.country, options.gl),
        transport: options.transport ?? 'auto',
        includeRawHtml: options.includeRawHtml ?? false,
    };
}
/**
 * Builds the Google search URL for a query and a normalised option set.
 *
 * Parameter order in the result: `q`, `ie`, `oe`, the advanced text
 * parameters, `num`, `start`, `tbs`, then `extraParams`. Entries of
 * `extraParams` are applied last and replace earlier parameters of the same
 * name.
 *
 * @param query Value of the `q` parameter.
 * @param options Normalised options (see normalizeOptions).
 * @returns The absolute search URL.
 */
function buildSearchUrl(query, options) {
    const params = new URLSearchParams();
    params.set('q', query);
    params.set('ie', 'UTF-8');
    params.set('oe', 'UTF-8');
    // [query-string name, option key]; emitted only when the option is truthy.
    const textParams = [
        ['as_q', 'as_q'],
        ['as_epq', 'asEpq'],
        ['as_oq', 'asOq'],
        ['as_eq', 'asEq'],
        ['as_sitesearch', 'as_sitesearch'],
        ['as_filetype', 'as_filetype'],
        ['as_rights', 'as_rights'],
        ['as_nlo', 'as_nlo'],
        ['as_nhi', 'as_nhi'],
        ['safe', 'safe'],
        ['tbm', 'tbm'],
        ['lr', 'lr'],
        ['cr', 'cr'],
        ['gl', 'gl'],
        ['hl', 'hl'],
    ];
    for (const [paramName, optionKey] of textParams) {
        const optionValue = options[optionKey];
        if (optionValue) {
            params.set(paramName, optionValue);
        }
    }
    if (isDefined(options.num)) {
        const pageSize = Number(options.num);
        // Positive finite values only; floored and capped at 100.
        if (Number.isFinite(pageSize) && pageSize > 0) {
            params.set('num', String(Math.min(100, Math.floor(pageSize))));
        }
    }
    if (isDefined(options.start)) {
        const offset = Number(options.start);
        // Non-negative finite values only; floored.
        if (Number.isFinite(offset) && offset >= 0) {
            params.set('start', String(Math.floor(offset)));
        }
    }
    if (options.tbs) {
        params.set('tbs', options.tbs);
    }
    if (options.extraParams) {
        Object.entries(options.extraParams).forEach(([key, rawValue]) => {
            const paramValue = toParamValue(rawValue);
            if (paramValue !== undefined) {
                params.set(key, paramValue);
            }
        });
    }
    return `${GOOGLE_SEARCH_BASE_URL}?${params.toString()}`;
}
/**
 * Produces the plain header record sent with a search request.
 *
 * Starts from a fixed set of default headers (including the given user agent
 * and the Google origin as referer) and lets caller-supplied headers replace
 * defaults of the same name.
 *
 * @param inputHeaders Optional caller headers in any `HeadersInit` form.
 * @param userAgent Value for the `user-agent` default.
 * @returns A plain object mapping header names to values.
 */
function normalizeRequestHeaders(inputHeaders, userAgent) {
    const combined = new Headers({
        accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'user-agent': userAgent,
        referer: GOOGLE_SEARCH_ORIGIN,
    });
    if (inputHeaders) {
        // Caller-supplied headers win over the defaults.
        new Headers(inputHeaders).forEach((headerValue, headerName) => {
            combined.set(headerName, headerValue);
        });
    }
    const record = {};
    combined.forEach((headerValue, headerName) => {
        record[headerName] = headerValue;
    });
    return record;
}
/**
 * Asks the binary manager whether the impersonate binary is available.
 *
 * The binary manager module is loaded lazily. If loading it, or calling
 * `hasImpersonate`, throws synchronously, the answer is `false`.
 *
 * @returns Whatever `hasImpersonate()` returns, or `false` on failure.
 */
async function hasImpersonateBinary() {
    try {
        const { hasImpersonate: probe } = await import('../utils/binary-manager.js');
        return probe();
    }
    catch {
        return false;
    }
}
/**
 * Performs a GET request through the lazily imported curl transport.
 *
 * @param url Absolute URL to fetch.
 * @param headers Plain header record to send.
 * @param timeout Timeout forwarded unchanged to the request.
 * @returns The response body text and the HTTP status.
 */
async function fetchWithCurl(url, headers, timeout) {
    const curlModule = await import('../transport/curl.js');
    const curl = new curlModule.CurlTransport();
    const response = await curl.dispatch(new HttpRequest(url, { method: 'GET', headers, timeout }));
    const body = await response.text();
    return { html: body, status: response.status };
}
/**
 * Downloads the search page with the requested transport strategy.
 *
 * - `'curl'`: curl only; fails when the impersonate binary is not available.
 * - `'undici'`: regular client only.
 * - anything else (`'auto'`): curl first when the impersonate binary is
 *   available, falling back to the regular client when curl throws or its
 *   response is classified as blocked with confidence above 0.7.
 *
 * @param url Search URL.
 * @param options Normalised options (`headers`, `userAgent`, `timeout`, `transport`).
 * @returns `{ html, status, transport, fallbackUsed, impersonateAvailable, block }`.
 * @throws {ValidationError} When `'curl'` is requested but unavailable.
 */
async function fetchSearchResults(url, options) {
    const userAgent = options.userAgent ?? getRandomUserAgent('desktop.chrome');
    const headers = normalizeRequestHeaders(options.headers, userAgent);
    const timeout = options.timeout;
    const requested = options.transport;
    // The binary is only probed when curl could actually be used.
    const impersonateAvailable = requested !== 'undici' && (await hasImpersonateBinary());
    const curlOnly = requested === 'curl';
    if (curlOnly && !impersonateAvailable) {
        throw new ValidationError('Transport "curl" requires curl-impersonate; install it with `rek setup`', {
            field: 'transport',
            value: options.transport,
        });
    }
    const requestViaCurl = async () => {
        const { html, status } = await fetchWithCurl(url, headers, timeout);
        return {
            html,
            status,
            transport: 'curl',
            fallbackUsed: false,
            impersonateAvailable,
            // fetchWithCurl exposes no response headers, so detection gets an empty set.
            block: detectBlock({ status, headers: new Headers() }, html),
        };
    };
    if (curlOnly) {
        return requestViaCurl();
    }
    const client = createClient({ timeout });
    const requestViaUndici = async () => {
        const response = await client.get(url, { headers });
        const html = await response.text();
        return {
            html,
            status: response.status,
            transport: 'undici',
            fallbackUsed: false,
            impersonateAvailable,
            block: detectBlock({ status: response.status, headers: response.headers }, html),
        };
    };
    if (requested === 'undici' || !impersonateAvailable) {
        return requestViaUndici();
    }
    try {
        const viaCurl = await requestViaCurl();
        const verdict = viaCurl.block;
        if (!verdict.blocked || verdict.confidence <= 0.7) {
            return viaCurl;
        }
    }
    catch {
        // Best effort: any curl failure falls through to the regular client.
    }
    const fallback = await requestViaUndici();
    return { ...fallback, fallbackUsed: true };
}
/**
 * Extracts the real destination from a Google redirect link.
 *
 * @param rawHref `href` of a result anchor (relative, protocol-relative or absolute).
 * @returns The absolute destination URL, or `null` when the link has no
 *   `q`/`url` parameter, cannot be parsed, does not use an http(s)-style
 *   protocol, or points back to Google's own `/search` page.
 */
function resolveSearchResultUrl(rawHref) {
    try {
        const absoluteHref = rawHref.startsWith('//') ? `https:${rawHref}` : rawHref;
        const redirect = new URL(absoluteHref, GOOGLE_SEARCH_ORIGIN);
        // searchParams.get() already returns the percent-decoded value.
        // Decoding a second time would corrupt destinations that contain
        // escapes of their own ("a%2Fb" would become "a/b") and would throw,
        // dropping the result, on a literal "%".
        const target = redirect.searchParams.get('q') ?? redirect.searchParams.get('url');
        if (!target)
            return null;
        const candidate = new URL(target, GOOGLE_SEARCH_ORIGIN);
        if (!candidate.protocol.startsWith('http'))
            return null;
        // Links that lead back to a results page are not results themselves.
        if (candidate.hostname === 'www.google.com' && candidate.pathname === '/search')
            return null;
        return candidate.toString();
    }
    catch {
        return null;
    }
}
/**
 * Derives the short "displayed URL" of a result.
 *
 * @param linkUrl Destination URL of the result.
 * @param containerText Text used when `linkUrl` has no usable hostname.
 * @returns The hostname of `linkUrl`; otherwise the first 120 characters of
 *   the cleaned `containerText`; otherwise `''`.
 */
function extractDisplayedUrl(linkUrl, containerText) {
    let hostname = '';
    try {
        hostname = new URL(linkUrl).hostname;
    }
    catch {
        hostname = '';
    }
    if (hostname) {
        return cleanText(hostname);
    }
    if (!containerText) {
        return '';
    }
    return cleanText(containerText).slice(0, 120);
}
/**
 * Heuristic check that a piece of text can serve as a result snippet.
 *
 * @param text Candidate text.
 * @param title Title of the result; a snippet must differ from it.
 * @returns `true` when the cleaned text is 25 to 600 characters long, is not
 *   equal to `title` and does not start with `http://` or `https://`.
 */
function looksLikeSnippet(text, title) {
    const candidate = cleanText(text);
    const lengthOk = candidate.length >= 25 && candidate.length <= 600;
    return lengthOk && candidate !== title && !/^https?:\/\//i.test(candidate);
}
/**
 * Extracts the result count from the result-stats text.
 *
 * Handles comma-grouped counts ("About 1,230,000 results") as well as counts
 * grouped with dots or non-breaking spaces in blocks of three digits
 * ("Aproximadamente 1.230.000 resultados"). Previously only commas were
 * removed, so a dot-grouped count was read as its last group ("000" -> 0).
 *
 * @param text Text of the stats element.
 * @returns The count, or `undefined` when the text contains no
 *   "<number> result…" / "<number> resultado…" phrase.
 */
function parseResultStats(text) {
    // Commas are dropped wherever they occur (covers 1,234,567 and 12,34,567).
    const withoutCommas = text.replace(/,/g, '');
    // Dot / no-break-space separators only count when followed by exactly
    // three digits, so a decimal such as "0.45" is never merged into a count.
    const match = withoutCommas.match(/([0-9]+(?:[.\u00a0\u202f][0-9]{3})*)\s*(?:result|resultado)/i);
    if (!match)
        return undefined;
    const parsed = Number.parseInt(match[1].replace(/[^0-9]/g, ''), 10);
    return Number.isFinite(parsed) ? parsed : undefined;
}
/**
 * Parses a Google results page into structured results and paging data.
 *
 * Every anchor matching GOOGLE_RESULT_LINK_SELECTORS is resolved to its
 * destination; duplicates, unresolvable links and anchors without a title
 * are skipped. Ranks are assigned in document order starting at 1.
 *
 * @param html Raw HTML of the results page.
 * @param options Normalised options; only `maxResults` is read.
 * @returns `{ results, nextPageUrl, nextPageStart, resultStats }`.
 */
function parseSearchPage(html, options) {
    const doc = ScrapeDocument.createSync(html, { baseUrl: GOOGLE_SEARCH_ORIGIN });
    const requestedMax = options.maxResults ? Number(options.maxResults) : undefined;
    // No usable limit means "collect everything".
    const limit = typeof requestedMax === 'number' && Number.isFinite(requestedMax)
        ? requestedMax
        : Infinity;
    // Title: the anchor's <h3> text when present, otherwise the anchor's own text.
    const titleOf = (anchor) => {
        const headingText = anchor.find('h3').text();
        return cleanText(headingText ? headingText : anchor.text());
    };
    // Snippet: known selectors first, then any span/div, then the container text.
    const snippetOf = (container, title) => {
        for (const selector of GOOGLE_RESULT_SNIPPET_SELECTOR_ORDER) {
            const selectedText = cleanText(container.find(selector).first().text());
            if (looksLikeSnippet(selectedText, title)) {
                return selectedText;
            }
        }
        for (const element of container.find('span,div').toArray()) {
            const elementText = cleanText(element.text());
            if (looksLikeSnippet(elementText, title)) {
                return elementText;
            }
        }
        const wholeText = cleanText(container.text());
        return looksLikeSnippet(wholeText, title) ? wholeText.slice(0, 240) : undefined;
    };
    const results = [];
    const seen = new Set();
    const anchors = doc.selectAll(GOOGLE_RESULT_LINK_SELECTORS.join(', '));
    for (const anchor of anchors) {
        const rawHref = anchor.attr('href');
        if (!rawHref) {
            continue;
        }
        const resultUrl = resolveSearchResultUrl(rawHref);
        if (!resultUrl || seen.has(resultUrl)) {
            continue;
        }
        const title = titleOf(anchor);
        if (!title) {
            continue;
        }
        const container = anchor.parents(GOOGLE_RESULT_CONTAINER_SELECTORS).first();
        const snippet = snippetOf(container, title);
        results.push({
            rank: results.length + 1,
            title,
            url: resultUrl,
            snippet,
            displayedUrl: extractDisplayedUrl(resultUrl, anchor.text()),
        });
        seen.add(resultUrl);
        if (results.length >= limit) {
            break;
        }
    }
    // "Next page" link, resolved against the Google origin.
    let nextPageUrl;
    const nextLink = doc.selectFirst('a#pnnext, a[aria-label="Next"], a[id="pnnext"]').first();
    if (nextLink && nextLink.length) {
        const href = nextLink.attr('href');
        if (href) {
            try {
                nextPageUrl = new URL(href, GOOGLE_SEARCH_ORIGIN).toString();
            }
            catch {
                nextPageUrl = undefined;
            }
        }
    }
    // `start` offset carried by the next-page link, when it has one.
    let nextPageStart;
    if (nextPageUrl) {
        try {
            const startParam = new URL(nextPageUrl).searchParams.get('start');
            if (startParam) {
                const parsedStart = Number.parseInt(startParam, 10);
                nextPageStart = Number.isFinite(parsedStart) ? parsedStart : undefined;
            }
        }
        catch {
            nextPageStart = undefined;
        }
    }
    const resultStats = parseResultStats(doc.selectFirst('#result-stats').text());
    return {
        results,
        nextPageUrl,
        nextPageStart,
        resultStats,
    };
}
/**
 * Runs a Google search and returns the parsed results.
 *
 * Pipeline: normalise the options, build the search URL, download the page
 * with the selected transport, then parse the HTML.
 *
 * @param query Search query; must not be blank.
 * @param options Advanced search, transport and output options.
 * @returns The parsed response: results, paging data, result count, block
 *   detection outcome, transport details and HTTP status. `rawHtml` is
 *   included only when `includeRawHtml` is set.
 * @throws {ValidationError} For a blank query, an unresolvable `country`, or
 *   transport `'curl'` when the impersonate binary is not available.
 */
export async function searchGoogleAdvanced(query, options = {}) {
    const normalized = normalizeOptions(query, options);
    const searchUrl = buildSearchUrl(query, normalized);
    const { html, status, block, transport: used, fallbackUsed, impersonateAvailable, } = await fetchSearchResults(searchUrl, normalized);
    const { results, nextPageUrl, nextPageStart, resultStats } = parseSearchPage(html, normalized);
    return {
        query: normalized.query,
        searchUrl,
        results,
        transport: {
            requested: normalized.transport,
            used,
            fallbackUsed,
            impersonateAvailable,
        },
        status,
        block,
        nextPageUrl,
        nextPageStart,
        resultStats,
        // The raw page is attached only on request.
        ...(normalized.includeRawHtml ? { rawHtml: html } : {}),
    };
}
@@ -0,0 +1,3 @@
1
+ export { searchGoogleAdvanced } from './google.js';
2
+ export type { GoogleSearchAdvancedOptions, GoogleSearchResult, GoogleSearchResponse, SearchTransportDetails, } from './google.js';
3
+ export type { SearchTransport } from './google.js';
@@ -0,0 +1 @@
1
+ export { searchGoogleAdvanced } from './google.js';
@@ -12,6 +12,7 @@ export declare class SeoAnalyzer {
12
12
  static fromHtml(html: string, options?: SeoAnalyzerFullOptions): Promise<SeoAnalyzer>;
13
13
  analyze(): SeoReport;
14
14
  private getMainBody;
15
+ private detectPageType;
15
16
  private getVisibleText;
16
17
  private buildRuleContext;
17
18
  private analyzeUrlQuality;
@@ -32,7 +33,6 @@ export declare class SeoAnalyzer {
32
33
  private analyzeAnalytics;
33
34
  private analyzeFeeds;
34
35
  private analyzeConversionElements;
35
- private analyzeAdvancedImages;
36
36
  private calculateTextHtmlRatio;
37
37
  private convertToCheckResults;
38
38
  private buildSummary;