recker 1.0.72 → 1.0.75-next.2e5a94f
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -18
- package/dist/browser/core/client.d.ts +14 -8
- package/dist/browser/core/client.js +199 -17
- package/dist/browser/core/errors.d.ts +15 -1
- package/dist/browser/core/errors.js +140 -9
- package/dist/browser/core/request.d.ts +5 -0
- package/dist/browser/core/request.js +33 -2
- package/dist/browser/core-runtime/plugin-manifest.d.ts +24 -0
- package/dist/browser/core-runtime/plugin-manifest.js +159 -0
- package/dist/browser/core-runtime/request-context.d.ts +13 -0
- package/dist/browser/core-runtime/request-context.js +24 -0
- package/dist/browser/core-runtime/typed-events.d.ts +89 -0
- package/dist/browser/core-runtime/typed-events.js +34 -0
- package/dist/browser/index.iife.min.js +79 -79
- package/dist/browser/index.min.js +79 -79
- package/dist/browser/index.mini.iife.js +913 -97
- package/dist/browser/index.mini.iife.min.js +46 -46
- package/dist/browser/index.mini.min.js +46 -46
- package/dist/browser/index.mini.umd.js +913 -97
- package/dist/browser/index.mini.umd.min.js +46 -46
- package/dist/browser/index.umd.min.js +79 -79
- package/dist/browser/plugins/auth/aws-sigv4.d.ts +1 -0
- package/dist/browser/plugins/auth/aws-sigv4.js +19 -2
- package/dist/browser/plugins/retry.js +29 -1
- package/dist/browser/presets/aws.d.ts +1 -0
- package/dist/browser/presets/aws.js +62 -1
- package/dist/browser/runner/request-runner.d.ts +15 -5
- package/dist/browser/runner/request-runner.js +164 -30
- package/dist/browser/scrape/parser/nodes/html.d.ts +6 -0
- package/dist/browser/scrape/parser/nodes/html.js +70 -18
- package/dist/browser/scrape/parser/nodes/node.d.ts +1 -0
- package/dist/browser/scrape/parser/nodes/node.js +5 -0
- package/dist/browser/scrape/spider.d.ts +1 -0
- package/dist/browser/scrape/spider.js +39 -26
- package/dist/browser/seo/analyzer.d.ts +1 -1
- package/dist/browser/seo/analyzer.js +73 -42
- package/dist/browser/seo/index.d.ts +1 -1
- package/dist/browser/seo/rules/types.d.ts +2 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -3
- package/dist/browser/seo/seo-spider.js +26 -202
- package/dist/browser/seo/types.d.ts +4 -0
- package/dist/browser/seo/validators/sitemap.js +9 -2
- package/dist/browser/transport/fetch.js +38 -5
- package/dist/browser/transport/undici.js +73 -11
- package/dist/browser/transport/worker.d.ts +0 -1
- package/dist/browser/transport/worker.js +1 -3
- package/dist/browser/types/index.d.ts +24 -0
- package/dist/cli/commands/mcp.js +5 -3
- package/dist/core/client.d.ts +14 -8
- package/dist/core/client.js +199 -17
- package/dist/core/errors.d.ts +15 -1
- package/dist/core/errors.js +140 -9
- package/dist/core/request.d.ts +5 -0
- package/dist/core/request.js +33 -2
- package/dist/core-runtime/plugin-manifest.d.ts +24 -0
- package/dist/core-runtime/plugin-manifest.js +159 -0
- package/dist/core-runtime/request-context.d.ts +13 -0
- package/dist/core-runtime/request-context.js +24 -0
- package/dist/core-runtime/typed-events.d.ts +89 -0
- package/dist/core-runtime/typed-events.js +34 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/mcp/cli.js +10 -8
- package/dist/mcp/profiles.d.ts +1 -1
- package/dist/mcp/profiles.js +31 -6
- package/dist/mcp/tools/categories.js +0 -1
- package/dist/mcp/tools/seo.js +320 -4
- package/dist/plugins/auth/aws-sigv4.d.ts +1 -0
- package/dist/plugins/auth/aws-sigv4.js +19 -2
- package/dist/plugins/retry.js +29 -1
- package/dist/presets/aws.d.ts +1 -0
- package/dist/presets/aws.js +62 -1
- package/dist/recker.d.ts +3 -0
- package/dist/recker.js +5 -0
- package/dist/runner/request-runner.d.ts +15 -5
- package/dist/runner/request-runner.js +164 -30
- package/dist/scrape/parser/nodes/html.d.ts +6 -0
- package/dist/scrape/parser/nodes/html.js +70 -18
- package/dist/scrape/parser/nodes/node.d.ts +1 -0
- package/dist/scrape/parser/nodes/node.js +5 -0
- package/dist/scrape/spider.d.ts +1 -0
- package/dist/scrape/spider.js +39 -26
- package/dist/search/google.d.ts +67 -0
- package/dist/search/google.js +480 -0
- package/dist/search/index.d.ts +3 -0
- package/dist/search/index.js +1 -0
- package/dist/seo/analyzer.d.ts +1 -1
- package/dist/seo/analyzer.js +73 -42
- package/dist/seo/index.d.ts +1 -1
- package/dist/seo/rules/types.d.ts +2 -0
- package/dist/seo/seo-spider.d.ts +2 -3
- package/dist/seo/seo-spider.js +26 -202
- package/dist/seo/types.d.ts +4 -0
- package/dist/seo/validators/sitemap.js +9 -2
- package/dist/transport/fetch.js +38 -5
- package/dist/transport/undici.js +73 -11
- package/dist/transport/worker.d.ts +0 -1
- package/dist/transport/worker.js +1 -3
- package/dist/types/index.d.ts +24 -0
- package/dist/version.js +1 -1
- package/package.json +9 -1
package/dist/scrape/spider.d.ts
CHANGED
|
@@ -123,6 +123,7 @@ export declare class Spider {
|
|
|
123
123
|
private robotsData;
|
|
124
124
|
private sitemapValidation;
|
|
125
125
|
private robotsValidation;
|
|
126
|
+
private toHeaderRecord;
|
|
126
127
|
constructor(options?: SpiderOptions);
|
|
127
128
|
crawl(startUrl: string): Promise<SpiderResult>;
|
|
128
129
|
private fetchRobotsTxt;
|
package/dist/scrape/spider.js
CHANGED
|
@@ -76,9 +76,6 @@ function shouldCrawl(url, baseHost, options) {
|
|
|
76
76
|
return false;
|
|
77
77
|
}
|
|
78
78
|
}
|
|
79
|
-
function sleep(ms) {
|
|
80
|
-
return new Promise(resolve => setTimeout(resolve, ms));
|
|
81
|
-
}
|
|
82
79
|
function parseExtractSelectors(selectors) {
|
|
83
80
|
const schema = {};
|
|
84
81
|
for (const sel of selectors) {
|
|
@@ -115,6 +112,13 @@ export class Spider {
|
|
|
115
112
|
robotsData = null;
|
|
116
113
|
sitemapValidation = null;
|
|
117
114
|
robotsValidation = null;
|
|
115
|
+
toHeaderRecord(headers) {
|
|
116
|
+
const headerRecord = {};
|
|
117
|
+
headers.forEach((value, key) => {
|
|
118
|
+
headerRecord[key] = value;
|
|
119
|
+
});
|
|
120
|
+
return headerRecord;
|
|
121
|
+
}
|
|
118
122
|
constructor(options = {}) {
|
|
119
123
|
let extractSchema;
|
|
120
124
|
if (options.extract) {
|
|
@@ -194,7 +198,7 @@ export class Spider {
|
|
|
194
198
|
await this.fetchSitemaps(baseUrl);
|
|
195
199
|
}
|
|
196
200
|
const pending = new Map();
|
|
197
|
-
const scheduleUrl = (item
|
|
201
|
+
const scheduleUrl = (item) => {
|
|
198
202
|
const normalized = normalizeUrl(item.url);
|
|
199
203
|
if (this.visited.has(normalized))
|
|
200
204
|
return;
|
|
@@ -230,7 +234,7 @@ export class Spider {
|
|
|
230
234
|
try {
|
|
231
235
|
const urlHost = new URL(sitemapUrl.loc).hostname;
|
|
232
236
|
if (urlHost === this.baseHost) {
|
|
233
|
-
scheduleUrl({ url: sitemapUrl.loc, depth: 1 }
|
|
237
|
+
scheduleUrl({ url: sitemapUrl.loc, depth: 1 });
|
|
234
238
|
}
|
|
235
239
|
}
|
|
236
240
|
catch {
|
|
@@ -303,7 +307,7 @@ export class Spider {
|
|
|
303
307
|
return {
|
|
304
308
|
status: response.status,
|
|
305
309
|
text: await response.text(),
|
|
306
|
-
headers:
|
|
310
|
+
headers: this.toHeaderRecord(response.headers),
|
|
307
311
|
};
|
|
308
312
|
};
|
|
309
313
|
try {
|
|
@@ -351,40 +355,49 @@ export class Spider {
|
|
|
351
355
|
}
|
|
352
356
|
buildSitemapAnalysis() {
|
|
353
357
|
const crawledUrls = new Set(this.results.map(r => normalizeUrl(r.url)));
|
|
354
|
-
const
|
|
358
|
+
const sitemapUrlSet = this.sitemapUrlSet.size > 0
|
|
359
|
+
? this.sitemapUrlSet
|
|
360
|
+
: new Set(this.sitemapUrls.map((u) => normalizeUrl(u.loc)));
|
|
361
|
+
const crawledFromSitemap = Array.from(sitemapUrlSet)
|
|
362
|
+
.filter(url => crawledUrls.has(url))
|
|
363
|
+
.length;
|
|
355
364
|
const linkedUrls = new Set();
|
|
356
|
-
|
|
357
|
-
for (const link of page.links) {
|
|
358
|
-
if (link.href) {
|
|
359
|
-
linkedUrls.add(normalizeUrl(link.href));
|
|
360
|
-
}
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
const orphanUrls = this.sitemapUrls
|
|
364
|
-
.filter(u => {
|
|
365
|
-
const normalized = normalizeUrl(u.loc);
|
|
366
|
-
return !linkedUrls.has(normalized) && crawledUrls.has(normalized);
|
|
367
|
-
})
|
|
368
|
-
.map(u => u.loc);
|
|
369
|
-
const missingFromSitemap = Array.from(crawledUrls)
|
|
370
|
-
.filter(url => !this.sitemapUrlSet.has(url));
|
|
371
|
-
const blockedBySitemapRobots = [];
|
|
365
|
+
const blockedBySitemapRobotsSet = new Set();
|
|
372
366
|
if (this.robotsData) {
|
|
373
367
|
for (const sitemapUrl of this.sitemapUrls) {
|
|
374
368
|
try {
|
|
369
|
+
const normalized = normalizeUrl(sitemapUrl.loc);
|
|
375
370
|
const urlPath = new URL(sitemapUrl.loc).pathname;
|
|
376
371
|
if (!isPathAllowed(this.robotsData, urlPath, this.options.userAgent)) {
|
|
377
|
-
|
|
372
|
+
blockedBySitemapRobotsSet.add(normalized);
|
|
378
373
|
}
|
|
379
374
|
}
|
|
380
375
|
catch {
|
|
381
376
|
}
|
|
382
377
|
}
|
|
383
378
|
}
|
|
379
|
+
for (const page of this.results) {
|
|
380
|
+
for (const link of page.links) {
|
|
381
|
+
if (link.href) {
|
|
382
|
+
linkedUrls.add(normalizeUrl(link.href));
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
const orphanUrlSet = new Set();
|
|
387
|
+
for (const u of this.sitemapUrls) {
|
|
388
|
+
const normalized = normalizeUrl(u.loc);
|
|
389
|
+
if (!linkedUrls.has(normalized) && !blockedBySitemapRobotsSet.has(normalized)) {
|
|
390
|
+
orphanUrlSet.add(normalized);
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
const orphanUrls = Array.from(orphanUrlSet);
|
|
394
|
+
const missingFromSitemap = Array.from(crawledUrls)
|
|
395
|
+
.filter(url => !sitemapUrlSet.has(url));
|
|
396
|
+
const blockedBySitemapRobots = Array.from(blockedBySitemapRobotsSet);
|
|
384
397
|
return {
|
|
385
398
|
found: this.sitemapUrls.length > 0,
|
|
386
|
-
url: this.
|
|
387
|
-
totalUrls:
|
|
399
|
+
url: this.sitemapUrls[0]?.loc,
|
|
400
|
+
totalUrls: sitemapUrlSet.size,
|
|
388
401
|
crawledFromSitemap,
|
|
389
402
|
orphanUrls,
|
|
390
403
|
missingFromSitemap,
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { type BlockDetectionResult } from '../utils/block-detector.js';
|
|
2
|
+
/**
 * Transport selection for a search request: `'auto'` (the default applied by
 * the implementation), `'undici'`, or `'curl'`.
 */
export type SearchTransport = 'auto' | 'undici' | 'curl';
|
|
4
|
+
/**
 * Options accepted by `searchGoogleAdvanced`.
 *
 * Each `as*` option exists in camelCase and snake_case; when both are
 * supplied, the snake_case spelling is read first. Empty values are not sent.
 */
export interface GoogleSearchAdvancedOptions {
    /** Sent as the `as_q` query parameter. */
    asQ?: string;
    as_q?: string;
    /** Sent as the `as_epq` query parameter. */
    asEpq?: string;
    as_epq?: string;
    /** Sent as the `as_oq` query parameter. */
    asOq?: string;
    as_oq?: string;
    /** Sent as the `as_eq` query parameter. */
    asEq?: string;
    as_eq?: string;
    /** Sent as the `as_sitesearch` query parameter. */
    asSitesearch?: string;
    as_sitesearch?: string;
    /** Sent as the `as_filetype` query parameter. */
    asFiletype?: string;
    as_filetype?: string;
    /** Sent as the `as_rights` query parameter. */
    asRights?: string;
    as_rights?: string;
    /** Sent as the `as_nlo` query parameter (stringified and trimmed). */
    asNlo?: number | string;
    as_nlo?: number | string;
    /** Sent as the `as_nhi` query parameter (stringified and trimmed). */
    asNhi?: number | string;
    as_nhi?: number | string;
    /** Sent as the `safe` query parameter. */
    safe?: string;
    /** Sent as the `tbm` query parameter. */
    tbm?: string;
    /** Sent as `num` when finite and positive; floored and capped at 100. */
    num?: number;
    /** Sent as `start` when finite and non-negative; floored. */
    start?: number;
    /** Sent as the `tbs` query parameter. */
    tbs?: string;
    /** Sent as the `lr` query parameter. */
    lr?: string;
    /** Sent as the `cr` query parameter. */
    cr?: string;
    /**
     * Country code or known country name, resolved to the `gl` parameter.
     * Takes precedence over `gl`; an unresolvable value raises a validation error.
     */
    country?: string;
    /** Sent as `gl` (trimmed, lower-cased) when `country` is not provided. */
    gl?: string;
    /** Sent as the `hl` query parameter. */
    hl?: string;
    /** Transport to use; defaults to `'auto'`. */
    transport?: SearchTransport;
    /** Timeout forwarded to the HTTP client / curl request. */
    timeout?: number;
    /** Stop collecting results once this many have been parsed. */
    maxResults?: number;
    /** Extra query parameters, set last; booleans are sent as `1` / `0`. */
    extraParams?: Record<string, string | number | boolean>;
    /** User-Agent header; a random desktop Chrome user agent is used when omitted. */
    userAgent?: string;
    /** Extra request headers, applied on top of the built-in defaults. */
    headers?: HeadersInit;
    /** When true, the response carries the fetched HTML in `rawHtml`. Defaults to false. */
    includeRawHtml?: boolean;
}
|
|
41
|
+
/** One parsed entry of a search results page. */
export interface GoogleSearchResult {
    /** 1-based position among the parsed results. */
    rank: number;
    /** Whitespace-collapsed title: the anchor's `h3` text, or the anchor text. */
    title: string;
    /** Target URL extracted from the redirect link. */
    url: string;
    /** Snippet text found inside the result container, when one was found. */
    snippet?: string;
    /** Hostname of `url`, or up to 120 characters of anchor text as a fallback. */
    displayedUrl?: string;
    /** NOTE(review): not populated by the parser in this package version — confirm intended use. */
    source?: string;
}
|
|
49
|
+
/** Describes which transport was asked for and which one produced the page. */
export interface SearchTransportDetails {
    /** Transport requested by the caller (after defaulting to `'auto'`). */
    requested: SearchTransport;
    /** Transport that produced the returned HTML. */
    used: SearchTransport;
    /** True when `'auto'` mode attempted curl first and then fell back to undici. */
    fallbackUsed: boolean;
    /** Whether the curl-impersonate binary was detected (always false when `'undici'` was requested). */
    impersonateAvailable: boolean;
}
|
|
55
|
+
/** Result of `searchGoogleAdvanced`. */
export interface GoogleSearchResponse {
    /** The query after whitespace normalization. */
    query: string;
    /** Full URL that was requested. */
    searchUrl: string;
    /** Parsed results, in page order. */
    results: GoogleSearchResult[];
    /** Absolute URL of the "next page" link, when the page has one. */
    nextPageUrl?: string;
    /** `start` parameter parsed from `nextPageUrl`, when present. */
    nextPageStart?: number;
    /** Number parsed from the page's `#result-stats` element, when present. */
    resultStats?: number;
    /** Block-detection verdict for the fetched page. */
    block?: BlockDetectionResult;
    /** Transport bookkeeping for this request. */
    transport: SearchTransportDetails;
    /** HTTP status of the fetched page. */
    status?: number;
    /** Fetched HTML; only set when `includeRawHtml` was requested. */
    rawHtml?: string;
}
|
|
67
|
+
/**
 * Runs a Google search and parses the results page.
 *
 * @param query Search text; must be non-empty after whitespace normalization.
 * @param options Advanced search parameters and transport settings.
 * @returns Parsed results plus transport, status and block-detection details.
 * @throws A validation error for an empty query, an unresolvable `country`, or
 *   `transport: 'curl'` when curl-impersonate is not installed.
 */
export declare function searchGoogleAdvanced(query: string, options?: GoogleSearchAdvancedOptions): Promise<GoogleSearchResponse>;
|
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
import { createClient } from '../core/client.js';
|
|
2
|
+
import { ValidationError } from '../core/errors.js';
|
|
3
|
+
import { HttpRequest } from '../core/request.js';
|
|
4
|
+
import { ScrapeDocument } from '../scrape/document.js';
|
|
5
|
+
import { detectBlock } from '../utils/block-detector.js';
|
|
6
|
+
import { getRandomUserAgent } from '../utils/user-agent.js';
|
|
7
|
+
/** Endpoint every search request is sent to. */
const GOOGLE_SEARCH_BASE_URL = 'https://www.google.com/search';
/** Origin used as the Referer header and as the base for resolving relative links. */
const GOOGLE_SEARCH_ORIGIN = 'https://www.google.com';
/** Snippet selectors, tried in this order inside a result container. */
const GOOGLE_RESULT_SNIPPET_SELECTOR_ORDER = [
    '[data-sncf="1"]',
    'div[data-sncf="1"]',
    'span.aCOpRe',
    'div.aCOpRe',
    'div.VwiC3b',
    'span.VwiC3b',
    'div.BNeawe',
    'div.yXK7lf',
    'div[data-attrid="wa:/description"]',
];
/** Anchors whose href is a Google redirect link to a result. */
const GOOGLE_RESULT_LINK_SELECTORS = [
    'a[href^="/url?q="]',
    'a[href^="https://www.google.com/url?"]',
    'a[href^="http://www.google.com/url?"]',
];
/** Ancestor selectors used to locate the container that holds a result's snippet. */
const GOOGLE_RESULT_CONTAINER_SELECTORS = '[data-hveid], [data-ved], div[class*="g"], div[class*="MjjY"], div[class*="tF2Cxc"], [class*="xpd"]';
/** Two lower-case ASCII letters, i.e. the shape of an ISO 3166-1 alpha-2 code. */
const COUNTRY_CODE_PATTERN = /^[a-z]{2}$/;
/** Country names and spellings mapped to their two-letter code. */
const COUNTRY_ALIASES = {
    us: 'us',
    usa: 'us',
    united_states: 'us',
    'united states': 'us',
    br: 'br',
    brasil: 'br',
    brazil: 'br',
    pt_br: 'br',
    // `pt` is Portugal's own code; mapping it to `br` contradicted the
    // two-letter short-circuit in normalizeCountryCode, which returns `pt`.
    pt: 'pt',
    de: 'de',
    germany: 'de',
    deutschland: 'de',
    gb: 'gb',
    uk: 'gb',
    england: 'gb',
    britain: 'gb',
    'united kingdom': 'gb',
    fr: 'fr',
    france: 'fr',
    spain: 'es',
    españa: 'es',
    es: 'es',
    ca: 'ca',
    mexico: 'mx',
    mx: 'mx',
    it: 'it',
    italy: 'it',
    au: 'au',
    india: 'in',
    in: 'in',
    argentina: 'ar',
    ar: 'ar',
};
/**
 * Alias table keyed by the lower-cased alias with every non `a-z` character
 * replaced by `_` (so `'united states'` becomes `united_states`).
 *
 * Built on a prototype-less object so that user input such as `constructor`
 * or `toString` can never resolve to an inherited `Object.prototype` member.
 */
const COUNTRY_ALIASES_NORMALIZED = Object.entries(COUNTRY_ALIASES).reduce((acc, [key, value]) => {
    acc[key.toLowerCase().replace(/[^a-z]/g, '_')] = value;
    return acc;
}, Object.create(null));
|
|
65
|
+
/**
 * Collapses every run of whitespace into a single space and removes leading
 * and trailing whitespace.
 *
 * @param value Text to tidy up.
 * @returns The whitespace-normalized text.
 */
function cleanText(value) {
    // Splitting on whitespace runs leaves empty pieces only at the edges.
    const words = value.split(/\s+/).filter((piece) => piece.length > 0);
    return words.join(' ');
}
|
|
68
|
+
/**
 * Tells whether a value is neither `undefined` nor `null`.
 *
 * @param value Anything.
 * @returns `false` for `undefined` and `null`, `true` otherwise.
 */
function isDefined(value) {
    // Loose comparison with null matches exactly null and undefined.
    return value != null;
}
|
|
71
|
+
/**
 * Converts an option value into the string sent as a query parameter.
 *
 * @param value String, number, boolean, or nothing.
 * @returns `'1'` / `'0'` for booleans, the trimmed string form for other
 *   values, or `undefined` when the value is absent (`undefined` / `null`) or
 *   blank after trimming.
 */
function toParamValue(value) {
    // null used to be stringified into the literal "null" and sent to Google.
    if (value === undefined || value === null) {
        return undefined;
    }
    if (typeof value === 'boolean') {
        return value ? '1' : '0';
    }
    const trimmed = String(value).trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
|
|
79
|
+
/**
 * Returns the first argument that is not `undefined`.
 *
 * @param values Candidates, in priority order.
 * @returns The first candidate other than `undefined` (which may be `null`),
 *   or `undefined` when there is none.
 */
function pick(...values) {
    return values.find((candidate) => candidate !== undefined);
}
|
|
87
|
+
/**
 * Resolves free-form country input to a two-letter lower-case code.
 *
 * Resolution order:
 *   1. input that already looks like a two-letter code is returned as is;
 *   2. a known alias (country name or spelling) is looked up;
 *   3. for compound input such as `en_us` / `en-US`, a trailing two-letter
 *      segment is used.
 *
 * @param input Country code or name as typed by the caller.
 * @returns The two-letter code, or `''` when nothing could be resolved
 *   (including non-string input).
 */
function normalizeCountryCode(input) {
    if (typeof input !== 'string') {
        return '';
    }
    const normalized = input.normalize('NFC').trim().toLowerCase().replace(/\s+/g, '_');
    if (COUNTRY_CODE_PATTERN.test(normalized)) {
        return normalized;
    }
    // Only own string entries count: a bare `table[key]` would also find
    // inherited members such as `constructor` on a plain object.
    const lookupAlias = (key) => {
        if (!key || !Object.prototype.hasOwnProperty.call(COUNTRY_ALIASES_NORMALIZED, key)) {
            return '';
        }
        const alias = COUNTRY_ALIASES_NORMALIZED[key];
        return typeof alias === 'string' ? alias : '';
    };
    // Legacy key: characters outside a-z/_ are dropped ("u.s.a" -> "usa").
    const strippedKey = normalized.replace(/[^a-z_]/g, '');
    // Table key: the same rule the alias table is built with, where every
    // character outside a-z becomes "_" ("españa" -> "espa_a").
    const tableKey = normalized.replace(/[^a-z]/g, '_');
    const alias = lookupAlias(strippedKey) || lookupAlias(tableKey);
    if (alias) {
        return alias;
    }
    if (tableKey.includes('_')) {
        const tail = tableKey.split('_').pop() || '';
        if (COUNTRY_CODE_PATTERN.test(tail)) {
            return tail;
        }
    }
    return '';
}
|
|
105
|
+
/**
 * Chooses the value for the `gl` parameter.
 *
 * @param country Preferred input: a country code or known country name.
 * @param legacyGl Raw `gl` value, used only when `country` is absent.
 * @returns The resolved code from `country`; otherwise `legacyGl` trimmed and
 *   lower-cased; otherwise `''`.
 * @throws ValidationError when `country` is provided but cannot be resolved
 *   (including when it is not a string).
 */
function resolveCountryCode(country, legacyGl) {
    // null is treated like "not provided" instead of crashing on `.trim()`.
    if (country !== undefined && country !== null) {
        const resolved = typeof country === 'string' ? normalizeCountryCode(country) : '';
        if (!resolved) {
            throw new ValidationError('Invalid country for Google search. Use ISO 3166-1 alpha-2 code or a known country name.', {
                field: 'country',
                value: country,
            });
        }
        return resolved;
    }
    return typeof legacyGl === 'string' ? legacyGl.trim().toLowerCase() : '';
}
|
|
118
|
+
/**
 * Validates the query and folds the caller's options into the shape consumed
 * by the URL builder, the fetcher and the parser.
 *
 * All caller options are kept; on top of them the `as*` parameters are
 * resolved from their snake_case / camelCase spellings (snake_case first,
 * `''` when absent), `gl` is resolved from `country` / `gl`, `transport`
 * defaults to `'auto'` and `includeRawHtml` defaults to `false`.
 *
 * @param query Search text.
 * @param options Caller options; `undefined` and `null` mean "no options".
 * @returns The normalized options, including the whitespace-normalized `query`.
 * @throws ValidationError when the query is not a string or is blank, or when
 *   `country` cannot be resolved.
 */
function normalizeOptions(query, options = {}) {
    // The default parameter only covers undefined; null would crash below.
    const opts = options ?? {};
    // A non-string query is reported as a validation problem instead of a
    // TypeError thrown from inside cleanText.
    const normalizedQuery = typeof query === 'string' ? cleanText(query) : '';
    if (!normalizedQuery) {
        throw new ValidationError('Google query is required', {
            field: 'query',
            value: query,
        });
    }
    const resolvedTransport = (opts.transport ?? 'auto');
    return {
        ...opts,
        as_q: pick(opts.as_q, opts.asQ) ?? '',
        query: normalizedQuery,
        asEpq: pick(opts.as_epq, opts.asEpq) ?? '',
        asOq: pick(opts.as_oq, opts.asOq) ?? '',
        asEq: pick(opts.as_eq, opts.asEq) ?? '',
        as_sitesearch: pick(opts.as_sitesearch, opts.asSitesearch) ?? '',
        as_filetype: pick(opts.as_filetype, opts.asFiletype) ?? '',
        as_rights: pick(opts.as_rights, opts.asRights) ?? '',
        as_nlo: toParamValue(pick(opts.as_nlo, opts.asNlo)) ?? '',
        as_nhi: toParamValue(pick(opts.as_nhi, opts.asNhi)) ?? '',
        gl: resolveCountryCode(opts.country, opts.gl),
        transport: resolvedTransport,
        includeRawHtml: opts.includeRawHtml ?? false,
    };
}
|
|
144
|
+
/**
 * Assembles the search URL for a query and normalized options.
 *
 * Parameters are appended in a fixed order: `q`, `ie`, `oe`, the text
 * parameters that have a truthy value, then `num`, `start`, `tbs`, and finally
 * every usable entry of `extraParams` (which may overwrite earlier ones).
 *
 * @param query Search text sent as `q`.
 * @param options Normalized options (as produced by the options normalizer).
 * @returns The absolute search URL.
 */
function buildSearchUrl(query, options) {
    // [query parameter name, option property holding its value]
    const textParams = [
        ['as_q', 'as_q'],
        ['as_epq', 'asEpq'],
        ['as_oq', 'asOq'],
        ['as_eq', 'asEq'],
        ['as_sitesearch', 'as_sitesearch'],
        ['as_filetype', 'as_filetype'],
        ['as_rights', 'as_rights'],
        ['as_nlo', 'as_nlo'],
        ['as_nhi', 'as_nhi'],
        ['safe', 'safe'],
        ['tbm', 'tbm'],
        ['lr', 'lr'],
        ['cr', 'cr'],
        ['gl', 'gl'],
        ['hl', 'hl'],
    ];
    const search = new URLSearchParams();
    search.set('q', query);
    search.set('ie', 'UTF-8');
    search.set('oe', 'UTF-8');
    for (const [paramName, optionKey] of textParams) {
        const optionValue = options[optionKey];
        if (optionValue) {
            search.set(paramName, optionValue);
        }
    }
    // Page size: positive finite numbers only, floored and capped at 100.
    const pageSize = isDefined(options.num) ? Number(options.num) : Number.NaN;
    if (Number.isFinite(pageSize) && pageSize > 0) {
        search.set('num', String(Math.min(100, Math.floor(pageSize))));
    }
    // Offset: non-negative finite numbers only, floored.
    const offset = isDefined(options.start) ? Number(options.start) : Number.NaN;
    if (Number.isFinite(offset) && offset >= 0) {
        search.set('start', String(Math.floor(offset)));
    }
    if (options.tbs) {
        search.set('tbs', options.tbs);
    }
    for (const [extraName, extraValue] of Object.entries(options.extraParams || {})) {
        const usable = toParamValue(extraValue);
        if (usable !== undefined) {
            search.set(extraName, usable);
        }
    }
    return `${GOOGLE_SEARCH_BASE_URL}?${search.toString()}`;
}
|
|
203
|
+
/**
 * Builds the request headers: built-in browser-like defaults, overridden by
 * whatever the caller supplied.
 *
 * @param inputHeaders Optional caller headers in any form `Headers` accepts.
 * @param userAgent Value for the `user-agent` default.
 * @returns Plain object of header name to value, as enumerated by `Headers`.
 */
function normalizeRequestHeaders(inputHeaders, userAgent) {
    const defaults = {
        accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'user-agent': userAgent,
        referer: GOOGLE_SEARCH_ORIGIN,
    };
    const combined = new Headers(defaults);
    if (inputHeaders) {
        // Caller-supplied values replace the defaults of the same name.
        for (const [name, value] of new Headers(inputHeaders).entries()) {
            combined.set(name, value);
        }
    }
    const record = {};
    for (const [name, value] of combined.entries()) {
        record[name] = value;
    }
    return record;
}
|
|
223
|
+
/**
 * Reports whether the curl-impersonate binary is available.
 *
 * The binary manager is loaded lazily. Any failure, whether loading the
 * module or running the check itself, is treated as "not available".
 *
 * @returns Whatever `hasImpersonate()` reports, or `false` on any failure.
 */
async function hasImpersonateBinary() {
    try {
        const { hasImpersonate } = await import('../utils/binary-manager.js');
        // `await` keeps a rejected promise from hasImpersonate() inside this
        // try block; a bare `return hasImpersonate()` would let it escape.
        return await hasImpersonate();
    }
    catch {
        return false;
    }
}
|
|
232
|
+
/**
 * Performs a GET request through the lazily loaded curl transport.
 *
 * @param url Absolute URL to fetch.
 * @param headers Request headers.
 * @param timeout Timeout forwarded to the request.
 * @returns The response body as text together with the HTTP status.
 */
async function fetchWithCurl(url, headers, timeout) {
    const curlModule = await import('../transport/curl.js');
    const curl = new curlModule.CurlTransport();
    const requestInit = { method: 'GET', headers, timeout };
    const reply = await curl.dispatch(new HttpRequest(url, requestInit));
    const body = await reply.text();
    return { html: body, status: reply.status };
}
|
|
244
|
+
/**
 * Downloads the results page using the requested transport strategy.
 *
 * - `'curl'`: curl-impersonate only; fails validation when it is not installed.
 * - `'undici'`: the regular HTTP client only (the binary check is skipped).
 * - anything else (`'auto'`): curl-impersonate when installed; if that attempt
 *   throws or is detected as blocked with confidence above 0.7, the regular
 *   client is used instead and `fallbackUsed` is reported as `true`.
 *
 * @param url Search URL to fetch.
 * @param options Normalized options (`headers`, `userAgent`, `timeout`, `transport`).
 * @returns `{ html, status, transport, fallbackUsed, impersonateAvailable, block }`.
 * @throws ValidationError when `'curl'` is requested without curl-impersonate.
 */
async function fetchSearchResults(url, options) {
    const userAgent = options.userAgent ?? getRandomUserAgent('desktop.chrome');
    const headers = normalizeRequestHeaders(options.headers, userAgent);
    const requestTimeout = options.timeout;
    const mode = options.transport;
    const impersonateAvailable = mode !== 'undici' && (await hasImpersonateBinary());
    // The curl helper exposes no response headers, so block detection gets an
    // empty header set.
    const viaCurl = async () => {
        const { html, status } = await fetchWithCurl(url, headers, requestTimeout);
        const block = detectBlock({ status, headers: new Headers() }, html);
        return {
            html,
            status,
            transport: 'curl',
            fallbackUsed: false,
            impersonateAvailable,
            block,
        };
    };
    if (mode === 'curl') {
        if (!impersonateAvailable) {
            throw new ValidationError('Transport "curl" requires curl-impersonate; install it with `rek setup`', {
                field: 'transport',
                value: mode,
            });
        }
        return viaCurl();
    }
    const client = createClient({ timeout: requestTimeout });
    const viaUndici = async () => {
        const reply = await client.get(url, { headers });
        const html = await reply.text();
        const block = detectBlock({ status: reply.status, headers: reply.headers }, html);
        return {
            html,
            status: reply.status,
            transport: 'undici',
            fallbackUsed: false,
            impersonateAvailable,
            block,
        };
    };
    if (mode === 'undici' || !impersonateAvailable) {
        return viaUndici();
    }
    try {
        const attempt = await viaCurl();
        const verdict = attempt.block;
        if (!verdict.blocked || verdict.confidence <= 0.7) {
            return attempt;
        }
    }
    catch {
        // Best effort: any curl failure falls through to the regular client.
    }
    const fallback = await viaUndici();
    return {
        ...fallback,
        transport: fallback.transport,
        fallbackUsed: true,
        impersonateAvailable,
    };
}
|
|
310
|
+
/**
 * Extracts the real target from a Google redirect link (`/url?q=...` or
 * `/url?url=...`).
 *
 * @param rawHref The anchor's `href`; may be relative or protocol-relative.
 * @returns The absolute http(s) target URL, or `null` when the link has no
 *   target, is not http(s), cannot be parsed, or points back at Google's own
 *   `/search` page.
 */
function resolveSearchResultUrl(rawHref) {
    if (typeof rawHref !== 'string' || rawHref.length === 0) {
        return null;
    }
    try {
        const absoluteHref = rawHref.startsWith('//') ? `https:${rawHref}` : rawHref;
        const redirect = new URL(absoluteHref, GOOGLE_SEARCH_ORIGIN);
        // searchParams.get() already percent-decodes the value once. Decoding
        // again would turn an escaped "%26" / "%25" inside the target into a
        // literal "&" / "%", changing the URL or making it undecodable.
        const target = redirect.searchParams.get('q') ?? redirect.searchParams.get('url');
        if (!target) {
            return null;
        }
        const candidate = new URL(target, GOOGLE_SEARCH_ORIGIN);
        if (candidate.protocol !== 'http:' && candidate.protocol !== 'https:') {
            return null;
        }
        if (candidate.hostname === 'www.google.com' && candidate.pathname === '/search') {
            return null;
        }
        return candidate.toString();
    }
    catch {
        return null;
    }
}
|
|
328
|
+
/**
 * Produces the short "displayed URL" for a result.
 *
 * @param linkUrl Result URL; its hostname is preferred.
 * @param containerText Fallback text, used when no hostname can be derived.
 * @returns The hostname of `linkUrl`; otherwise the first 120 characters of
 *   the whitespace-normalized fallback text; otherwise `''`.
 */
function extractDisplayedUrl(linkUrl, containerText) {
    let hostname = '';
    try {
        hostname = new URL(linkUrl).hostname;
    }
    catch {
        hostname = '';
    }
    if (hostname) {
        return cleanText(hostname);
    }
    if (!containerText) {
        return '';
    }
    return cleanText(containerText).slice(0, 120);
}
|
|
341
|
+
/**
 * Heuristic deciding whether a piece of text can serve as a result snippet.
 *
 * @param text Candidate text (whitespace is normalized before checking).
 * @param title Title of the result the snippet would belong to.
 * @returns `true` when the normalized text is 25 to 600 characters long,
 *   differs from the title and does not start with `http://` or `https://`.
 */
function looksLikeSnippet(text, title) {
    const candidate = cleanText(text);
    const lengthOk = candidate.length >= 25 && candidate.length <= 600;
    const startsWithUrl = /^https?:\/\//i.test(candidate);
    return lengthOk && candidate !== title && !startsWithUrl;
}
|
|
351
|
+
/**
 * Extracts the result count from the stats line of a results page, e.g.
 * `About 1,234,000 results (0.32 seconds)` or
 * `Aproximadamente 1.234.000 resultados (0,32 segundos)`.
 *
 * @param text Text of the stats element.
 * @returns The count as an integer, or `undefined` when the text is missing
 *   or contains no "<number> result..." phrase.
 */
function parseResultStats(text) {
    if (typeof text !== 'string' || text.length === 0) {
        return undefined;
    }
    // The count is either digit groups of three joined by a locale-specific
    // separator (comma, dot or a space) or a plain run of digits. Removing
    // commas only made "1.234.000 resultados" parse as 0.
    const match = text.match(/([0-9]{1,3}(?:[.,\s][0-9]{3})+|[0-9]+)\s*result/i);
    if (!match) {
        return undefined;
    }
    const parsed = Number.parseInt(match[1].replace(/[^0-9]/g, ''), 10);
    return Number.isFinite(parsed) ? parsed : undefined;
}
|
|
359
|
+
/**
 * Parses a results page into ranked results plus paging information.
 *
 * Every redirect anchor is resolved to its target URL; duplicates and anchors
 * without a usable URL or title are skipped. Collection stops once
 * `options.maxResults` results have been gathered (when it is a finite,
 * truthy number).
 *
 * @param html Results page markup.
 * @param options Normalized options; only `maxResults` is read here.
 * @returns `{ results, nextPageUrl, nextPageStart, resultStats }`.
 */
function parseSearchPage(html, options) {
    const doc = ScrapeDocument.createSync(html, { baseUrl: GOOGLE_SEARCH_ORIGIN });
    const maxResults = options.maxResults ? Number(options.maxResults) : undefined;
    const hasLimit = typeof maxResults === 'number' && Number.isFinite(maxResults);
    const results = [];
    const seen = new Set();
    // Title: the anchor's <h3> text when present, otherwise the anchor text.
    const readTitle = (anchor) => {
        const heading = anchor.find('h3').text();
        return cleanText(heading ? heading : anchor.text());
    };
    // Snippet: known selectors first, then any span/div, then the whole
    // container (truncated to 240 characters).
    const findSnippet = (container, title) => {
        for (const selector of GOOGLE_RESULT_SNIPPET_SELECTOR_ORDER) {
            const selectorText = cleanText(container.find(selector).first().text());
            if (looksLikeSnippet(selectorText, title)) {
                return selectorText;
            }
        }
        for (const element of container.find('span,div').toArray()) {
            const elementText = cleanText(element.text());
            if (looksLikeSnippet(elementText, title)) {
                return elementText;
            }
        }
        const containerText = cleanText(container.text());
        return looksLikeSnippet(containerText, title) ? containerText.slice(0, 240) : undefined;
    };
    for (const anchor of doc.selectAll(GOOGLE_RESULT_LINK_SELECTORS.join(', '))) {
        const rawHref = anchor.attr('href');
        if (!rawHref) {
            continue;
        }
        const resultUrl = resolveSearchResultUrl(rawHref);
        if (!resultUrl || seen.has(resultUrl)) {
            continue;
        }
        const title = readTitle(anchor);
        if (!title) {
            continue;
        }
        const container = anchor.parents(GOOGLE_RESULT_CONTAINER_SELECTORS).first();
        const snippet = findSnippet(container, title);
        results.push({
            rank: results.length + 1,
            title,
            url: resultUrl,
            snippet,
            displayedUrl: extractDisplayedUrl(resultUrl, anchor.text()),
        });
        seen.add(resultUrl);
        if (hasLimit && results.length >= maxResults) {
            break;
        }
    }
    // Absolute URL of the "next page" link, if the page has one.
    let nextPageUrl;
    const nextLink = doc.selectFirst('a#pnnext, a[aria-label="Next"], a[id="pnnext"]').first();
    if (nextLink && nextLink.length) {
        const nextHref = nextLink.attr('href');
        if (nextHref) {
            try {
                nextPageUrl = new URL(nextHref, GOOGLE_SEARCH_ORIGIN).toString();
            }
            catch {
                nextPageUrl = undefined;
            }
        }
    }
    // `start` offset carried by the next-page URL, if any.
    let nextPageStart;
    if (nextPageUrl) {
        try {
            const startParam = new URL(nextPageUrl).searchParams.get('start');
            if (startParam) {
                const startValue = Number.parseInt(startParam, 10);
                nextPageStart = Number.isFinite(startValue) ? startValue : undefined;
            }
        }
        catch {
            nextPageStart = undefined;
        }
    }
    const resultStats = parseResultStats(doc.selectFirst('#result-stats').text());
    return {
        results,
        nextPageUrl,
        nextPageStart,
        resultStats,
    };
}
|
|
455
|
+
/**
 * Runs a Google search and returns the parsed results page.
 *
 * @param query Search text; must be non-empty after whitespace normalization.
 * @param options Advanced search parameters and transport settings.
 * @returns The normalized query, the requested URL, the parsed results,
 *   paging information, transport details, HTTP status and block verdict;
 *   `rawHtml` is included only when `includeRawHtml` was requested.
 * @throws Whatever option normalization or fetching throws (for example a
 *   validation error for an empty query).
 */
export async function searchGoogleAdvanced(query, options = {}) {
    const normalized = normalizeOptions(query, options);
    // Build the URL from the normalized query so the request matches the
    // `query` reported in the response (the raw argument may carry stray
    // whitespace).
    const searchUrl = buildSearchUrl(normalized.query, normalized);
    const fetchResult = await fetchSearchResults(searchUrl, normalized);
    const parsed = parseSearchPage(fetchResult.html, normalized);
    const response = {
        query: normalized.query,
        searchUrl,
        results: parsed.results,
        transport: {
            requested: normalized.transport,
            used: fetchResult.transport,
            fallbackUsed: fetchResult.fallbackUsed,
            impersonateAvailable: fetchResult.impersonateAvailable,
        },
        status: fetchResult.status,
        block: fetchResult.block,
        nextPageUrl: parsed.nextPageUrl,
        nextPageStart: parsed.nextPageStart,
        resultStats: parsed.resultStats,
    };
    if (normalized.includeRawHtml) {
        response.rawHtml = fetchResult.html;
    }
    return response;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { searchGoogleAdvanced } from './google.js';
|
package/dist/seo/analyzer.d.ts
CHANGED
|
@@ -12,6 +12,7 @@ export declare class SeoAnalyzer {
|
|
|
12
12
|
static fromHtml(html: string, options?: SeoAnalyzerFullOptions): Promise<SeoAnalyzer>;
|
|
13
13
|
analyze(): SeoReport;
|
|
14
14
|
private getMainBody;
|
|
15
|
+
private detectPageType;
|
|
15
16
|
private getVisibleText;
|
|
16
17
|
private buildRuleContext;
|
|
17
18
|
private analyzeUrlQuality;
|
|
@@ -32,7 +33,6 @@ export declare class SeoAnalyzer {
|
|
|
32
33
|
private analyzeAnalytics;
|
|
33
34
|
private analyzeFeeds;
|
|
34
35
|
private analyzeConversionElements;
|
|
35
|
-
private analyzeAdvancedImages;
|
|
36
36
|
private calculateTextHtmlRatio;
|
|
37
37
|
private convertToCheckResults;
|
|
38
38
|
private buildSummary;
|