recker 1.0.73 → 1.0.75-next.2e5a94f
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -18
- package/dist/browser/core/client.d.ts +14 -8
- package/dist/browser/core/client.js +199 -17
- package/dist/browser/core/errors.d.ts +15 -1
- package/dist/browser/core/errors.js +140 -9
- package/dist/browser/core/request.d.ts +5 -0
- package/dist/browser/core/request.js +33 -2
- package/dist/browser/core-runtime/plugin-manifest.d.ts +24 -0
- package/dist/browser/core-runtime/plugin-manifest.js +159 -0
- package/dist/browser/core-runtime/request-context.d.ts +13 -0
- package/dist/browser/core-runtime/request-context.js +24 -0
- package/dist/browser/core-runtime/typed-events.d.ts +89 -0
- package/dist/browser/core-runtime/typed-events.js +34 -0
- package/dist/browser/index.iife.min.js +79 -79
- package/dist/browser/index.min.js +79 -79
- package/dist/browser/index.mini.iife.js +913 -97
- package/dist/browser/index.mini.iife.min.js +46 -46
- package/dist/browser/index.mini.min.js +46 -46
- package/dist/browser/index.mini.umd.js +913 -97
- package/dist/browser/index.mini.umd.min.js +46 -46
- package/dist/browser/index.umd.min.js +79 -79
- package/dist/browser/plugins/auth/aws-sigv4.d.ts +1 -0
- package/dist/browser/plugins/auth/aws-sigv4.js +19 -2
- package/dist/browser/plugins/retry.js +29 -1
- package/dist/browser/presets/aws.d.ts +1 -0
- package/dist/browser/presets/aws.js +62 -1
- package/dist/browser/runner/request-runner.d.ts +15 -5
- package/dist/browser/runner/request-runner.js +164 -30
- package/dist/browser/scrape/parser/nodes/html.d.ts +6 -0
- package/dist/browser/scrape/parser/nodes/html.js +70 -18
- package/dist/browser/scrape/parser/nodes/node.d.ts +1 -0
- package/dist/browser/scrape/parser/nodes/node.js +5 -0
- package/dist/browser/scrape/spider.d.ts +1 -0
- package/dist/browser/scrape/spider.js +39 -26
- package/dist/browser/seo/analyzer.d.ts +1 -1
- package/dist/browser/seo/analyzer.js +73 -42
- package/dist/browser/seo/index.d.ts +1 -1
- package/dist/browser/seo/rules/types.d.ts +2 -0
- package/dist/browser/seo/seo-spider.d.ts +2 -3
- package/dist/browser/seo/seo-spider.js +26 -202
- package/dist/browser/seo/types.d.ts +4 -0
- package/dist/browser/seo/validators/sitemap.js +9 -2
- package/dist/browser/transport/fetch.js +38 -5
- package/dist/browser/transport/undici.js +73 -11
- package/dist/browser/transport/worker.d.ts +0 -1
- package/dist/browser/transport/worker.js +1 -3
- package/dist/browser/types/index.d.ts +24 -0
- package/dist/cli/commands/mcp.js +5 -3
- package/dist/core/client.d.ts +14 -8
- package/dist/core/client.js +199 -17
- package/dist/core/errors.d.ts +15 -1
- package/dist/core/errors.js +140 -9
- package/dist/core/request.d.ts +5 -0
- package/dist/core/request.js +33 -2
- package/dist/core-runtime/plugin-manifest.d.ts +24 -0
- package/dist/core-runtime/plugin-manifest.js +159 -0
- package/dist/core-runtime/request-context.d.ts +13 -0
- package/dist/core-runtime/request-context.js +24 -0
- package/dist/core-runtime/typed-events.d.ts +89 -0
- package/dist/core-runtime/typed-events.js +34 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/mcp/cli.js +10 -8
- package/dist/mcp/profiles.d.ts +1 -1
- package/dist/mcp/profiles.js +31 -6
- package/dist/mcp/tools/categories.js +0 -1
- package/dist/plugins/auth/aws-sigv4.d.ts +1 -0
- package/dist/plugins/auth/aws-sigv4.js +19 -2
- package/dist/plugins/retry.js +29 -1
- package/dist/presets/aws.d.ts +1 -0
- package/dist/presets/aws.js +62 -1
- package/dist/recker.d.ts +3 -0
- package/dist/recker.js +5 -0
- package/dist/runner/request-runner.d.ts +15 -5
- package/dist/runner/request-runner.js +164 -30
- package/dist/scrape/parser/nodes/html.d.ts +6 -0
- package/dist/scrape/parser/nodes/html.js +70 -18
- package/dist/scrape/parser/nodes/node.d.ts +1 -0
- package/dist/scrape/parser/nodes/node.js +5 -0
- package/dist/scrape/spider.d.ts +1 -0
- package/dist/scrape/spider.js +39 -26
- package/dist/search/google.d.ts +67 -0
- package/dist/search/google.js +480 -0
- package/dist/search/index.d.ts +3 -0
- package/dist/search/index.js +1 -0
- package/dist/seo/analyzer.d.ts +1 -1
- package/dist/seo/analyzer.js +73 -42
- package/dist/seo/index.d.ts +1 -1
- package/dist/seo/rules/types.d.ts +2 -0
- package/dist/seo/seo-spider.d.ts +2 -3
- package/dist/seo/seo-spider.js +26 -202
- package/dist/seo/types.d.ts +4 -0
- package/dist/seo/validators/sitemap.js +9 -2
- package/dist/transport/fetch.js +38 -5
- package/dist/transport/undici.js +73 -11
- package/dist/transport/worker.d.ts +0 -1
- package/dist/transport/worker.js +1 -3
- package/dist/types/index.d.ts +24 -0
- package/dist/version.js +1 -1
- package/package.json +9 -1
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
import { type BlockDetectionResult } from '../utils/block-detector.js';
|
|
2
|
+
/**
 * Transport strategy for a search request.
 *
 * - `'auto'`   — try curl-impersonate when it is available, otherwise (or when
 *                the curl response looks blocked or fails) use the regular client.
 * - `'undici'` — always use the regular HTTP client.
 * - `'curl'`   — always use curl-impersonate; the request is rejected with a
 *                `ValidationError` when the binary is not available.
 */
export type SearchTransport = 'auto' | 'undici' | 'curl';
|
|
4
|
+
/**
 * Options accepted by `searchGoogleAdvanced`.
 *
 * Every advanced-search operator can be spelled in snake_case (the name of the
 * URL parameter) or camelCase. When both spellings are supplied the snake_case
 * value is used.
 */
export interface GoogleSearchAdvancedOptions {
    // --- Advanced-search operators (sent as the `as_*` URL parameters) ---
    as_q?: string;
    asQ?: string;
    as_epq?: string;
    asEpq?: string;
    as_oq?: string;
    asOq?: string;
    as_eq?: string;
    asEq?: string;
    as_sitesearch?: string;
    asSitesearch?: string;
    as_filetype?: string;
    asFiletype?: string;
    as_rights?: string;
    asRights?: string;
    as_nlo?: string | number;
    asNlo?: string | number;
    as_nhi?: string | number;
    asNhi?: string | number;

    // --- Filters forwarded verbatim as same-named URL parameters when non-empty ---
    safe?: string;
    tbm?: string;
    tbs?: string;
    lr?: string;
    cr?: string;
    hl?: string;

    // --- Localisation ---
    /**
     * Two-letter country code or a known country name. It is normalised and sent
     * as `gl`; an unrecognised value raises a `ValidationError`. Takes precedence
     * over `gl`.
     */
    country?: string;
    /** Raw `gl` value; trimmed and lower-cased, used only when `country` is not given. */
    gl?: string;

    // --- Paging ---
    /** Sent as `num` when finite and greater than zero; floored and capped at 100. */
    num?: number;
    /** Sent as `start` when finite and not negative; floored. */
    start?: number;
    /** Stop collecting parsed results once this many have been gathered. */
    maxResults?: number;

    // --- Request behaviour ---
    /** Transport strategy; defaults to `'auto'`. */
    transport?: SearchTransport;
    /** Timeout handed to the underlying HTTP request. */
    timeout?: number;
    /** User-Agent header value; a random desktop Chrome user agent is used when omitted. */
    userAgent?: string;
    /** Extra request headers; these override the built-in defaults of the same name. */
    headers?: HeadersInit;
    /**
     * Additional URL parameters, applied last (so they can override the ones
     * above). Booleans are sent as `1`/`0`; blank values are skipped.
     */
    extraParams?: Record<string, string | number | boolean>;
    /** When true the fetched HTML is returned as `rawHtml`; defaults to false. */
    includeRawHtml?: boolean;
}
|
|
41
|
+
/** One organic result extracted from a Google results page. */
export interface GoogleSearchResult {
    /** 1-based position of this entry among the results that were kept. */
    rank: number;
    /** Result title with whitespace collapsed. */
    title: string;
    /** Target URL, unwrapped from Google's redirect link. */
    url: string;
    /** Hostname of `url`, or a fragment of the link text when the URL cannot be parsed. */
    displayedUrl?: string;
    /** Descriptive text found near the link, when one could be identified. */
    snippet?: string;
    /** NOTE(review): the parser shipped alongside this declaration does not appear to set this field. */
    source?: string;
}
|
|
49
|
+
/** Describes which transport actually served a search request. */
export interface SearchTransportDetails {
    /** Transport asked for by the caller (`'auto'` when none was given). */
    requested: SearchTransport;
    /** Transport that produced the returned HTML. */
    used: SearchTransport;
    /** Whether curl-impersonate was detected; always false when `'undici'` was requested. */
    impersonateAvailable: boolean;
    /** True when `'auto'` tried curl-impersonate first and then fell back to the regular client. */
    fallbackUsed: boolean;
}
|
|
55
|
+
/** Outcome of `searchGoogleAdvanced`. */
export interface GoogleSearchResponse {
    /** The query with whitespace collapsed and trimmed. */
    query: string;
    /** Full URL that was requested. */
    searchUrl: string;
    /** HTTP status reported by the transport. */
    status?: number;
    /** Which transport was requested and which one was used. */
    transport: SearchTransportDetails;
    /** Block/captcha detection verdict for the fetched page. */
    block?: BlockDetectionResult;
    /** Parsed results in page order. */
    results: GoogleSearchResult[];
    /** Number taken from the page's `#result-stats` element, when it could be parsed. */
    resultStats?: number;
    /** Absolute URL of the "next page" link, when present. */
    nextPageUrl?: string;
    /** `start` parameter of `nextPageUrl`, when present and numeric. */
    nextPageStart?: number;
    /** Fetched HTML; only set when `includeRawHtml` was requested. */
    rawHtml?: string;
}
|
|
67
|
+
/**
 * Runs a Google web search and returns the parsed result page.
 *
 * @param query   Search terms; must contain at least one non-whitespace character.
 * @param options Advanced operators, localisation, paging and transport settings.
 * @returns The parsed results together with transport and block-detection details.
 * @throws ValidationError when the query is blank, `country` cannot be resolved,
 *         or `transport: 'curl'` is requested while curl-impersonate is unavailable.
 */
export declare function searchGoogleAdvanced(query: string, options?: GoogleSearchAdvancedOptions): Promise<GoogleSearchResponse>;
|
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
import { createClient } from '../core/client.js';
|
|
2
|
+
import { ValidationError } from '../core/errors.js';
|
|
3
|
+
import { HttpRequest } from '../core/request.js';
|
|
4
|
+
import { ScrapeDocument } from '../scrape/document.js';
|
|
5
|
+
import { detectBlock } from '../utils/block-detector.js';
|
|
6
|
+
import { getRandomUserAgent } from '../utils/user-agent.js';
|
|
7
|
+
/** Origin used as the Referer header and as the base for resolving links found in result pages. */
const GOOGLE_SEARCH_ORIGIN = 'https://www.google.com';

/** Endpoint that search requests are sent to. */
const GOOGLE_SEARCH_BASE_URL = `${GOOGLE_SEARCH_ORIGIN}/search`;

/** Selectors tried, in this order, to locate the snippet inside a result container. */
const GOOGLE_RESULT_SNIPPET_SELECTOR_ORDER = [
    '[data-sncf="1"]',
    'div[data-sncf="1"]',
    'span.aCOpRe',
    'div.aCOpRe',
    'div.VwiC3b',
    'span.VwiC3b',
    'div.BNeawe',
    'div.yXK7lf',
    'div[data-attrid="wa:/description"]',
];

/** Anchors whose href is one of Google's redirect links to a result. */
const GOOGLE_RESULT_LINK_SELECTORS = [
    'a[href^="/url?q="]',
    'a[href^="https://www.google.com/url?"]',
    'a[href^="http://www.google.com/url?"]',
];

/** Ancestor selector used to find the container element that wraps a result link. */
const GOOGLE_RESULT_CONTAINER_SELECTORS = [
    '[data-hveid]',
    '[data-ved]',
    'div[class*="g"]',
    'div[class*="MjjY"]',
    'div[class*="tF2Cxc"]',
    '[class*="xpd"]',
].join(', ');
|
|
26
|
+
/** Matches an already-normalised two-letter country code. */
const COUNTRY_CODE_PATTERN = /^[a-z]{2}$/;

/**
 * Human-friendly spellings mapped to the two-letter code sent to Google.
 *
 * NOTE(review): two-letter keys are never consulted — normalizeCountryCode
 * returns any two-letter input unchanged before looking at aliases — so
 * entries such as `pt: 'br'` currently have no effect.
 */
const COUNTRY_ALIASES = {
    us: 'us',
    usa: 'us',
    united_states: 'us',
    'united states': 'us',
    br: 'br',
    brasil: 'br',
    brazil: 'br',
    pt_br: 'br',
    pt: 'br',
    de: 'de',
    germany: 'de',
    deutschland: 'de',
    gb: 'gb',
    uk: 'gb',
    england: 'gb',
    britain: 'gb',
    'united kingdom': 'gb',
    fr: 'fr',
    france: 'fr',
    spain: 'es',
    españa: 'es',
    es: 'es',
    ca: 'ca',
    mexico: 'mx',
    mx: 'mx',
    it: 'it',
    italy: 'it',
    au: 'au',
    india: 'in',
    in: 'in',
    argentina: 'ar',
    ar: 'ar',
};

/**
 * Lookup table keyed the same way normalizeCountryCode cleans its input:
 * trimmed, lower-cased, whitespace runs turned into `_`, then everything
 * outside `a-z`/`_` removed. Using the identical transformation keeps aliases
 * with non-ASCII letters (e.g. `españa` -> `espaa`) reachable.
 *
 * The table has no prototype so that inputs such as `constructor` or
 * `__proto__` cannot resolve to inherited members.
 */
const COUNTRY_ALIASES_NORMALIZED = Object.entries(COUNTRY_ALIASES).reduce((acc, [key, value]) => {
    const lookupKey = key
        .trim()
        .toLowerCase()
        .replace(/\s+/g, '_')
        .replace(/[^a-z_]/g, '');
    if (lookupKey) {
        acc[lookupKey] = value;
    }
    return acc;
}, Object.create(null));
|
|
65
|
+
/**
 * Collapses every run of whitespace into a single space and strips
 * leading/trailing whitespace.
 *
 * @param value Text to tidy.
 * @returns The tidied text.
 */
function cleanText(value) {
    const trimmed = value.trim();
    return trimmed.replace(/\s+/g, ' ');
}
|
|
68
|
+
/**
 * Tells whether a value is neither `undefined` nor `null`.
 *
 * @param value Anything.
 * @returns False only for `undefined` and `null`.
 */
function isDefined(value) {
    // Loose comparison with null matches exactly null and undefined.
    return value != null;
}
|
|
71
|
+
/**
 * Converts an option value into the string sent as a URL parameter.
 *
 * @param value A string, number or boolean (or nothing).
 * @returns `'1'`/`'0'` for booleans, the trimmed string form otherwise, or
 *          `undefined` when the value is missing or blank.
 */
function toParamValue(value) {
    // null is treated like undefined; stringifying it would send the literal "null".
    if (value === undefined || value === null)
        return undefined;
    if (typeof value === 'boolean')
        return value ? '1' : '0';
    const trimmed = String(value).trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
|
|
79
|
+
/**
 * Returns the first argument that carries a value.
 *
 * `null` is skipped along with `undefined`, so an explicit `null` in one
 * spelling of an option does not hide a real value given in the other.
 * Empty strings, `0` and `false` count as values.
 *
 * @param values Candidates in priority order.
 * @returns The first candidate that is not `undefined`/`null`, else `undefined`.
 */
function pick(...values) {
    return values.find((value) => value !== undefined && value !== null);
}
|
|
87
|
+
/**
 * Resolves user input to a two-letter lower-case country code.
 *
 * Resolution order:
 *   1. the input is already a two-letter code;
 *   2. the input is a known alias (e.g. `united kingdom`, `brazil`);
 *   3. the input is locale-like (`en_US`, `en-US`, `en us`) and its last
 *      segment is a two-letter code.
 *
 * @param input Country code, country name or locale.
 * @returns The two-letter code, or `''` when the input cannot be resolved.
 */
function normalizeCountryCode(input) {
    // Non-strings cannot be resolved; report "unknown" rather than crashing on .trim().
    if (typeof input !== 'string') {
        return '';
    }
    // Hyphens are treated like whitespace so that `en-US` / `pt-BR` behave like `en_US` / `pt_BR`.
    const normalized = input.trim().toLowerCase().replace(/[\s-]+/g, '_');
    if (COUNTRY_CODE_PATTERN.test(normalized)) {
        return normalized;
    }
    const cleaned = normalized.replace(/[^a-z_]/g, '');
    // Own-property check: names such as `constructor` must not resolve to inherited members.
    const directAlias = Object.prototype.hasOwnProperty.call(COUNTRY_ALIASES_NORMALIZED, cleaned)
        ? COUNTRY_ALIASES_NORMALIZED[cleaned]
        : undefined;
    if (typeof directAlias === 'string' && directAlias) {
        return directAlias;
    }
    if (normalized.includes('_')) {
        const tail = normalized.split('_').pop() || '';
        if (COUNTRY_CODE_PATTERN.test(tail)) {
            return tail;
        }
    }
    return '';
}
|
|
105
|
+
/**
 * Decides the value of the `gl` URL parameter.
 *
 * @param country  Preferred input: a country code or name; validated.
 * @param legacyGl Raw `gl` value, used only when `country` is absent; it is
 *                 trimmed and lower-cased but not validated.
 * @returns The country code to send, or `''` when neither input is given.
 * @throws ValidationError when `country` is given but cannot be resolved.
 */
function resolveCountryCode(country, legacyGl) {
    // `null` is treated as "not given" so it falls back to `gl` instead of crashing.
    if (country !== undefined && country !== null) {
        const resolved = normalizeCountryCode(country);
        if (!resolved) {
            throw new ValidationError('Invalid country for Google search. Use ISO 3166-1 alpha-2 code or a known country name.', {
                field: 'country',
                value: country,
            });
        }
        return resolved;
    }
    return typeof legacyGl === 'string' ? legacyGl.trim().toLowerCase() : '';
}
|
|
118
|
+
/**
 * Validates the query and folds the caller's options into the shape the rest
 * of this module reads.
 *
 * The result keeps every caller-supplied option and adds/overrides:
 * `query` (whitespace-collapsed), the advanced operators (snake_case input
 * wins over camelCase; missing ones become `''`), `gl` (resolved from
 * `country`/`gl`), `transport` (default `'auto'`) and `includeRawHtml`
 * (default `false`).
 *
 * The mixed key spelling (`as_q` but `asEpq`, `asOq`, `asEq`) is intentional
 * here: these are the exact names buildSearchUrl reads.
 *
 * @param query   Search terms.
 * @param options Caller options; `undefined`/`null` means "no options".
 * @returns The normalised options object.
 * @throws ValidationError when the query is not a string or is blank, or when
 *         `country` cannot be resolved.
 */
function normalizeOptions(query, options = {}) {
    // The default parameter only covers `undefined`; guard against `null` too.
    const source = options ?? {};
    // A non-string query is reported as a validation problem, not a TypeError from .trim().
    const normalizedQuery = typeof query === 'string' ? cleanText(query) : '';
    if (!normalizedQuery) {
        throw new ValidationError('Google query is required', {
            field: 'query',
            value: query,
        });
    }
    const resolvedTransport = (source.transport ?? 'auto');
    return {
        ...source,
        as_q: pick(source.as_q, source.asQ) ?? '',
        query: normalizedQuery,
        asEpq: pick(source.as_epq, source.asEpq) ?? '',
        asOq: pick(source.as_oq, source.asOq) ?? '',
        asEq: pick(source.as_eq, source.asEq) ?? '',
        as_sitesearch: pick(source.as_sitesearch, source.asSitesearch) ?? '',
        as_filetype: pick(source.as_filetype, source.asFiletype) ?? '',
        as_rights: pick(source.as_rights, source.asRights) ?? '',
        as_nlo: toParamValue(pick(source.as_nlo, source.asNlo)) ?? '',
        as_nhi: toParamValue(pick(source.as_nhi, source.asNhi)) ?? '',
        gl: resolveCountryCode(source.country, source.gl),
        transport: resolvedTransport,
        includeRawHtml: source.includeRawHtml ?? false,
    };
}
|
|
144
|
+
/**
 * Assembles the search URL from the query and normalised options.
 *
 * Parameters are written in a fixed order: `q`, `ie`, `oe`, the advanced
 * operators, the simple filters, `num`, `start`, `tbs`, and finally
 * `extraParams` (which may therefore overwrite any earlier parameter).
 *
 * @param query   Value for the `q` parameter.
 * @param options Options as produced by normalizeOptions.
 * @returns The absolute search URL.
 */
function buildSearchUrl(query, options) {
    const params = new URLSearchParams();
    params.set('q', query);
    params.set('ie', 'UTF-8');
    params.set('oe', 'UTF-8');

    // [URL parameter, option key] — copied only when the option is truthy.
    const passthrough = [
        ['as_q', 'as_q'],
        ['as_epq', 'asEpq'],
        ['as_oq', 'asOq'],
        ['as_eq', 'asEq'],
        ['as_sitesearch', 'as_sitesearch'],
        ['as_filetype', 'as_filetype'],
        ['as_rights', 'as_rights'],
        ['as_nlo', 'as_nlo'],
        ['as_nhi', 'as_nhi'],
        ['safe', 'safe'],
        ['tbm', 'tbm'],
        ['lr', 'lr'],
        ['cr', 'cr'],
        ['gl', 'gl'],
        ['hl', 'hl'],
    ];
    for (const [paramName, optionKey] of passthrough) {
        const optionValue = options[optionKey];
        if (optionValue) {
            params.set(paramName, optionValue);
        }
    }

    if (isDefined(options.num)) {
        const pageSize = Number(options.num);
        // Page size must be positive; it is floored and capped at 100.
        if (Number.isFinite(pageSize) && pageSize > 0) {
            params.set('num', String(Math.min(100, Math.floor(pageSize))));
        }
    }
    if (isDefined(options.start)) {
        const offset = Number(options.start);
        if (Number.isFinite(offset) && offset >= 0) {
            params.set('start', String(Math.floor(offset)));
        }
    }
    if (options.tbs) {
        params.set('tbs', options.tbs);
    }

    if (options.extraParams) {
        Object.entries(options.extraParams).forEach(([key, value]) => {
            const encoded = toParamValue(value);
            if (encoded !== undefined) {
                params.set(key, encoded);
            }
        });
    }
    return `${GOOGLE_SEARCH_BASE_URL}?${params.toString()}`;
}
|
|
203
|
+
/**
 * Builds the request headers: browser-like defaults overlaid with the
 * caller's headers.
 *
 * @param inputHeaders Caller headers in any form the `Headers` constructor accepts; optional.
 * @param userAgent    Value for the `user-agent` header.
 * @returns A plain object of header name -> value, as enumerated by `Headers`.
 */
function normalizeRequestHeaders(inputHeaders, userAgent) {
    const defaults = {
        accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'user-agent': userAgent,
        referer: GOOGLE_SEARCH_ORIGIN,
    };
    const combined = new Headers(defaults);
    if (inputHeaders) {
        // Caller-provided headers replace defaults of the same name.
        new Headers(inputHeaders).forEach((headerValue, headerName) => {
            combined.set(headerName, headerValue);
        });
    }
    const plain = {};
    combined.forEach((headerValue, headerName) => {
        plain[headerName] = headerValue;
    });
    return plain;
}
|
|
223
|
+
/**
 * Reports whether the curl-impersonate binary is available.
 *
 * Any failure — the binary manager cannot be loaded, or the check itself
 * throws or rejects — is reported as "not available".
 *
 * @returns Whatever `hasImpersonate()` resolves to, or `false` on failure.
 */
async function hasImpersonateBinary() {
    try {
        const { hasImpersonate } = await import('../utils/binary-manager.js');
        // `await` keeps an asynchronous rejection inside this try block; a bare
        // `return hasImpersonate()` would let it escape the catch below.
        return await hasImpersonate();
    }
    catch {
        return false;
    }
}
|
|
232
|
+
/**
 * Performs a GET request through the curl transport, which is loaded on demand.
 *
 * @param url     Absolute URL to fetch.
 * @param headers Request headers as a plain object.
 * @param timeout Timeout forwarded to the request.
 * @returns The response body text and HTTP status.
 */
async function fetchWithCurl(url, headers, timeout) {
    const curlModule = await import('../transport/curl.js');
    const curl = new curlModule.CurlTransport();
    const requestInit = { method: 'GET', headers, timeout };
    const response = await curl.dispatch(new HttpRequest(url, requestInit));
    const body = await response.text();
    return { html: body, status: response.status };
}
|
|
244
|
+
/**
 * Downloads the search page using the requested transport strategy.
 *
 * - `'curl'`   — curl only; rejected when curl-impersonate is unavailable.
 * - `'undici'` — regular client only (curl availability is not even probed).
 * - otherwise  — curl first when available; the regular client is used when
 *                curl is unavailable, fails, or its response is judged blocked
 *                with confidence above 0.7.
 *
 * @param url     Search URL.
 * @param options Normalised options (`headers`, `userAgent`, `timeout`, `transport`).
 * @returns `{ html, status, transport, fallbackUsed, impersonateAvailable, block }`.
 * @throws ValidationError when `'curl'` is requested but unavailable.
 */
async function fetchSearchResults(url, options) {
    const agent = options.userAgent ?? getRandomUserAgent('desktop.chrome');
    const headers = normalizeRequestHeaders(options.headers, agent);
    const requestTimeout = options.timeout;
    const mode = options.transport;
    const impersonateAvailable = mode === 'undici' ? false : await hasImpersonateBinary();

    const viaCurl = async () => {
        const { html, status } = await fetchWithCurl(url, headers, requestTimeout);
        // The curl helper does not expose response headers, so detection sees an empty set.
        const block = detectBlock({ status, headers: new Headers() }, html);
        return { html, status, transport: 'curl', fallbackUsed: false, impersonateAvailable, block };
    };

    if (mode === 'curl') {
        if (!impersonateAvailable) {
            throw new ValidationError('Transport "curl" requires curl-impersonate; install it with `rek setup`', {
                field: 'transport',
                value: mode,
            });
        }
        return viaCurl();
    }

    const client = createClient({ timeout: requestTimeout });
    const viaClient = async () => {
        const response = await client.get(url, { headers });
        const html = await response.text();
        const block = detectBlock({ status: response.status, headers: response.headers }, html);
        return { html, status: response.status, transport: 'undici', fallbackUsed: false, impersonateAvailable, block };
    };

    if (mode === 'undici' || !impersonateAvailable) {
        return viaClient();
    }

    try {
        const attempt = await viaCurl();
        if (!attempt.block.blocked || attempt.block.confidence <= 0.7) {
            return attempt;
        }
    }
    catch {
        // Best effort: a failed curl attempt simply falls through to the regular client.
    }

    const fallback = await viaClient();
    return { ...fallback, fallbackUsed: true, impersonateAvailable };
}
|
|
310
|
+
/**
 * Unwraps one of Google's redirect links (`/url?q=<target>&...`) into the
 * target URL.
 *
 * @param rawHref The anchor's `href` attribute.
 * @returns The absolute http(s) target URL, or `null` when the href carries no
 *          target, the target is not http(s), it points back at Google's own
 *          search page, or anything fails to parse.
 */
function resolveSearchResultUrl(rawHref) {
    try {
        const normalized = rawHref.startsWith('//') ? `https:${rawHref}` : rawHref;
        const parsed = new URL(normalized, GOOGLE_SEARCH_ORIGIN);
        // searchParams.get() already returns the percent-decoded value. Decoding it
        // a second time would corrupt targets that legitimately contain escapes
        // (`%26` would become a parameter separator, `%25` a bare `%`) and would
        // throw on a literal `%`, dropping the result altogether.
        const rawResultUrl = parsed.searchParams.get('q') ?? parsed.searchParams.get('url');
        if (!rawResultUrl)
            return null;
        const candidate = new URL(rawResultUrl, GOOGLE_SEARCH_ORIGIN);
        if (candidate.protocol !== 'http:' && candidate.protocol !== 'https:')
            return null;
        if (candidate.hostname === 'www.google.com' && candidate.pathname === '/search')
            return null;
        return candidate.toString();
    }
    catch {
        return null;
    }
}
|
|
328
|
+
/**
 * Produces the short "displayed URL" for a result.
 *
 * @param linkUrl       The result's target URL.
 * @param containerText Text to fall back to when `linkUrl` has no parsable hostname.
 * @returns The hostname of `linkUrl`; otherwise the first 120 characters of the
 *          tidied fallback text; otherwise `''`.
 */
function extractDisplayedUrl(linkUrl, containerText) {
    let hostname = '';
    try {
        hostname = new URL(linkUrl).hostname;
    }
    catch {
        hostname = '';
    }
    if (hostname) {
        return cleanText(hostname);
    }
    if (!containerText) {
        return '';
    }
    return cleanText(containerText).slice(0, 120);
}
|
|
341
|
+
/**
 * Heuristic: does this text look like a result description?
 *
 * @param text  Candidate text.
 * @param title The result's title, which must not be mistaken for a snippet.
 * @returns True when the tidied text is 25–600 characters long, differs from
 *          the title and does not start with `http://` or `https://`.
 */
function looksLikeSnippet(text, title) {
    const candidate = cleanText(text);
    const lengthOk = candidate.length >= 25 && candidate.length <= 600;
    const isUrl = /^https?:\/\//i.test(candidate);
    return lengthOk && candidate !== title && !isUrl;
}
|
|
351
|
+
/**
 * Extracts the result count from text such as
 * `About 1,230,000 results (0.42 seconds)` or
 * `Aproximadamente 1.230.000 resultados`.
 *
 * The number directly preceding "result"/"resultado" is taken, and any
 * grouping separators inside it (comma, dot, no-break space, narrow no-break
 * space) are ignored. Stripping only commas would read `1.230.000 resultados`
 * as `000`.
 *
 * @param text Contents of the result-stats element.
 * @returns The count, or `undefined` when none is found.
 */
function parseResultStats(text) {
    if (typeof text !== 'string' || text.length === 0)
        return undefined;
    const match = text.match(/([0-9][0-9.,\u00a0\u202f]*)\s*(?:result|resultado)/i);
    if (!match)
        return undefined;
    const digits = match[1].replace(/[^0-9]/g, '');
    const parsed = Number.parseInt(digits, 10);
    return Number.isFinite(parsed) ? parsed : undefined;
}
|
|
359
|
+
/**
 * Extracts results and paging information from a results page.
 *
 * Each redirect-style anchor becomes at most one result: anchors without an
 * href, without a resolvable target, with an already-seen target or without a
 * title are skipped. Collection stops once `options.maxResults` (when it is a
 * finite number) has been reached.
 *
 * @param html    Page markup.
 * @param options Normalised options; only `maxResults` is read.
 * @returns `{ results, nextPageUrl, nextPageStart, resultStats }`.
 */
function parseSearchPage(html, options) {
    const doc = ScrapeDocument.createSync(html, { baseUrl: GOOGLE_SEARCH_ORIGIN });
    const requestedCap = options.maxResults ? Number(options.maxResults) : undefined;
    // A missing or non-finite cap means "no limit".
    const cap = Number.isFinite(requestedCap) ? requestedCap : Infinity;

    // Title: the anchor's <h3> text when present, otherwise the anchor's own text.
    const readTitle = (anchor) => cleanText(anchor.find('h3').text() || anchor.text());

    // Snippet: known selectors first, then any span/div, then the whole container.
    const readSnippet = (container, title) => {
        for (const selector of GOOGLE_RESULT_SNIPPET_SELECTOR_ORDER) {
            const text = cleanText(container.find(selector).first().text());
            if (looksLikeSnippet(text, title)) {
                return text;
            }
        }
        for (const element of container.find('span,div').toArray()) {
            const text = cleanText(element.text());
            if (looksLikeSnippet(text, title)) {
                return text;
            }
        }
        const whole = cleanText(container.text());
        return looksLikeSnippet(whole, title) ? whole.slice(0, 240) : undefined;
    };

    // Absolute URL of the "next page" link, if the page has one.
    const readNextPageUrl = () => {
        const nextLink = doc.selectFirst('a#pnnext, a[aria-label="Next"], a[id="pnnext"]').first();
        if (!nextLink || !nextLink.length) {
            return undefined;
        }
        const href = nextLink.attr('href');
        if (!href) {
            return undefined;
        }
        try {
            return new URL(href, GOOGLE_SEARCH_ORIGIN).toString();
        }
        catch {
            return undefined;
        }
    };

    // Numeric `start` parameter of the next-page URL, if any.
    const readStartOffset = (pageUrl) => {
        if (!pageUrl) {
            return undefined;
        }
        try {
            const raw = new URL(pageUrl).searchParams.get('start');
            if (!raw) {
                return undefined;
            }
            const offset = Number.parseInt(raw, 10);
            return Number.isFinite(offset) ? offset : undefined;
        }
        catch {
            return undefined;
        }
    };

    const results = [];
    const seen = new Set();
    for (const anchor of doc.selectAll(GOOGLE_RESULT_LINK_SELECTORS.join(', '))) {
        const rawHref = anchor.attr('href');
        const resultUrl = rawHref ? resolveSearchResultUrl(rawHref) : null;
        if (!resultUrl || seen.has(resultUrl)) {
            continue;
        }
        const title = readTitle(anchor);
        if (!title) {
            continue;
        }
        const container = anchor.parents(GOOGLE_RESULT_CONTAINER_SELECTORS).first();
        const snippet = readSnippet(container, title);
        seen.add(resultUrl);
        results.push({
            rank: results.length + 1,
            title,
            url: resultUrl,
            snippet,
            displayedUrl: extractDisplayedUrl(resultUrl, anchor.text()),
        });
        if (results.length >= cap) {
            break;
        }
    }

    const nextPageUrl = readNextPageUrl();
    const nextPageStart = readStartOffset(nextPageUrl);
    const resultStats = parseResultStats(doc.selectFirst('#result-stats').text());
    return { results, nextPageUrl, nextPageStart, resultStats };
}
|
|
455
|
+
/**
 * Runs a Google web search and returns the parsed result page.
 *
 * @param query   Search terms; must contain at least one non-whitespace character.
 * @param options Advanced operators, localisation, paging and transport settings.
 * @returns The cleaned query, the requested URL, parsed results, paging hints,
 *          transport details, HTTP status and block-detection verdict; plus
 *          the raw HTML when `includeRawHtml` is set.
 * @throws ValidationError when the query is blank, `country` cannot be
 *         resolved, or `transport: 'curl'` is requested while curl-impersonate
 *         is unavailable.
 */
export async function searchGoogleAdvanced(query, options = {}) {
    const normalized = normalizeOptions(query, options);
    // Build the URL from the cleaned query so the request matches the `query`
    // reported in the response (the raw input may carry newlines or padding).
    const searchUrl = buildSearchUrl(normalized.query, normalized);
    const fetchResult = await fetchSearchResults(searchUrl, normalized);
    const parsed = parseSearchPage(fetchResult.html, normalized);
    const response = {
        query: normalized.query,
        searchUrl,
        results: parsed.results,
        transport: {
            requested: normalized.transport,
            used: fetchResult.transport,
            fallbackUsed: fetchResult.fallbackUsed,
            impersonateAvailable: fetchResult.impersonateAvailable,
        },
        status: fetchResult.status,
        block: fetchResult.block,
        nextPageUrl: parsed.nextPageUrl,
        nextPageStart: parsed.nextPageStart,
        resultStats: parsed.resultStats,
    };
    if (normalized.includeRawHtml) {
        response.rawHtml = fetchResult.html;
    }
    return response;
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { searchGoogleAdvanced } from './google.js';
|
package/dist/seo/analyzer.d.ts
CHANGED
|
@@ -12,6 +12,7 @@ export declare class SeoAnalyzer {
|
|
|
12
12
|
static fromHtml(html: string, options?: SeoAnalyzerFullOptions): Promise<SeoAnalyzer>;
|
|
13
13
|
analyze(): SeoReport;
|
|
14
14
|
private getMainBody;
|
|
15
|
+
private detectPageType;
|
|
15
16
|
private getVisibleText;
|
|
16
17
|
private buildRuleContext;
|
|
17
18
|
private analyzeUrlQuality;
|
|
@@ -32,7 +33,6 @@ export declare class SeoAnalyzer {
|
|
|
32
33
|
private analyzeAnalytics;
|
|
33
34
|
private analyzeFeeds;
|
|
34
35
|
private analyzeConversionElements;
|
|
35
|
-
private analyzeAdvancedImages;
|
|
36
36
|
private calculateTextHtmlRatio;
|
|
37
37
|
private convertToCheckResults;
|
|
38
38
|
private buildSummary;
|