recker 1.0.75 → 1.0.76-next.dfaea9d
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/search/google.d.ts +70 -0
- package/dist/browser/search/google.js +617 -0
- package/dist/browser/seo/index.d.ts +2 -0
- package/dist/browser/seo/index.js +1 -0
- package/dist/browser/seo/keyword-campaign.d.ts +107 -0
- package/dist/browser/seo/keyword-campaign.js +380 -0
- package/dist/search/google.d.ts +3 -0
- package/dist/search/google.js +137 -0
- package/dist/search/index.d.ts +1 -1
- package/dist/seo/index.d.ts +2 -0
- package/dist/seo/index.js +1 -0
- package/dist/seo/keyword-campaign.d.ts +107 -0
- package/dist/seo/keyword-campaign.js +380 -0
- package/dist/version.js +1 -1
- package/package.json +1 -1
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import { type BlockDetectionResult } from '../utils/block-detector.js';
|
|
2
|
+
/**
 * Transport selector for a search request.
 *
 * `'auto'` lets the implementation choose between the curl-based transport and
 * the undici-based client; the other two values force one of them.
 */
export type SearchTransport = 'auto' | 'undici' | 'curl';
|
|
4
|
+
/**
 * Options accepted by `searchGoogleAdvanced`.
 *
 * Every `as*` option is available in camelCase and in snake_case. When both
 * spellings are supplied, the snake_case value is the one that is used.
 */
export interface GoogleSearchAdvancedOptions {
    /** Sent as the `as_q` query parameter. */
    asQ?: string;
    as_q?: string;
    /** Sent as the `as_epq` query parameter. */
    asEpq?: string;
    as_epq?: string;
    /** Sent as the `as_oq` query parameter. */
    asOq?: string;
    as_oq?: string;
    /** Sent as the `as_eq` query parameter. */
    asEq?: string;
    as_eq?: string;
    /** Sent as the `as_sitesearch` query parameter. */
    asSitesearch?: string;
    as_sitesearch?: string;
    /** Sent as the `as_filetype` query parameter. */
    asFiletype?: string;
    as_filetype?: string;
    /** Sent as the `as_rights` query parameter. */
    asRights?: string;
    as_rights?: string;
    /** Sent as the `as_nlo` query parameter (stringified and trimmed). */
    asNlo?: number | string;
    as_nlo?: number | string;
    /** Sent as the `as_nhi` query parameter (stringified and trimmed). */
    asNhi?: number | string;
    as_nhi?: number | string;

    /** Sent as the `safe` query parameter when non-empty. */
    safe?: string;
    /** Sent as the `tbm` query parameter when non-empty. */
    tbm?: string;
    /** Sent as `num`; floored and capped at 100, ignored unless finite and > 0. */
    num?: number;
    /** Sent as `start`; floored, ignored unless finite and >= 0. */
    start?: number;
    /** Sent as the `tbs` query parameter when non-empty. */
    tbs?: string;
    /** Sent as the `lr` query parameter when non-empty. */
    lr?: string;
    /** Sent as the `cr` query parameter when non-empty. */
    cr?: string;

    /**
     * Two-letter country code or a known country name. When provided it is
     * resolved to the `gl` parameter and takes precedence over {@link gl};
     * an unrecognised value is rejected with a validation error.
     */
    country?: string;
    /** Legacy country parameter; used (trimmed, lowercased) only when `country` is absent. */
    gl?: string;
    /** Sent as the `hl` query parameter when non-empty. */
    hl?: string;

    /** Which transport to use; defaults to `'auto'`. */
    transport?: SearchTransport;
    /** Timeout forwarded to the underlying request. */
    timeout?: number;
    /** Stop collecting parsed results once this many have been gathered. */
    maxResults?: number;
    /** Extra query parameters, applied last; booleans become `1`/`0`, blank values are skipped. */
    extraParams?: Record<string, string | number | boolean>;
    /** User-Agent header value; a random desktop Chrome UA is used when omitted. */
    userAgent?: string;
    /** Request headers merged over the built-in defaults. */
    headers?: HeadersInit;
    /** When true, the response carries the fetched HTML in `rawHtml`. */
    includeRawHtml?: boolean;
}
|
|
41
|
+
/** Classification of a parsed result: paid placement, organic listing, or undetermined. */
export type GoogleSearchResultPlacement = 'ad' | 'organic' | 'unknown';

/** One entry extracted from a results page. */
export interface GoogleSearchResult {
    /** 1-based position in the order results were extracted. */
    rank: number;
    /** Whitespace-normalised title text of the result link. */
    title: string;
    /** Destination URL resolved from the result link. */
    url: string;
    /** Descriptive text found near the link, when one could be identified. */
    snippet?: string;
    /** Hostname of `url`, or a short excerpt of the link text when the URL cannot be parsed. */
    displayedUrl?: string;
    /** Heuristic placement classification. */
    placement?: GoogleSearchResultPlacement;
    /** Short excerpt of the attribute/text evidence that triggered an `'ad'` classification. */
    placementHint?: string;
    source?: string;
}
|
|
52
|
+
/** Describes how a search request was actually carried out. */
export interface SearchTransportDetails {
    /** Transport the caller asked for (after defaulting to `'auto'`). */
    requested: SearchTransport;
    /** Transport that produced the returned page. */
    used: SearchTransport;
    /** True when the page came from the fallback client after the curl attempt failed or was blocked. */
    fallbackUsed: boolean;
    /** Whether the curl-impersonate binary was detected for this request. */
    impersonateAvailable: boolean;
}
|
|
58
|
+
/** Result of `searchGoogleAdvanced`. */
export interface GoogleSearchResponse {
    /** The whitespace-normalised query. */
    query: string;
    /** Full URL that was requested. */
    searchUrl: string;
    /** Parsed results in extraction order. */
    results: GoogleSearchResult[];
    /** Absolute URL of the "next page" link, when one is present in the page. */
    nextPageUrl?: string;
    /** Integer `start` parameter read from `nextPageUrl`, when present. */
    nextPageStart?: number;
    /** Result count parsed from the page's `#result-stats` element, when recognisable. */
    resultStats?: number;
    /** Block-detection verdict for the fetched page. */
    block?: BlockDetectionResult;
    /** How the request was carried out. */
    transport: SearchTransportDetails;
    /** HTTP status of the fetched page. */
    status?: number;
    /** Fetched HTML; only set when `includeRawHtml` was requested. */
    rawHtml?: string;
}
|
|
70
|
+
/**
 * Runs a Google search and parses the returned results page.
 *
 * @param query - Search text; must contain at least one non-whitespace character.
 * @param options - Optional request and parsing settings.
 * @returns The parsed results together with transport and block-detection details.
 * @throws A validation error when the query is empty, `country` cannot be
 *   resolved, or the `'curl'` transport is requested but unavailable.
 */
export declare function searchGoogleAdvanced(
    query: string,
    options?: GoogleSearchAdvancedOptions,
): Promise<GoogleSearchResponse>;
|
|
@@ -0,0 +1,617 @@
|
|
|
1
|
+
import { createClient } from '../core/client.js';
|
|
2
|
+
import { ValidationError } from '../core/errors.js';
|
|
3
|
+
import { HttpRequest } from '../core/request.js';
|
|
4
|
+
import { ScrapeDocument } from '../scrape/document.js';
|
|
5
|
+
import { detectBlock } from '../utils/block-detector.js';
|
|
6
|
+
import { getRandomUserAgent } from '../utils/user-agent.js';
|
|
7
|
+
const GOOGLE_SEARCH_BASE_URL = 'https://www.google.com/search';
|
|
8
|
+
const GOOGLE_SEARCH_ORIGIN = 'https://www.google.com';
|
|
9
|
+
const GOOGLE_RESULT_SNIPPET_SELECTOR_ORDER = [
|
|
10
|
+
'[data-sncf="1"]',
|
|
11
|
+
'div[data-sncf="1"]',
|
|
12
|
+
'span.aCOpRe',
|
|
13
|
+
'div.aCOpRe',
|
|
14
|
+
'div.VwiC3b',
|
|
15
|
+
'span.VwiC3b',
|
|
16
|
+
'div.BNeawe',
|
|
17
|
+
'div.yXK7lf',
|
|
18
|
+
'div[data-attrid="wa:/description"]',
|
|
19
|
+
];
|
|
20
|
+
const GOOGLE_RESULT_LINK_SELECTORS = [
|
|
21
|
+
'a[href^="/url?q="]',
|
|
22
|
+
'a[href^="https://www.google.com/url?"]',
|
|
23
|
+
'a[href^="http://www.google.com/url?"]',
|
|
24
|
+
];
|
|
25
|
+
const GOOGLE_RESULT_CONTAINER_SELECTORS = '[data-hveid], [data-ved], div[class*="g"], div[class*="MjjY"], div[class*="tF2Cxc"], [class*="xpd"]';
|
|
26
|
+
const GOOGLE_AD_CONTAINER_CLASS_HINTS = [
|
|
27
|
+
'ad',
|
|
28
|
+
'ads',
|
|
29
|
+
'sponsored',
|
|
30
|
+
'ad_cx',
|
|
31
|
+
'pla',
|
|
32
|
+
'shopping',
|
|
33
|
+
'uEierd',
|
|
34
|
+
];
|
|
35
|
+
const GOOGLE_AD_TEXT_HINTS = [
|
|
36
|
+
'anúncio',
|
|
37
|
+
'anuncio',
|
|
38
|
+
'sponsored',
|
|
39
|
+
'patrocinado',
|
|
40
|
+
'patrocinada',
|
|
41
|
+
'patrocínio',
|
|
42
|
+
'publi',
|
|
43
|
+
'publicidade',
|
|
44
|
+
'ad',
|
|
45
|
+
'ads',
|
|
46
|
+
'anúncios',
|
|
47
|
+
'anunciante',
|
|
48
|
+
];
|
|
49
|
+
const COUNTRY_CODE_PATTERN = /^[a-z]{2}$/;
|
|
50
|
+
const COUNTRY_ALIASES = {
|
|
51
|
+
us: 'us',
|
|
52
|
+
usa: 'us',
|
|
53
|
+
united_states: 'us',
|
|
54
|
+
'united states': 'us',
|
|
55
|
+
br: 'br',
|
|
56
|
+
brasil: 'br',
|
|
57
|
+
brazil: 'br',
|
|
58
|
+
pt_br: 'br',
|
|
59
|
+
pt: 'br',
|
|
60
|
+
de: 'de',
|
|
61
|
+
germany: 'de',
|
|
62
|
+
deutschland: 'de',
|
|
63
|
+
gb: 'gb',
|
|
64
|
+
uk: 'gb',
|
|
65
|
+
england: 'gb',
|
|
66
|
+
britain: 'gb',
|
|
67
|
+
'united kingdom': 'gb',
|
|
68
|
+
fr: 'fr',
|
|
69
|
+
france: 'fr',
|
|
70
|
+
spain: 'es',
|
|
71
|
+
españa: 'es',
|
|
72
|
+
es: 'es',
|
|
73
|
+
ca: 'ca',
|
|
74
|
+
mexico: 'mx',
|
|
75
|
+
mx: 'mx',
|
|
76
|
+
it: 'it',
|
|
77
|
+
italy: 'it',
|
|
78
|
+
au: 'au',
|
|
79
|
+
india: 'in',
|
|
80
|
+
in: 'in',
|
|
81
|
+
argentina: 'ar',
|
|
82
|
+
ar: 'ar',
|
|
83
|
+
};
|
|
84
|
+
const COUNTRY_ALIASES_NORMALIZED = Object.entries(COUNTRY_ALIASES).reduce((acc, [key, value]) => {
|
|
85
|
+
acc[key.toLowerCase().replace(/[^a-z]/g, '_')] = value;
|
|
86
|
+
return acc;
|
|
87
|
+
}, {});
|
|
88
|
+
/**
 * Collapses each run of whitespace into a single space and strips leading and
 * trailing whitespace.
 *
 * @param value String to normalise.
 * @returns The normalised string.
 */
function cleanText(value) {
    const collapsed = value.split(/\s+/).join(' ');
    return collapsed.trim();
}
|
|
91
|
+
/**
 * Tells whether a value is neither `undefined` nor `null`.
 *
 * @param value Any value.
 * @returns `false` for `undefined` and `null`, `true` for everything else.
 */
function isDefined(value) {
    const missing = value === undefined || value === null;
    return !missing;
}
|
|
94
|
+
/**
 * Converts an option value into the string sent as a query parameter.
 *
 * @param value A string, number, or boolean; `undefined`/`null` mean "not set".
 * @returns `'1'`/`'0'` for booleans, the trimmed string form for other values,
 *   or `undefined` when the value is unset or blank after trimming.
 */
function toParamValue(value) {
    // Treat null like undefined so it is never sent as the literal text "null".
    if (value === undefined || value === null)
        return undefined;
    if (typeof value === 'boolean')
        return value ? '1' : '0';
    const trimmed = String(value).trim();
    return trimmed.length > 0 ? trimmed : undefined;
}
|
|
102
|
+
/**
 * Returns the first argument that is not `undefined`.
 *
 * `null` and other falsy values count as present.
 *
 * @param values Candidates in priority order.
 * @returns The first non-`undefined` candidate, or `undefined` when there is none.
 */
function pick(...values) {
    return values.find((candidate) => candidate !== undefined);
}
|
|
110
|
+
function normalizeCountryCode(input) {
|
|
111
|
+
const normalized = input.trim().toLowerCase().replace(/\s+/g, '_');
|
|
112
|
+
if (COUNTRY_CODE_PATTERN.test(normalized)) {
|
|
113
|
+
return normalized;
|
|
114
|
+
}
|
|
115
|
+
const cleaned = normalized.replace(/[^a-z_]/g, '');
|
|
116
|
+
const directAlias = COUNTRY_ALIASES_NORMALIZED[cleaned];
|
|
117
|
+
if (directAlias) {
|
|
118
|
+
return directAlias;
|
|
119
|
+
}
|
|
120
|
+
if (normalized.includes('_')) {
|
|
121
|
+
const tail = normalized.split('_').pop() || '';
|
|
122
|
+
if (COUNTRY_CODE_PATTERN.test(tail)) {
|
|
123
|
+
return tail;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
return '';
|
|
127
|
+
}
|
|
128
|
+
/**
 * Chooses the value for the `gl` parameter.
 *
 * @param country Preferred country input; when defined it must resolve to a code.
 * @param legacyGl Raw `gl` option, used only when `country` is undefined.
 * @returns The resolved code, the trimmed lowercase `legacyGl`, or `''`.
 * @throws {ValidationError} When `country` is defined but cannot be resolved.
 */
function resolveCountryCode(country, legacyGl) {
    if (country === undefined) {
        // No explicit country: fall back to the legacy option as-is (normalised only for case/space).
        return legacyGl ? legacyGl.trim().toLowerCase() : '';
    }
    const resolved = normalizeCountryCode(country);
    if (resolved) {
        return resolved;
    }
    throw new ValidationError('Invalid country for Google search. Use ISO 3166-1 alpha-2 code or a known country name.', {
        field: 'country',
        value: country,
    });
}
|
|
141
|
+
/**
 * Validates the query and produces the option object used by the rest of the
 * module.
 *
 * The result contains every caller-supplied option plus normalised fields:
 * the cleaned `query`, each `as*` value (snake_case spelling preferred over
 * camelCase, defaulting to `''`), the resolved `gl`, the effective
 * `transport` (default `'auto'`) and `includeRawHtml` (default `false`).
 *
 * @param query Raw search text.
 * @param options Caller options.
 * @returns The normalised option object.
 * @throws {ValidationError} When the query is blank or `country` is invalid.
 */
function normalizeOptions(query, options = {}) {
    const normalizedQuery = cleanText(query);
    if (!normalizedQuery) {
        throw new ValidationError('Google query is required', {
            field: 'query',
            value: query,
        });
    }
    const resolvedTransport = (options.transport ?? 'auto');
    const textOption = (snake, camel) => pick(snake, camel) ?? '';
    const numericOption = (snake, camel) => toParamValue(pick(snake, camel)) ?? '';
    // Note the mixed key spellings below: they are what buildSearchUrl reads.
    const overrides = {
        as_q: textOption(options.as_q, options.asQ),
        query: normalizedQuery,
        asEpq: textOption(options.as_epq, options.asEpq),
        asOq: textOption(options.as_oq, options.asOq),
        asEq: textOption(options.as_eq, options.asEq),
        as_sitesearch: textOption(options.as_sitesearch, options.asSitesearch),
        as_filetype: textOption(options.as_filetype, options.asFiletype),
        as_rights: textOption(options.as_rights, options.asRights),
        as_nlo: numericOption(options.as_nlo, options.asNlo),
        as_nhi: numericOption(options.as_nhi, options.asNhi),
        gl: resolveCountryCode(options.country, options.gl),
        transport: resolvedTransport,
        includeRawHtml: options.includeRawHtml ?? false,
    };
    return Object.assign({}, options, overrides);
}
|
|
167
|
+
function buildSearchUrl(query, options) {
|
|
168
|
+
const params = new URLSearchParams();
|
|
169
|
+
params.set('q', query);
|
|
170
|
+
params.set('ie', 'UTF-8');
|
|
171
|
+
params.set('oe', 'UTF-8');
|
|
172
|
+
if (options.as_q)
|
|
173
|
+
params.set('as_q', options.as_q);
|
|
174
|
+
if (options.asEpq)
|
|
175
|
+
params.set('as_epq', options.asEpq);
|
|
176
|
+
if (options.asOq)
|
|
177
|
+
params.set('as_oq', options.asOq);
|
|
178
|
+
if (options.asEq)
|
|
179
|
+
params.set('as_eq', options.asEq);
|
|
180
|
+
if (options.as_sitesearch)
|
|
181
|
+
params.set('as_sitesearch', options.as_sitesearch);
|
|
182
|
+
if (options.as_filetype)
|
|
183
|
+
params.set('as_filetype', options.as_filetype);
|
|
184
|
+
if (options.as_rights)
|
|
185
|
+
params.set('as_rights', options.as_rights);
|
|
186
|
+
if (options.as_nlo)
|
|
187
|
+
params.set('as_nlo', options.as_nlo);
|
|
188
|
+
if (options.as_nhi)
|
|
189
|
+
params.set('as_nhi', options.as_nhi);
|
|
190
|
+
if (options.safe)
|
|
191
|
+
params.set('safe', options.safe);
|
|
192
|
+
if (options.tbm)
|
|
193
|
+
params.set('tbm', options.tbm);
|
|
194
|
+
if (options.lr)
|
|
195
|
+
params.set('lr', options.lr);
|
|
196
|
+
if (options.cr)
|
|
197
|
+
params.set('cr', options.cr);
|
|
198
|
+
if (options.gl)
|
|
199
|
+
params.set('gl', options.gl);
|
|
200
|
+
if (options.hl)
|
|
201
|
+
params.set('hl', options.hl);
|
|
202
|
+
if (isDefined(options.num)) {
|
|
203
|
+
const parsed = Number(options.num);
|
|
204
|
+
if (Number.isFinite(parsed) && parsed > 0) {
|
|
205
|
+
params.set('num', String(Math.min(100, Math.floor(parsed))));
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
if (isDefined(options.start)) {
|
|
209
|
+
const parsedStart = Number(options.start);
|
|
210
|
+
if (Number.isFinite(parsedStart) && parsedStart >= 0) {
|
|
211
|
+
params.set('start', String(Math.floor(parsedStart)));
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
if (options.tbs)
|
|
215
|
+
params.set('tbs', options.tbs);
|
|
216
|
+
if (options.extraParams) {
|
|
217
|
+
for (const [key, value] of Object.entries(options.extraParams)) {
|
|
218
|
+
const normalized = toParamValue(value);
|
|
219
|
+
if (normalized !== undefined) {
|
|
220
|
+
params.set(key, normalized);
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
return `${GOOGLE_SEARCH_BASE_URL}?${params.toString()}`;
|
|
225
|
+
}
|
|
226
|
+
function normalizeRequestHeaders(inputHeaders, userAgent) {
|
|
227
|
+
const headers = new Headers({
|
|
228
|
+
accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
|
229
|
+
'accept-language': 'en-US,en;q=0.9',
|
|
230
|
+
'cache-control': 'max-age=0',
|
|
231
|
+
'sec-ch-ua-mobile': '?0',
|
|
232
|
+
'sec-ch-ua-platform': '"Windows"',
|
|
233
|
+
'user-agent': userAgent,
|
|
234
|
+
referer: GOOGLE_SEARCH_ORIGIN,
|
|
235
|
+
});
|
|
236
|
+
if (inputHeaders) {
|
|
237
|
+
const incoming = new Headers(inputHeaders);
|
|
238
|
+
incoming.forEach((value, key) => headers.set(key, value));
|
|
239
|
+
}
|
|
240
|
+
const merged = {};
|
|
241
|
+
headers.forEach((value, key) => {
|
|
242
|
+
merged[key] = value;
|
|
243
|
+
});
|
|
244
|
+
return merged;
|
|
245
|
+
}
|
|
246
|
+
/**
 * Reports whether the curl-impersonate binary is available.
 *
 * The binary manager is loaded lazily. Any failure while loading it or while
 * performing the check is treated as "not available".
 *
 * @returns Whatever `hasImpersonate()` reports, or `false` on failure.
 */
async function hasImpersonateBinary() {
    try {
        const { hasImpersonate } = await import('../utils/binary-manager.js');
        // Await inside the try block: without it, a rejected promise returned
        // by hasImpersonate() would bypass the catch and escape to the caller.
        return await hasImpersonate();
    }
    catch {
        return false;
    }
}
|
|
255
|
+
/**
 * Fetches a URL with the curl-based transport.
 *
 * @param url Address to request with GET.
 * @param headers Header map sent with the request.
 * @param timeout Timeout forwarded to the request.
 * @returns The response body text and HTTP status.
 */
async function fetchWithCurl(url, headers, timeout) {
    // Loaded on demand so the curl transport is only pulled in when used.
    const curlModule = await import('../transport/curl.js');
    const curl = new curlModule.CurlTransport();
    const requestInit = { method: 'GET', headers, timeout };
    const response = await curl.dispatch(new HttpRequest(url, requestInit));
    const body = await response.text();
    return { html: body, status: response.status };
}
|
|
267
|
+
/**
 * Downloads the results page using the requested transport strategy.
 *
 * - `'curl'`: requires curl-impersonate and uses it directly.
 * - `'undici'`: uses the regular HTTP client.
 * - `'auto'`: tries curl when available and falls back to the HTTP client if
 *   that attempt throws or is judged blocked with confidence above 0.7.
 *
 * @param url Search URL to fetch.
 * @param options Normalised options (`transport`, `timeout`, `headers`, `userAgent`).
 * @returns `{ html, status, transport, fallbackUsed, impersonateAvailable, block }`.
 * @throws {ValidationError} When `'curl'` is requested but unavailable.
 */
async function fetchSearchResults(url, options) {
    const headers = normalizeRequestHeaders(options.headers, options.userAgent ?? getRandomUserAgent('desktop.chrome'));
    const requestTimeout = options.timeout;
    const requested = options.transport;
    // The binary probe is skipped entirely when the caller forces undici.
    const impersonateAvailable = requested !== 'undici' && (await hasImpersonateBinary());

    const requestViaCurl = async () => {
        const { html, status } = await fetchWithCurl(url, headers, requestTimeout);
        // fetchWithCurl does not return response headers, so block detection
        // for this path runs against an empty header set.
        const block = detectBlock({ status, headers: new Headers() }, html);
        return { html, status, transport: 'curl', fallbackUsed: false, impersonateAvailable, block };
    };

    if (requested === 'curl') {
        if (!impersonateAvailable) {
            throw new ValidationError('Transport "curl" requires curl-impersonate; install it with `rek setup`', {
                field: 'transport',
                value: options.transport,
            });
        }
        return requestViaCurl();
    }

    const client = createClient({ timeout: requestTimeout });
    const requestViaUndici = async () => {
        const response = await client.get(url, { headers });
        const html = await response.text();
        const block = detectBlock({ status: response.status, headers: response.headers }, html);
        return { html, status: response.status, transport: 'undici', fallbackUsed: false, impersonateAvailable, block };
    };

    if (requested === 'undici' || !impersonateAvailable) {
        return requestViaUndici();
    }

    try {
        const attempt = await requestViaCurl();
        if (!attempt.block.blocked || attempt.block.confidence <= 0.7) {
            return attempt;
        }
    }
    catch {
        // Best effort: any failure of the curl attempt leads to the fallback below.
    }

    const fallback = await requestViaUndici();
    return { ...fallback, fallbackUsed: true };
}
|
|
333
|
+
function resolveSearchResultUrl(rawHref) {
|
|
334
|
+
try {
|
|
335
|
+
const normalized = rawHref.startsWith('//') ? `https:${rawHref}` : rawHref;
|
|
336
|
+
const parsed = new URL(normalized, GOOGLE_SEARCH_ORIGIN);
|
|
337
|
+
const rawResultUrl = parsed.searchParams.get('q') ?? parsed.searchParams.get('url');
|
|
338
|
+
if (!rawResultUrl)
|
|
339
|
+
return null;
|
|
340
|
+
const candidate = new URL(decodeURIComponent(rawResultUrl), GOOGLE_SEARCH_ORIGIN);
|
|
341
|
+
if (!candidate.protocol.startsWith('http'))
|
|
342
|
+
return null;
|
|
343
|
+
if (candidate.hostname === 'www.google.com' && candidate.pathname === '/search')
|
|
344
|
+
return null;
|
|
345
|
+
return candidate.toString();
|
|
346
|
+
}
|
|
347
|
+
catch {
|
|
348
|
+
return null;
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
function extractDisplayedUrl(linkUrl, containerText) {
|
|
352
|
+
const direct = (() => {
|
|
353
|
+
try {
|
|
354
|
+
return new URL(linkUrl).hostname;
|
|
355
|
+
}
|
|
356
|
+
catch {
|
|
357
|
+
return '';
|
|
358
|
+
}
|
|
359
|
+
})();
|
|
360
|
+
if (direct)
|
|
361
|
+
return cleanText(direct);
|
|
362
|
+
return containerText ? cleanText(containerText).slice(0, 120) : '';
|
|
363
|
+
}
|
|
364
|
+
function looksLikeSnippet(text, title) {
|
|
365
|
+
const cleaned = cleanText(text);
|
|
366
|
+
if (cleaned.length < 25 || cleaned.length > 600)
|
|
367
|
+
return false;
|
|
368
|
+
if (cleaned === title)
|
|
369
|
+
return false;
|
|
370
|
+
if (/^https?:\/\//i.test(cleaned))
|
|
371
|
+
return false;
|
|
372
|
+
return true;
|
|
373
|
+
}
|
|
374
|
+
/**
 * Escapes regular-expression metacharacters so the text matches literally.
 *
 * @param value Text to embed in a pattern.
 * @returns The text with each metacharacter preceded by a backslash.
 */
function escapeRegex(value) {
    const metacharacters = /[.*+?^${}()|[\]\\]/g;
    return value.replace(metacharacters, (character) => `\\${character}`);
}
|
|
377
|
+
function hasAdHintText(value) {
|
|
378
|
+
if (!value)
|
|
379
|
+
return false;
|
|
380
|
+
const normalized = cleanText(value).toLowerCase();
|
|
381
|
+
if (!normalized)
|
|
382
|
+
return false;
|
|
383
|
+
return GOOGLE_AD_TEXT_HINTS.some((hint) => {
|
|
384
|
+
const pattern = new RegExp(`(^|\\W)${escapeRegex(hint)}(\\W|$)`, 'i');
|
|
385
|
+
return pattern.test(normalized);
|
|
386
|
+
});
|
|
387
|
+
}
|
|
388
|
+
function hasAdClassHint(className) {
|
|
389
|
+
if (!className)
|
|
390
|
+
return false;
|
|
391
|
+
const normalized = cleanText(className).toLowerCase();
|
|
392
|
+
if (!normalized)
|
|
393
|
+
return false;
|
|
394
|
+
return GOOGLE_AD_CONTAINER_CLASS_HINTS.some((hint) => {
|
|
395
|
+
const classes = normalized.split(/\s+/);
|
|
396
|
+
return classes.some((token) => token === hint || token.startsWith(`${hint}-`));
|
|
397
|
+
});
|
|
398
|
+
}
|
|
399
|
+
/**
 * Classifies a result as an ad or an organic listing.
 *
 * The anchor, its parent, the container, the container's parent, and the
 * anchor's next and previous siblings are inspected in that order. The first
 * node showing ad evidence decides the outcome. Evidence, in order of
 * precedence per node:
 *   1. presence of a known ad-related data attribute;
 *   2. ad hint words in the `data-ved` value;
 *   3. an ad class token, or ad hint words in any `data-*` value;
 *   4. ad hint words in the node's text.
 *
 * @param anchor Scrape element for the result link.
 * @param container Scrape element wrapping the result.
 * @returns `{ placement, placementHint }`; the hint is `undefined` for organic results.
 */
function detectResultPlacement(anchor, container) {
    // NOTE(review): "data-ved" is also one of the result-container selectors, so
    // treating its mere presence as ad evidence looks very broad — confirm.
    const adAttributeNames = ['data-text-ad', 'data-rw', 'data-snc', 'data-ved', 'data-pcu'];
    const isAdAttribute = (key) => {
        const name = key.toLowerCase();
        if (name.startsWith('data-ad-')) {
            return true;
        }
        return adAttributeNames.includes(name);
    };
    const isScrapeElementLike = (value) => {
        if (value === null || value === undefined || typeof value !== 'object') {
            return false;
        }
        return 'length' in value
            && typeof value.length === 'number'
            && typeof value.attrs === 'function';
    };
    const tagNameOf = (node) => {
        if (typeof node.tagName === 'function') {
            return node.tagName().toLowerCase();
        }
        if (typeof node.tagName === 'string') {
            return node.tagName.toLowerCase();
        }
        return '';
    };
    // Page-level elements are never considered evidence for a single result.
    const isResultContainer = (node) => {
        const tag = tagNameOf(node);
        return !(tag === 'body' || tag === 'html');
    };
    const asAd = (hint) => ({ placement: 'ad', placementHint: hint });

    const anchorParent = typeof anchor.parent === 'function' ? anchor.parent() : undefined;
    const containerParent = typeof container.parent === 'function' ? container.parent() : undefined;
    const anchorNext = typeof anchor.next === 'function' ? anchor.next() : undefined;
    const anchorPrev = typeof anchor.prev === 'function' ? anchor.prev() : undefined;

    const candidates = [anchor, anchorParent, container, containerParent, anchorNext, anchorPrev];
    for (const node of candidates) {
        if (!isScrapeElementLike(node) || node.length === 0 || !isResultContainer(node)) {
            continue;
        }
        const attributes = node.attrs();
        const className = node.attr('class');
        const adKeys = Object.keys(attributes).filter(isAdAttribute);
        if (adKeys.length > 0) {
            return asAd(cleanText(`${adKeys.join(' ')} ${node.text()}`.slice(0, 120)));
        }
        const dataText = Object.entries(attributes)
            .filter(([key, value]) => key.startsWith('data-') && typeof value === 'string')
            .map(([, value]) => value)
            .join(' ');
        const text = cleanText(node.text());
        // Kept for parity: when "data-ved" is present the attribute check above
        // has normally already returned.
        if (hasAdHintText(node.attr('data-ved'))) {
            return asAd(cleanText(`${text} ${dataText}`.slice(0, 120)));
        }
        if (hasAdClassHint(className) || hasAdHintText(dataText)) {
            return asAd(cleanText(`${className} ${dataText}`.slice(0, 120)));
        }
        if (hasAdHintText(text)) {
            return asAd(text);
        }
    }
    return {
        placement: 'organic',
        placementHint: undefined,
    };
}
|
|
485
|
+
/**
 * Extracts the result count from the page's statistics text.
 *
 * Looks for the first number directly followed by the word "result…"
 * (which also covers "resultado(s)"). Both comma and dot digit grouping are
 * accepted, e.g. "1,230,000 results" and "1.230.000 resultados".
 *
 * @param text Statistics text.
 * @returns The count as an integer, or `undefined` when none is found.
 */
function parseResultStats(text) {
    if (typeof text !== 'string')
        return undefined;
    // Capture the digits together with their grouping separators so that
    // dot-grouped numbers are not truncated to their last group.
    const match = text.match(/([0-9][0-9.,]*)\s*(?:result|resultado)/i);
    if (!match)
        return undefined;
    const digits = match[1].replace(/[^0-9]/g, '');
    const parsed = Number.parseInt(digits, 10);
    return Number.isFinite(parsed) ? parsed : undefined;
}
|
|
493
|
+
/**
 * Parses a results page into structured data.
 *
 * Each redirect-style result link becomes one entry (duplicates by
 * destination URL and links without a title are skipped). Collection stops
 * once `options.maxResults` entries have been gathered, when that limit is a
 * non-zero finite number.
 *
 * @param html Page markup.
 * @param options Normalised options; only `maxResults` is read here.
 * @returns `{ results, nextPageUrl, nextPageStart, resultStats }`.
 */
function parseSearchPage(html, options) {
    const doc = ScrapeDocument.createSync(html, { baseUrl: GOOGLE_SEARCH_ORIGIN });
    const maxResults = options.maxResults ? Number(options.maxResults) : undefined;
    const limitReached = (count) => typeof maxResults === 'number' && Number.isFinite(maxResults) && count >= maxResults;

    // Title: prefer the heading inside the link, otherwise the link's own text.
    const readTitle = (anchor) => {
        const heading = anchor.find('h3').text();
        return cleanText(heading ? heading : anchor.text());
    };

    // Snippet: known selectors first, then any span/div, then the whole container.
    const readSnippet = (container, title) => {
        for (const selector of GOOGLE_RESULT_SNIPPET_SELECTOR_ORDER) {
            const selected = cleanText(container.find(selector).first().text());
            if (looksLikeSnippet(selected, title)) {
                return selected;
            }
        }
        for (const element of container.find('span,div').toArray()) {
            const elementText = cleanText(element.text());
            if (looksLikeSnippet(elementText, title)) {
                return elementText;
            }
        }
        const containerText = cleanText(container.text());
        return looksLikeSnippet(containerText, title) ? containerText.slice(0, 240) : undefined;
    };

    const results = [];
    const seen = new Set();
    for (const anchor of doc.selectAll(GOOGLE_RESULT_LINK_SELECTORS.join(', '))) {
        const rawHref = anchor.attr('href');
        if (!rawHref) {
            continue;
        }
        const resultUrl = resolveSearchResultUrl(rawHref);
        if (!resultUrl || seen.has(resultUrl)) {
            continue;
        }
        const titleText = readTitle(anchor);
        if (!titleText) {
            continue;
        }
        const resultContainer = anchor.parents(GOOGLE_RESULT_CONTAINER_SELECTORS).first();
        const { placement, placementHint } = detectResultPlacement(anchor, resultContainer);
        const snippet = readSnippet(resultContainer, titleText);
        results.push({
            rank: results.length + 1,
            title: titleText,
            url: resultUrl,
            snippet,
            displayedUrl: extractDisplayedUrl(resultUrl, anchor.text()),
            placement,
            placementHint,
        });
        seen.add(resultUrl);
        if (limitReached(results.length)) {
            break;
        }
    }

    // Pagination: absolute URL of the "next" link, if the page has one.
    let nextPageUrl;
    const nextLink = doc.selectFirst('a#pnnext, a[aria-label="Next"], a[id="pnnext"]').first();
    if (nextLink && nextLink.length) {
        const href = nextLink.attr('href');
        if (href) {
            try {
                nextPageUrl = new URL(href, GOOGLE_SEARCH_ORIGIN).toString();
            }
            catch {
                nextPageUrl = undefined;
            }
        }
    }

    // Offset of the next page, read from that URL's `start` parameter.
    let nextPageStart;
    if (nextPageUrl) {
        try {
            const startParam = new URL(nextPageUrl).searchParams.get('start');
            if (startParam) {
                const offset = Number.parseInt(startParam, 10);
                nextPageStart = Number.isFinite(offset) ? offset : undefined;
            }
        }
        catch {
            nextPageStart = undefined;
        }
    }

    const resultStats = parseResultStats(doc.selectFirst('#result-stats').text());
    return {
        results,
        nextPageUrl,
        nextPageStart,
        resultStats,
    };
}
|
|
592
|
+
/**
 * Runs a Google search and returns the parsed results page.
 *
 * Steps: validate and normalise the options, build the search URL, fetch the
 * page with the selected transport, then parse results and pagination.
 *
 * @param query Search text; must contain a non-whitespace character.
 * @param options Request and parsing options.
 * @returns The parsed results plus transport, status, and block-detection
 *   details; `rawHtml` is included only when `includeRawHtml` is set.
 * @throws {ValidationError} For an empty query, an unresolvable `country`, or
 *   an unavailable `'curl'` transport.
 */
export async function searchGoogleAdvanced(query, options = {}) {
    const normalized = normalizeOptions(query, options);
    // Build the URL from the normalised query so that the request actually sent
    // matches the `query` reported in the response.
    const searchUrl = buildSearchUrl(normalized.query, normalized);
    const fetchResult = await fetchSearchResults(searchUrl, normalized);
    const parsed = parseSearchPage(fetchResult.html, normalized);
    const response = {
        query: normalized.query,
        searchUrl,
        results: parsed.results,
        transport: {
            requested: normalized.transport,
            used: fetchResult.transport,
            fallbackUsed: fetchResult.fallbackUsed,
            impersonateAvailable: fetchResult.impersonateAvailable,
        },
        status: fetchResult.status,
        block: fetchResult.block,
        nextPageUrl: parsed.nextPageUrl,
        nextPageStart: parsed.nextPageStart,
        resultStats: parsed.resultStats,
    };
    if (normalized.includeRawHtml) {
        response.rawHtml = fetchResult.html;
    }
    return response;
}
|
|
@@ -15,3 +15,5 @@ export { parseSitemap, validateSitemap, discoverSitemaps, fetchAndValidateSitema
|
|
|
15
15
|
export type { SitemapUrl, SitemapIndex, SitemapParseResult, SitemapValidationIssue, SitemapValidationResult, } from './validators/sitemap.js';
|
|
16
16
|
export { parseLlmsTxt, validateLlmsTxt, fetchAndValidateLlmsTxt, generateLlmsTxtTemplate, } from './validators/llms-txt.js';
|
|
17
17
|
export type { LlmsTxtLink, LlmsTxtSection, LlmsTxtParseResult, LlmsTxtValidationIssue, LlmsTxtValidationResult, } from './validators/llms-txt.js';
|
|
18
|
+
export { analyzeKeywordCampaign, extractKeywordCampaignSeedsFromReport, } from './keyword-campaign.js';
|
|
19
|
+
export type { CampaignActivitySignal, CampaignResultPlacement, KeywordCampaignCompetitorResult, KeywordCampaignCompetitorSummary, KeywordCampaignExtractionOptions, KeywordCampaignOptions, KeywordCampaignPageStats, KeywordCampaignReport, KeywordCampaignResult, KeywordCampaignSeed, KeywordCampaignSeedInput, KeywordCampaignSource, KeywordCampaignSummary, } from './keyword-campaign.js';
|
|
@@ -6,3 +6,4 @@ export { generateSeoFilename, resolveOutputPath, writeReport, formatReportForJso
|
|
|
6
6
|
export { parseRobotsTxt, validateRobotsTxt, isPathAllowed, fetchAndValidateRobotsTxt, } from './validators/robots.js';
|
|
7
7
|
export { parseSitemap, validateSitemap, discoverSitemaps, fetchAndValidateSitemap, } from './validators/sitemap.js';
|
|
8
8
|
export { parseLlmsTxt, validateLlmsTxt, fetchAndValidateLlmsTxt, generateLlmsTxtTemplate, } from './validators/llms-txt.js';
|
|
9
|
+
export { analyzeKeywordCampaign, extractKeywordCampaignSeedsFromReport, } from './keyword-campaign.js';
|